# Milestone 2

Fix data preprocessing and observe effect on training with various data patterns.

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from matplotlib import pyplot as plt
import seaborn as sns
import re

In [4]:
processed_data=pd.read_csv("puppymerged_num.csv")

In [5]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

OHE_columns = {}

# Encode Values into a new column
def column_encode(df, col_name):
    lb_make = LabelEncoder()
    df[col_name + "_encoded"] = lb_make.fit_transform(df[col_name])

# One Hot Encoder
def one_hot_encode(df, col_name):
    lb_style = LabelBinarizer()
    lb_results = lb_style.fit_transform(df[col_name])
    new_df = pd.DataFrame(lb_results, columns=lb_style.classes_)
    
    # Add new_df to df
    print("New columns added:")
    for x in new_df:
        new_col_name = str(col_name) + "-" + str(x)
        if isinstance(x, np.float64):
            new_col_name = str(col_name) + "-" + str(int(x))
            
        df[new_col_name] = new_df[x]
        print(" * " + new_col_name)
        
        if col_name not in OHE_columns:
            OHE_columns[col_name] = []
        OHE_columns[col_name].append(new_col_name)

In [7]:
columns_to_OHE = [
   "ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash", "NoInappropriateChewing", "AttendsClasses", "Stairs", "StealsFood", "KnowCommandGetBusy", "QuietInCrate", "Housemanners", "ComeOnLeash", "EnergyLevel", "NoiseFear", "FriendlyWAnimals", "RespondsToCommandKennel", "Health", "TrafficFear", "SitsOnCommand", "GoodWKids", "StoolFirm", "DownOnCommand", "EliminationInHouse", "GoodWStrangers", "EliminationInCrate", "Breed"
]
for col in columns_to_OHE:
    column_encode(processed_data, col)
    one_hot_encode(processed_data, col)

New columns added:
 * ExceriseAmount-0
 * ExceriseAmount-1
 * ExceriseAmount-2
 * ExceriseAmount-3
 * ExceriseAmount-4
 * ExceriseAmount-5
 * ExceriseAmount-6
New columns added:
 * NailCutting-0
 * NailCutting-1
 * NailCutting-2
 * NailCutting-3
 * NailCutting-4
 * NailCutting-5
New columns added:
 * StaysOnCommand-0
 * StaysOnCommand-1
 * StaysOnCommand-2
 * StaysOnCommand-3
 * StaysOnCommand-4
 * StaysOnCommand-5
New columns added:
 * AttendsHomeSwitches-0
 * AttendsHomeSwitches-1
 * AttendsHomeSwitches-2
 * AttendsHomeSwitches-3
 * AttendsHomeSwitches-4
 * AttendsHomeSwitches-5
New columns added:
 * BehavesWellClass-0
 * BehavesWellClass-1
 * BehavesWellClass-2
 * BehavesWellClass-3
 * BehavesWellClass-4
 * BehavesWellClass-5
New columns added:
 * ChasingAnimals-0
 * ChasingAnimals-1
 * ChasingAnimals-2
 * ChasingAnimals-3
 * ChasingAnimals-4
 * ChasingAnimals-5
New columns added:
 * JumpOnPeople-0
 * JumpOnPeople-1
 * JumpOnPeople-2
 * JumpOnPeople-3
 * JumpOnPeople-4
 * JumpOnPeop

In [8]:
desired_columns = [
    "dog_SubStatusCode","ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash", "NoInappropriateChewing", "AttendsClasses", "Stairs", "StealsFood", "KnowCommandGetBusy", "QuietInCrate", "Housemanners", "ComeOnLeash", "EnergyLevel", "NoiseFear", "FriendlyWAnimals", "RespondsToCommandKennel", "Health", "TrafficFear", "SitsOnCommand", "GoodWKids", "StoolFirm", "DownOnCommand", "Sex", "EliminationInHouse", "GoodWStrangers", "EliminationInCrate", "Breed"
]

desired_columns_processed = []

# Only keep OHE columns, not the original columns
for col in desired_columns:
    if col in OHE_columns:
        for OHE_col in OHE_columns[col]:
            desired_columns_processed.append(OHE_col)
    else:
        desired_columns_processed.append(col)
        
# Write all columns to 

processed_data = processed_data[desired_columns_processed]

In [9]:
# Column Mappings for Feature Selections
feature_set_columns = [
    ["dog_SubStatusCode","ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash", "NoInappropriateChewing", "AttendsClasses", "Stairs", "StealsFood", "KnowCommandGetBusy", "QuietInCrate", "Housemanners", "ComeOnLeash", "EnergyLevel", "NoiseFear", "FriendlyWAnimals", "RespondsToCommandKennel", "Health", "TrafficFear", "SitsOnCommand", "GoodWKids", "StoolFirm", "DownOnCommand", "Sex", "EliminationInHouse", "GoodWStrangers", "EliminationInCrate", "Breed"],
    ["dog_SubStatusCode","ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash", "NoInappropriateChewing", "AttendsClasses", "Stairs", "StealsFood", "KnowCommandGetBusy", "QuietInCrate", "Housemanners", "ComeOnLeash", "EnergyLevel", "NoiseFear", "FriendlyWAnimals", "RespondsToCommandKennel", "Health", "TrafficFear", "SitsOnCommand", "GoodWKids", "StoolFirm", "DownOnCommand", "Sex", "EliminationInHouse"],
    ["dog_SubStatusCode","ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash", "NoInappropriateChewing", "AttendsClasses", "Stairs", "StealsFood", "KnowCommandGetBusy", "QuietInCrate", "Housemanners", "ComeOnLeash", "EnergyLevel", "NoiseFear", "FriendlyWAnimals", "RespondsToCommandKennel", "Health", "TrafficFear", "SitsOnCommand"],
    ["dog_SubStatusCode","ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash", "NoInappropriateChewing", "AttendsClasses", "Stairs", "StealsFood", "KnowCommandGetBusy", "QuietInCrate", "Housemanners", "ComeOnLeash", "EnergyLevel", "NoiseFear"],
    ["dog_SubStatusCode","ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash", "NoInappropriateChewing", "AttendsClasses", "Stairs", "StealsFood", "KnowCommandGetBusy"],
    ["dog_SubStatusCode","ExceriseAmount", "NailCutting", "StaysOnCommand","AttendsHomeSwitches","BehavesWellClass","ChasingAnimals", "JumpOnPeople", "LeftUnattended", "BarksExcessively", "PlaybitePeople", "EliminatesOnRoute", "RaidsGarbage", "CanGivePills", "Color", "OnFurniture","EarCleaning", "CounterSurfingJumpOnDoors", "WalksWellOnLeash", "ComeOffLeash"]
]

mapped_feature_set_columns = []

for some_set in feature_set_columns:
    new_set = []
    for col in some_set:
        if col in OHE_columns:
            for OHE_col in OHE_columns[col]:
                new_set.append(OHE_col)
        else:
            new_set.append(col)
    mapped_feature_set_columns.append(new_set)
    
for some_set in mapped_feature_set_columns:
    set_str = ""
    for col in some_set:
        set_str += "\"" + col + "\", "
    
    print(set_str)
    print()

"dog_SubStatusCode", "ExceriseAmount-0", "ExceriseAmount-1", "ExceriseAmount-2", "ExceriseAmount-3", "ExceriseAmount-4", "ExceriseAmount-5", "ExceriseAmount-6", "ExceriseAmount-0", "ExceriseAmount-1", "ExceriseAmount-2", "ExceriseAmount-3", "ExceriseAmount-4", "ExceriseAmount-5", "ExceriseAmount-6", "NailCutting-0", "NailCutting-1", "NailCutting-2", "NailCutting-3", "NailCutting-4", "NailCutting-5", "NailCutting-0", "NailCutting-1", "NailCutting-2", "NailCutting-3", "NailCutting-4", "NailCutting-5", "StaysOnCommand-0", "StaysOnCommand-1", "StaysOnCommand-2", "StaysOnCommand-3", "StaysOnCommand-4", "StaysOnCommand-5", "StaysOnCommand-0", "StaysOnCommand-1", "StaysOnCommand-2", "StaysOnCommand-3", "StaysOnCommand-4", "StaysOnCommand-5", "AttendsHomeSwitches-0", "AttendsHomeSwitches-1", "AttendsHomeSwitches-2", "AttendsHomeSwitches-3", "AttendsHomeSwitches-4", "AttendsHomeSwitches-5", "AttendsHomeSwitches-0", "AttendsHomeSwitches-1", "AttendsHomeSwitches-2", "AttendsHomeSwitches-3", "Atte

In [10]:
# Output CSVs for each Feature Set
set_names = ['42', '39', '34', '29', '24', '19']

for i in range(len(mapped_feature_set_columns)):
    feature_set = processed_data[mapped_feature_set_columns[i]]
    feature_set.to_csv('puppy_info_feature_set_' + set_names[i] + '.csv', index=False)