In [1]:
import modelos
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
def transform_columns(df, color_labels=None, breed_labels=None):
    """Replace integer category codes with readable string labels.

    The ``Description`` column is dropped and each coded column listed below
    is rewritten through ``Series.replace``. Codes that do not appear in a
    mapping are left unchanged. The caller's frame is not modified: the
    replacements are applied to the copy returned by ``DataFrame.drop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Description`` and the columns Type, Gender,
        MaturitySize, FurLength, Vaccinated, Dewormed, Sterilized, Health,
        Color1, Color2, Color3, Breed1 and Breed2.
    color_labels : pandas.DataFrame, optional
        Lookup table with ``ColorID`` and ``ColorName`` columns. When omitted
        the notebook-level ``color`` table is used, as before.
    breed_labels : pandas.DataFrame, optional
        Lookup table with ``BreedID`` and ``BreedName`` columns. When omitted
        the notebook-level ``breed`` table is used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Description`` and with labelled categories.

    Raises
    ------
    KeyError
        If ``Description`` or any of the coded columns is missing.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working; passing the tables explicitly removes the hidden dependency.
    if color_labels is None:
        color_labels = color
    if breed_labels is None:
        breed_labels = breed

    # Build each lookup once instead of once per column. Code 0 is added
    # last so it always maps to "N/A", even if the table defines an ID 0.
    color_names = dict(zip(color_labels.ColorID, color_labels.ColorName))
    color_names[0] = "N/A"
    breed_names = dict(zip(breed_labels.BreedID, breed_labels.BreedName))
    breed_names[0] = "N/A"

    replacements = {
        "Type": {1: 'Dog', 2: 'Cat'},
        "Gender": {1: 'Male', 2: 'Female', 3: 'Mixed'},
        "MaturitySize": {1: 'S', 2: 'M', 3: 'L', 4: 'XL', 0: 'N/A'},
        "FurLength": {1: 'S', 2: 'M', 3: 'L', 0: 'N/A'},
        # NOTE(review): code 2 is labelled 'N' here but 'F' for Dewormed and
        # Sterilized. Kept as-is because the label becomes part of the dummy
        # column names produced later by pd.get_dummies.
        "Vaccinated": {1: 'T', 2: 'N', 3: 'N/A'},
        "Dewormed": {1: 'T', 2: 'F', 3: 'N/A'},
        "Sterilized": {1: 'T', 2: 'F', 3: 'N/A'},
        "Health": {1: 'Healthy', 2: 'MinorInjury', 3: 'SeriousInjury', 0: 'N/A'},
        "Color1": color_names,
        "Color2": color_names,
        "Color3": color_names,
        "Breed1": breed_names,
        "Breed2": breed_names,
    }

    # drop() returns a new frame, so the assignments below never touch the
    # frame that was passed in.
    df = df.drop(["Description"], axis=1)
    for column, mapping in replacements.items():
        df[column] = df[column].replace(mapping)
    return df


def transform_data(train_data_fname, test_data_fname):
    """Read the train and test CSV files and return one-hot encoded matrices.

    Both files are loaded with ``pd.read_csv`` and passed through
    ``transform_columns``. The two frames are then stacked (columns sorted)
    and encoded together with ``pd.get_dummies`` so that train and test end
    up with exactly the same columns, and finally split back by row position.

    Parameters
    ----------
    train_data_fname : path of the training CSV.
    test_data_fname : path of the test CSV.

    Returns
    -------
    tuple
        ``(X, y, XX, yy)`` where ``X`` is the training rows without the
        'AdoptionSpeed' column, ``y`` is that column for the training rows,
        ``XX`` is the test rows without 'AdoptionSpeed', and ``yy`` is
        always ``None``.
    """
    target = 'AdoptionSpeed'

    train_part = transform_columns(pd.read_csv(train_data_fname))
    test_part = transform_columns(pd.read_csv(test_data_fname))
    n_train = len(train_part)

    # With default arguments get_dummies only expands object/category
    # columns; numeric columns pass through unchanged.
    encoded = pd.get_dummies(pd.concat([train_part, test_part], sort=True))

    # The training rows come first in the stacked frame, so a positional
    # split recovers the two original sets.
    train_rows = encoded.iloc[:n_train]
    test_rows = encoded.iloc[n_train:]

    return (
        train_rows.drop(target, axis=1),
        train_rows[target],
        test_rows.drop(target, axis=1),
        None,
    )

In [3]:
# Read the three label lookup tables shipped with the data set. `color` and
# `breed` are the tables transform_columns uses to decode colour/breed IDs.
breed, color, state = map(pd.read_csv, (
    '../data/breed_labels.csv',
    '../data/color_labels.csv',
    '../data/state_labels.csv',
))



In [None]:
# Build the encoded feature matrices from the raw CSV files.
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

# Hold out 30% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=113
)

In [None]:
# Empty table that collects one row per search: the best estimator found
# ('clf') and its cross-validated accuracy ('best_acc').
results = pd.DataFrame(columns=['clf', 'best_acc'])

In [4]:
# m: position of the model family among the keys of the catalogue returned
# by modelos.modelo_familia(); sm: position of the model inside that family.
m=2
sm=1
# modelos is a project module; the catalogue is only known here to expose
# .keys() and to be indexable by family key and then by position.
dic = modelos.modelo_familia()
# Display the name of the selected model.
dic[list(dic.keys())[m]][sm]

'DecisionTreeClassifier'

In [7]:
# Ask the project helper for the estimator selected by (family key, sm).
# NOTE(review): 113 is presumably the random seed (the estimator printed
# further down reports random_state=113) - confirm against modelos.get_modelo.
# `params` is returned alongside the estimator; the search cell in this
# notebook passes params2 to GridSearchCV instead.
clf,params = modelos.get_modelo(113,list(dic.keys())[m],sm)

In [9]:
# Widest hyper-parameter grid for the decision tree: every listed value of
# every key is combined with every other by a grid search.
params1 = dict(
    criterion=('gini', 'entropy'),
    splitter=('best', 'random'),
    max_depth=(None, 10, 30, 50, 100),
    min_samples_split=(2, 10, 50, 100, 200),
    min_samples_leaf=(1, 2, 5, 10, 50),
    max_features=(None, 'log2', 'sqrt'),
    class_weight=(None, 'balanced'),
)

In [11]:
# Smaller grid: split criterion and the two minimum-sample limits only.
params2 = {'criterion': ('gini', 'entropy'),
           'min_samples_leaf': (1, 2, 3, 5, 10),
           'min_samples_split': (50, 100, 200)
          }

# Coarse grid that also varies max_depth and class_weight. It used to be
# assigned to `params3` and was overwritten by the assignment right below,
# so it gets its own name to stay usable.
params3_coarse = {'criterion': ('gini', 'entropy'),
                  'min_samples_leaf': (1, 2, 5),
                  'min_samples_split': (90, 100, 120, 200),
                  'max_depth': (5, 10, 15, 20),
                  'class_weight': ('balanced', None)
                 }

# Fine grid with closely spaced values; this is the value `params3` has
# always ended up with after running the cell.
params3 = {'criterion': ('gini', 'entropy'),
           'min_samples_leaf': (4, 5, 6, 7, 8, 9, 10, 11),
           'min_samples_split': (180, 190, 200, 210, 220),
           'max_depth': (5, 6, 7, 8, 9, 10, 11, 12)
          }

In [12]:
# Exhaustive 5-fold cross-validated search over params2, scored by accuracy.
model = GridSearchCV(clf, params2, scoring='accuracy', cv=5)
# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing
# it raises a TypeError. Set iid=False only where the parameter still exists,
# which keeps the original scoring on old releases and lets the cell run on
# new ones (which already behave like iid=False).
if 'iid' in model.get_params(deep=False):
    model.set_params(iid=False)
# PID is left out of the feature matrix.
# NOTE(review): presumably PID is a row identifier - confirm.
model.fit(X_train.drop(["PID"], axis=1), y_train)
best_model = model.best_estimator_

print('Best Model Accuracy: ', model.best_score_)
print(best_model)

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the equivalent that works on old and new pandas alike.
results = pd.concat(
    [results, pd.DataFrame([{'clf': best_model, 'best_acc': model.best_score_}])],
    ignore_index=True,
)

print('The best classifier so far is: ')
# Cast to float so idxmax works even if the column ended up as object dtype,
# and fetch the estimator with a single .loc lookup.
print(results.loc[results['best_acc'].astype(float).idxmax(), 'clf'])

Best Model Accuracy:  0.3680300203595275
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=200,
            min_weight_fraction_leaf=0.0, presort=False, random_state=113,
            splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=200,
            min_weight_fraction_leaf=0.0, presort=False, random_state=113,
            splitter='best')


In [10]:
# Display the table of best estimators and accuracies collected so far.
results

Unnamed: 0,clf,best_acc
0,"DecisionTreeClassifier(class_weight=None, crit...",0.369246
