In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [10]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [11]:
train.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [12]:
train.target.unique()

array(['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9'], dtype=object)

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(train['target'])
test_id = test['id']
newtrain = train.drop(['target','id'],axis = 1)
newtest = test.drop(['id'],axis = 1)

In [55]:
X_train,X_test,y_train,y_test = train_test_split(newtrain,y,test_size = 0.2,random_state = 36)

Now lets look at each model and see which one performs better

In [36]:
logloss = []
models = ['RandomForestClassifier','DecisionTreeClassifier','LogisticRegression']
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
sol = rfc.predict_proba(X_test)
logloss.append(log_loss(y_test, sol, eps=1e-15, normalize=True))
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
sol = dtc.predict_proba(X_test)
logloss.append(log_loss(y_test, sol, eps=1e-15, normalize=True))
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)
sol = lr.predict_proba(X_test)
logloss.append(log_loss(y_test, sol, eps=1e-15, normalize=True))
result = pd.DataFrame({'models':models,'log_loss_error':logloss})



In [37]:
result

Unnamed: 0,models,log_loss_error
0,RandomForestClassifier,1.49293
1,DecisionTreeClassifier,10.032878
2,LogisticRegression,0.670575


In [42]:
from sklearn.model_selection import GridSearchCV
lr_1 = LogisticRegression()
params = {
    'solver':['newton-cg','lbfgs'],
    'C' : [0.5,1.0]
}
gr = GridSearchCV(lr_1,params)
gr.fit(X_train,y_train)





GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.5, 1.0], 'solver': ['newton-cg', 'lbfgs']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [45]:
lr_final = LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)

In [46]:
lr_final.fit(X_train,y_train)
lr_final_p = lr_final.predict_proba(X_test)
print(log_loss(y_test, lr_final_p, eps=1e-15, normalize=True))



0.6705752484316586


In [47]:
rf = RandomForestClassifier(n_jobs = -1)
parms = {
    'n_estimators':[100,200,300],
    'criterion':['gini','entropy']
}
ccv = GridSearchCV(rf,parms)
ccv.fit(X_train,y_train)
ccv.best_params_



{'criterion': 'gini', 'n_estimators': 300}

In [48]:
rf = RandomForestClassifier(criterion = 'gini',n_estimators = 300,n_jobs= -1)
parms = {
    
    'max_features':['sqrt','log2'],
    'min_weight_fraction_leaf' : [0.0,0.1],
    'max_leaf_nodes' : [40,60]
}
ccv = GridSearchCV(rf,parms)
ccv.fit(X_train,y_train)
ccv.best_params_



{'max_features': 'sqrt', 'max_leaf_nodes': 60, 'min_weight_fraction_leaf': 0.0}

In [53]:
rfc_final = RandomForestClassifier(criterion = 'gini',n_estimators = 300,n_jobs= -1,max_features = 'sqrt',max_leaf_nodes = 60,min_weight_fraction_leaf = 0.0)
rfc_final.fit(X_train,y_train)
y_predict = rfc_final.predict_proba(X_test)
print(log_loss(y_test, y_predict, eps=1e-15, normalize=True))

0.9413632653408744


In [56]:
from sklearn.calibration import CalibratedClassifierCV
clf = RandomForestClassifier(criterion = 'gini',n_estimators = 300,n_jobs= -1,max_features = 'sqrt',max_leaf_nodes = 60,min_weight_fraction_leaf = 0.0)
cclf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
cclf.fit(X_train,y_train)
y_val = cclf.predict_proba(X_test)
print(log_loss(y_test, y_val, eps=1e-15, normalize=True))

0.7071685888565171
