In [137]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix

In [138]:
# Load the training and hold-out CSVs (paths are relative to the notebook's directory).
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [139]:
# Split each frame into features (every column except 'default') and the
# 'default' target, taken as a NumPy array via .values.
X_train = train.drop('default', axis=1)
y_train = train['default'].values
X_test = test.drop('default', axis=1)
y_test = test['default'].values

In [140]:
# Fit the min-max scaler on the training features only; the same fitted
# scaler is reused later in the notebook to transform the test features.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [141]:
# Baseline logistic regression with an explicit L2 penalty and C=1.0.
LGC = LogisticRegression(penalty='l2', C=1.0)

In [142]:
# Train the baseline logistic regression on the scaled training features.
LGC.fit(X_train_scaled, y_train)

In [144]:
# Decision tree with default hyper-parameters. random_state is fixed so that
# rerunning the notebook reproduces the same tree and the same scores.
DT = DecisionTreeClassifier(random_state=42)

In [145]:
# Train the decision tree on the scaled training features.
DT.fit(X_train_scaled, y_train)

In [146]:
# Random forest with default hyper-parameters. random_state pins the bootstrap
# samples and feature sub-sampling so reruns give the same scores.
RFC = RandomForestClassifier(random_state=42)
RFC.fit(X_train_scaled, y_train)

In [147]:
# NOTE(review): ExtraTreeClassifier (sklearn.tree) is a single extremely
# randomized tree; the ensemble version is sklearn.ensemble.ExtraTreesClassifier.
# Confirm which one was intended for the 'Extra Tree' entry.
# random_state pins the random split selection so reruns give the same scores.
ET = ExtraTreeClassifier(random_state=42)
ET.fit(X_train_scaled, y_train)

In [148]:
# Gradient boosting with default hyper-parameters. random_state is fixed so
# that rerunning the notebook reproduces the same model and scores.
GB = GradientBoostingClassifier(random_state=42)
GB.fit(X_train_scaled, y_train)

In [149]:
# XGBoost classifier with its default hyper-parameters, trained on the
# scaled training features.
XGB = XGBClassifier()
XGB.fit(X_train_scaled, y_train)

In [150]:
# Apply the scaler fitted on the training data (transform only, no re-fit)
# to the hold-out features.
X_test_scaled = scaler.transform(X_test)

In [151]:
def model_scores(X_train, y_train, X_test, y_test, models=None):
    """Print the training and test score of each fitted model.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.score``.
    X_test, y_test
        Hold-out features and labels, passed to ``model.score``.
    models : dict, optional
        Mapping of display name -> fitted estimator. Any object exposing
        ``score(X, y)`` works. When omitted, the six notebook-level
        classifiers (LGC, DT, ET, RFC, GB, XGB) are used, so existing
        four-argument calls print exactly what they printed before.

    Returns
    -------
    None
        Results are printed, one three-line record per model, in the
        dictionary's insertion order.
    """
    if models is None:
        # Looked up at call time: the cells that create and fit these
        # notebook globals must have been run before this function is called.
        models = {'Logistic Regression': LGC, 'Decision Tree': DT, 'Extra Tree': ET,
                  'Random Forest': RFC, 'Gradient Boosting': GB, 'ExtremGradient Bossting': XGB, }
    for name, model in models.items():
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        print(f"{name}:\n Training Score: {train_score}\nTest Score: {test_score}")

In [152]:
# Compare training vs. hold-out score for every fitted model; a large gap
# between the two numbers points to over-fitting.
model_scores(X_train_scaled, y_train, X_test_scaled, y_test)

Logistic Regression:
 Training Score: 0.722349638859412
Test Score: 0.7251018289274019
Decision Tree:
 Training Score: 0.9992811419573477
Test Score: 0.7417937864387828
Extra Tree:
 Training Score: 0.9992811419573477
Test Score: 0.7291749860234805
Random Forest:
 Training Score: 0.9992469106219833
Test Score: 0.8330005590607779
Gradient Boosting:
 Training Score: 0.7854379899359875
Test Score: 0.7808481750658893
ExtremGradient Bossting:
 Training Score: 0.9038784102967857
Test Score: 0.8208609535979554


In [153]:
from sklearn.model_selection import cross_val_score

In [154]:
# Load the data set used for cross-validation.
# NOTE(review): presumably the full data before the train/test split — confirm.
final_data = pd.read_csv("../data/final_data.csv")

In [155]:
# Features are every column except 'default'; the target is 'default' as a NumPy array.
X = final_data.drop('default', axis=1)
y = final_data['default'].values

In [156]:
# NOTE(review): the hold-out experiment earlier in this notebook scales with
# MinMaxScaler, while cross-validation uses StandardScaler — confirm this
# difference is intentional before comparing the two sets of scores.
scaler1 = StandardScaler()

In [157]:
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# each validation fold has influenced the scaling statistics. Wrapping the
# scaler and model in a Pipeline would keep the fit inside each training fold.
X_scale = scaler1.fit_transform(X)

In [158]:
def validation(X, y):
    """Cross-validate each of the notebook's six classifiers on ``(X, y)``.

    Every estimator is handed to ``cross_val_score`` with that function's
    default settings (no explicit ``cv`` or ``scoring`` is passed).

    Returns
    -------
    dict
        Display name -> per-fold scores as returned by ``cross_val_score``,
        in the order the models are listed below.
    """
    candidates = (
        ('Logistic Regression', LGC),
        ('Decision Tree', DT),
        ('Extra Tree', ET),
        ('Random Forest', RFC),
        ('Gradient Boosting', GB),
        ('ExtremGradient Bossting', XGB),
    )
    return {label: cross_val_score(estimator, X, y) for label, estimator in candidates}

In [159]:
# Per-fold cross-validation scores for every model, keyed by display name.
scores = validation(X_scale, y)

In [160]:
# Report the mean cross-validation score of every model.
for key, value in scores.items():
    # Divide by the actual number of folds rather than a hard-coded 5, so the
    # average stays correct if cross-validation is run with a different cv.
    avg = sum(value) / len(value)
    print(f"{key}: avg_score: {avg}")

Logistic Regression: avg_score: 0.7188629195926015
Decision Tree: avg_score: 0.7401648499910901
Extra Tree: avg_score: 0.7311311964553546
Random Forest: avg_score: 0.8252754713039845
Gradient Boosting: avg_score: 0.7674327029623969
ExtremGradient Bossting: avg_score: 0.801745861445748


In [161]:
from sklearn.model_selection import GridSearchCV

In [162]:
# Grid for logistic regression: both penalty types crossed with seven values
# of C spanning 0.001 to 1000 (14 combinations).
param_grid = {
    'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# The liblinear solver is used because it supports both the l1 and l2
# penalties in the grid. Each combination is scored with 5-fold CV.
clf = GridSearchCV(LogisticRegression(solver='liblinear'),
                   param_grid, cv=5, return_train_score=False)
clf.fit(X_train_scaled, y_train)

In [163]:
# Best mean cross-validated score and the parameter combination that achieved it.
clf.best_score_, clf.best_params_

(0.7228628221880037, {'C': 100, 'penalty': 'l1'})

In [164]:
# Keep the tuned logistic regression: best_estimator_ is the model that
# GridSearchCV refit with the best parameters (refit=True is its default).
O_LGC = clf.best_estimator_

#### Decision Tree


In [165]:
# Grid for the decision tree: 5 * 3 * 4 = 60 combinations of depth limit,
# minimum samples to split a node, and minimum samples per leaf.
param_grid = {
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10]
}

# random_state is fixed so the selected parameters and best score are
# reproducible between runs; each combination is scored with 5-fold CV.
clf1 = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid,
                    cv=5, return_train_score=False)
clf1.fit(X_train_scaled, y_train)

In [166]:
# Show the best mean cross-validated score and parameters, then keep the
# tuned decision tree returned by the search.
print(clf1.best_score_, clf1.best_params_)
O_DT = clf1.best_estimator_

0.7440865666557472 {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}


#### Random Forest Classifier


In [167]:
# Grid for the random forest: 3 * 4 * 3 * 3 = 108 combinations, each scored
# with 5-fold CV, i.e. 540 forest fits plus the final refit.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 2, 5]
}
# n_jobs=-1 spreads the fits over all available CPU cores; run serially, this
# search is slow enough that the recorded run was interrupted by hand.
# random_state is fixed so the selected parameters are reproducible.
clf2 = GridSearchCV(RandomForestClassifier(random_state=42), param_grid,
                    cv=5, return_train_score=False, n_jobs=-1)
clf2.fit(X_train_scaled, y_train)

KeyboardInterrupt: 

In [None]:
# Show the best mean cross-validated score and parameters, then keep the
# tuned random forest returned by the search.
# NOTE(review): in the saved notebook the random-forest search ended with a
# KeyboardInterrupt and this cell has no execution count, so clf2 may not be
# fitted; these attributes only exist after clf2.fit(...) completes.
print(clf2.best_score_, clf2.best_params_)
O_RF = clf2.best_estimator_