In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)
import random as rnd

In [None]:
import warnings
# Silences every warning for the rest of the session, including any pandas and
# scikit-learn diagnostics raised by later cells.
# NOTE(review): consider narrowing this filter so genuine problems stay visible.
warnings.filterwarnings('ignore')

In [None]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the semicolon-delimited training and validation splits.
# The data directory defaults to the original location but can be overridden with the
# DS_TEST_DATA_DIR environment variable, so the notebook is not tied to one machine.
import os

DATA_DIR = os.environ.get('DS_TEST_DATA_DIR', '/Users/yashmadane/Downloads/DS_test')
train_df = pd.read_csv(os.path.join(DATA_DIR, 'training_grover.csv'), delimiter=';')
validation_df = pd.read_csv(os.path.join(DATA_DIR, 'validation_grover.csv'), delimiter=';')
# Both splits side by side, in [train, validation] order.
combine = [train_df, validation_df]

In [None]:
# Stack the training rows on top of the validation rows so that cleaning and encoding
# are applied identically to both splits.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Row labels are kept as they were (as append did), so they repeat across the two splits.
combine_df = pd.concat([train_df, validation_df])

### The columns are either masked or reduced using PCA

In [None]:
# List every column name of the combined frame.
column_names = combine_df.columns.values
print(column_names)

['Unnamed: 0' 'x.0' 'x.1' 'x.2' 'x.3' 'x.4' 'x.5' 'x.6' 'x.7' 'x.8' 'x.9'
 'x.10' 'x.11' 'x.12' 'x.13' 'x.14' 'x.20' 'x.17' 'x.18' 'x.19' 'x.16' 'y']


### All columns are read with the object datatype, although some hold numerical values, some hold categorical strings, and some mix both

In [None]:
# Preview x.16 with missing entries shown as 0. The result is only displayed,
# not assigned back, so combine_df itself is left unchanged.
combine_df['x.16'].fillna(0)

0      f
1      f
2      f
3      f
4      f
      ..
485    f
486    t
487    f
488    f
489    f
Name: x.16, Length: 3161, dtype: object

### We observe that `?` is present in many columns of the dataframe. We replace `?` with null (NaN) values for easier handling

In [None]:
# Turn every '?' placeholder into a proper missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
combine_df = combine_df.replace('?', np.nan)

In [None]:
# Show the rows whose x.18 value is missing.
missing_x18 = combine_df['x.18'].isna()
combine_df[missing_x18]



Unnamed: 0.1,Unnamed: 0,x.0,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,x.11,x.12,x.13,x.14,x.20,x.17,x.18,x.19,x.16,y


### Convert to appropriate dataTypes

- Some columns in the dataframe have mixed datatypes.
- We use pd.to_numeric with the argument errors='coerce' to convert the strings to NaN, then fillna these with the strings from the original column. Now we can perform operations on these columns.
- Some of these columns are x.2, x.7, x.10, x.13, x.14, x.19 - we could find these from their distributions, but since the dataset is relatively small, I am eyeballing the set.
- Some columns hold numeric values as comma-separated strings (object dtype) - these are converted to a numerical format.

In [None]:
# Parse the mixed-type columns: entries that parse as numbers become numeric,
# entries that do not parse keep their original value.
for mixed_col in ['x.2', 'x.7', 'x.10', 'x.13', 'x.14', 'x.19']:
    original_values = combine_df[mixed_col]
    parsed_values = pd.to_numeric(original_values, errors='coerce')
    combine_df[mixed_col] = parsed_values.fillna(original_values)

#### Converting comma-separated numeric strings to float

In [None]:
# Strip the commas from x.1, x.17 and x.18 and cast the result to float.
# NOTE(review): if the comma is a decimal separator in this file, removing it
# rescales the values - confirm against the raw data.
for comma_col in ('x.1', 'x.17', 'x.18'):
    without_commas = combine_df[comma_col].str.replace(',', '')
    combine_df[comma_col] = without_commas.astype(float)

In [None]:
# Summarise the object-dtype columns (count, unique, top, freq).
combine_df.describe(include=['O'])

Unnamed: 0,x.0,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,x.11,x.12,x.13,x.14,x.20,x.19,x.16,y
count,3134,3161,3131,3131,3127,3127,3161,3161,3161,3161,3161,3161,3097,3161,981,3097,2493,3161
unique,2,215,3,3,14,9,132,4,2,23,2,3,170,240,2,170,2,2
top,b,f,u,g,q,v,f,t,t,f,f,g,f,f,f,f,f,good
freq,2165,89,2525,2525,581,1704,202,2425,1834,1327,1662,2919,742,1256,561,742,1991,2552


### For the mixed-datatype columns x.2 and x.7, the categorical values are rare compared to the continuous values, so we can replace them with the average of the column

### For variables like x.10, x.13, x.14, x.19 we could create a separate column for the categorical values and test with different models, but for this case study we replace them with the mean of the column

In [None]:
# Force the mixed-type columns to numeric; anything that cannot be parsed becomes NaN.
for numeric_col in ('x.2', 'x.7', 'x.10', 'x.13', 'x.14', 'x.19'):
    coerced = pd.to_numeric(combine_df[numeric_col], errors='coerce')
    combine_df[numeric_col] = coerced


In [None]:
# Fill the missing x.2 entries with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a column
# selection is chained assignment and does not update the frame under pandas Copy-on-Write.
combine_df['x.2'] = combine_df['x.2'].fillna(combine_df['x.2'].mean())

In [None]:
# Fill the missing entries of each remaining numeric column with that column's mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a column
# selection is chained assignment and does not update the frame under pandas Copy-on-Write.
for mean_col in ['x.7', 'x.10', 'x.13', 'x.14', 'x.19']:
    combine_df[mean_col] = combine_df[mean_col].fillna(combine_df[mean_col].mean())

In [None]:
# Summarise the columns that are still object-dtype after the numeric conversion.
combine_df.describe(include=['O'])

Unnamed: 0,x.0,x.3,x.4,x.5,x.6,x.8,x.9,x.11,x.12,x.20,x.16,y
count,3134,3131,3131,3127,3127,3161,3161,3161,3161,981,2493,3161
unique,2,3,3,14,9,4,2,2,3,2,2,2
top,b,u,g,q,v,t,t,f,g,f,f,good
freq,2165,2525,2525,581,1704,2425,1834,1662,2919,561,1991,2552


### Filling in missing values in categorical variables with the most frequently occurring element, i.e. the mode

#### Another way of doing this would be to train a classifier with the column that has missing values as the dependent variable and the other features of the data set as inputs, and then impute the missing values from the trained classifier's predictions.

In [None]:
# Categorical columns whose missing entries are filled with the most frequent value.
cols = ["x.0","x.3","x.4","x.5","x.6","x.8","x.9","x.11","x.12","x.20","x.16"]
# First row of mode() holds the most frequent value of each column.
most_frequent = combine_df.mode().iloc[0]
combine_df[cols] = combine_df[cols].fillna(most_frequent)

### Analyzing categorical variables

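### Features x.3, x.8, x.11 and x.20 are treated as binary, so we label-encode them as 0 or 1 (note: the summary table above reports 3 unique values for x.3 and 4 for x.8, so this should be double-checked). When a feature has more than 2 unique values we use one-hot encoding instead, because label-encoding it would imply an ordering that sends the wrong signal to the model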

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Encoder that maps each category to a numeric code; it is refitted on every call below.
ord_enc = OrdinalEncoder()

# Transform the data
# NOTE(review): the describe() output earlier in the notebook lists 3 unique values for
# x.3 and 4 for x.8, so these two columns receive more than two codes - confirm intended.
combine_df[["x.3","x.8","x.11","x.20"]] = ord_enc.fit_transform(combine_df[["x.3","x.8","x.11","x.20"]])

In [None]:
# Encode x.0, x.9 and x.16 as numeric codes by refitting the shared encoder on them.
binary_cols = ["x.0","x.9","x.16"]
combine_df[binary_cols] = ord_enc.fit_transform(combine_df[binary_cols])

In [None]:
# Summarise the object-dtype columns that remain after ordinal encoding.
combine_df.describe(include=['O'])

Unnamed: 0,x.4,x.5,x.6,x.12,y
count,3161,3161,3161,3161,3161
unique,3,14,9,3,2
top,g,q,v,g,good
freq,2555,615,1738,2919,2552


### One-hot encoding for categorical values

#### We one-hot encode the variables with more than 2 unique categories so that the model isn't fed any wrong (ordinal) information.

In [None]:
# Same summary as the previous cell: the object-dtype columns still to be one-hot encoded (plus y).
combine_df.describe(include=['O'])

Unnamed: 0,x.4,x.5,x.6,x.12,y
count,3161,3161,3161,3161,3161
unique,3,14,9,3,2
top,g,q,v,g,good
freq,2555,615,1738,2919,2552


In [None]:
# One-hot encode x.4, dropping the first category as the reference level.
# The prefix keeps the dummy columns traceable to x.4; without it they are named after
# the bare category values and can collide with dummies from other features.
x_4_encode = pd.get_dummies(combine_df['x.4'], prefix='x.4', drop_first=True)

In [None]:
# Append the x.4 dummy columns to the right of the combined frame.
combine_df = pd.concat(
    [combine_df, x_4_encode],
    axis='columns',
)

In [None]:
# One-hot encode x.5, dropping the first category as the reference level.
# The prefix keeps the dummy columns traceable to x.5; without it they are named after
# the bare category values and can collide with dummies from other features.
x_5_encode = pd.get_dummies(combine_df['x.5'], prefix='x.5', drop_first=True)

In [None]:
# Append the x.5 dummy columns to the right of the combined frame.
combine_df = pd.concat(
    [combine_df, x_5_encode],
    axis='columns',
)

In [None]:
# One-hot encode x.6, dropping the first category as the reference level.
# The prefix keeps the dummy columns traceable to x.6; without it they are named after
# the bare category values and can collide with dummies from other features.
x_6_encode = pd.get_dummies(combine_df['x.6'], prefix='x.6', drop_first=True)

In [None]:
# Append the x.6 dummy columns to the right of the combined frame.
combine_df = pd.concat(
    [combine_df, x_6_encode],
    axis='columns',
)

In [None]:
# One-hot encode x.12, dropping the first category as the reference level.
# The prefix keeps the dummy columns traceable to x.12; without it they are named after
# the bare category values and can collide with dummies from other features.
x_12_encode = pd.get_dummies(combine_df['x.12'], prefix='x.12', drop_first=True)

In [None]:
# Append the x.12 dummy columns to the right of the combined frame.
combine_df = pd.concat(
    [combine_df, x_12_encode],
    axis='columns',
)

In [None]:
# Remove the original x.4 column now that its dummy columns have been added.
combine_df = combine_df.drop(columns=['x.4'])

In [None]:
# Remove the original x.5 column now that its dummy columns have been added.
combine_df = combine_df.drop(columns=['x.5'])

In [None]:
# Remove the original x.6 column now that its dummy columns have been added.
combine_df = combine_df.drop(columns=['x.6'])

In [None]:
# Drops x.7 from the feature set.
# NOTE(review): x.7 was converted to numeric and mean-imputed earlier and was not
# one-hot encoded, unlike the other columns deleted here - confirm this removal is intended.
del combine_df['x.7']

In [None]:
# Remove the original x.12 column now that its dummy columns have been added.
combine_df = combine_df.drop(columns=['x.12'])

## Dividing dataset into train and test

#### Converting target variable to Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Encoder used to turn the target labels into integer codes.
le = LabelEncoder()

In [None]:
# Target column of the combined (train + validation) frame.
target = combine_df['y']

In [None]:
# Learn the set of distinct target labels.
le.fit(target)

LabelEncoder()

In [None]:
# Display the learned labels; a label's position in this array is its integer code.
le.classes_

array(['bad', 'good'], dtype=object)

### Dividing X_ and Y_ into train and validation as per the original dataset requirements

In [None]:
# Integer-encode the target labels and wrap the codes in a Series.
target_Y = pd.Series(le.transform(target))

In [None]:
# Rebind `target` to a frame built from the encoded labels, with column name 'target_var'.
# NOTE(review): `target` is not referenced by the later cells visible here.
target = pd.DataFrame(target_Y,columns=['target_var'])

In [None]:


#combine_df = combine_df.append(target)

In [None]:
# The first len(train_df) rows of the combined frame are the training split.
# .copy() gives an independent frame, so the in-place edits made to it later do not
# operate on a slice of combine_df.
train_df_1 = combine_df.iloc[:train_df.shape[0], :].copy()

In [None]:
# Every row after the training rows is the validation split. The boundary is derived
# from train_df rather than hard-coded, so it stays correct if the data changes.
# .copy() gives an independent frame, so the in-place edits made to it later do not
# operate on a slice of combine_df.
test_df_1 = combine_df.iloc[train_df.shape[0]:, :].copy()

In [None]:
# Remove the 'Unnamed: 0' column from the training features.
train_df_1 = train_df_1.drop(columns=['Unnamed: 0'])

In [None]:
# Remove the 'Unnamed: 0' column from the validation features.
test_df_1 = test_df_1.drop(columns=['Unnamed: 0'])

In [None]:
# Encoded labels of the training rows. The boundary is derived from train_df instead of
# a hard-coded row count, and .iloc makes the positional slicing explicit.
train_target = target_Y.iloc[:train_df.shape[0]]

In [None]:
# Encoded labels of the validation rows. The boundary is derived from train_df instead of
# a hard-coded row count, and .iloc makes the positional slicing explicit.
test_target = target_Y.iloc[train_df.shape[0]:]

In [None]:
# Replace any remaining missing values in the training features with 0.
train_df_1 = train_df_1.fillna(0)

In [None]:
# Replace any remaining missing values in the validation features with 0.
test_df_1 = test_df_1.fillna(0)

In [None]:
# Remove the target column so only features remain in the training frame.
train_df_1 = train_df_1.drop(columns=['y'])

In [None]:
# Remove the target column so only features remain in the validation frame.
test_df_1 = test_df_1.drop(columns=['y'])

In [None]:
# Turn the values into an array for feeding the classification algorithms.
# The scaler is fitted on the training features only and then reused for the validation
# features, so both splits are standardised with the same mean and variance. Fitting a
# second scaler on the validation split would put the two splits on different scales.
feature_scaler = StandardScaler().fit(train_df_1.values)
X_train = feature_scaler.transform(train_df_1.values)
X_test = feature_scaler.transform(test_df_1.values)
y_train = train_target.values
y_test = test_target.values

In [None]:
# Display the (rows, features) shape of the scaled training matrix.
X_train.shape

(2671, 40)

In [None]:
# Let's implement simple classifiers

# Baseline models with default hyper-parameters, keyed by a display name.
# The decision tree is seeded so its result is reproducible across runs.
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42)
}

## Good cross-validated accuracy on the training set

In [None]:
from sklearn.model_selection import cross_val_score


# Report the mean 5-fold cross-validated accuracy on the training data for each baseline model.
for classifier in classifiers.values():
    # Also fit on the full training set so every estimator in `classifiers` is left fitted.
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    # Convert to percent before rounding, so the printed number carries no floating-point
    # residue (e.g. round(0.57, 2) * 100 evaluates to 56.99999999999999).
    mean_accuracy_pct = round(training_score.mean() * 100, 2)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", mean_accuracy_pct, "% accuracy score")

Classifiers:  LogisticRegression Has a training score of 98.0 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 97.0 % accuracy score
Classifiers:  SVC Has a training score of 98.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 99.0 % accuracy score


### Grid Search to get the best params

In [None]:
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV


# Logistic Regression
# The grid covers both L1 and L2 penalties. The default lbfgs solver does not support L1,
# so the liblinear solver (which supports both) is used to let every grid point fit
# instead of failing silently.
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
grid_log_reg.fit(X_train, y_train)
# Logistic regression best estimator
log_reg = grid_log_reg.best_estimator_

# K Nearest Neighbors
knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
grid_knears.fit(X_train, y_train)
# KNears best estimator
knears_neighbors = grid_knears.best_estimator_

# Support Vector Classifier
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)

# SVC best estimator
svc = grid_svc.best_estimator_

# DecisionTree Classifier
# Seeded so the selected tree is reproducible across runs.
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)),
              "min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_params)
grid_tree.fit(X_train, y_train)

# tree best estimator
tree_clf = grid_tree.best_estimator_

In [None]:
# Take the best logistic regression found by the grid search and fit it on the
# (non-resampled) training data.
# NOTE(review): the `_sm` suffix suggests SMOTE, but no resampled data is used here.
log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm.fit(X_train, y_train)

LogisticRegression(C=10)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the validation labels with the tuned logistic regression
# (no SMOTE resampling is involved in this prediction step).
y_pred_log_reg = log_reg_sm.predict(X_test)

In [None]:
from sklearn.metrics import classification_report


# Per-class precision, recall and F1 of the logistic regression on the validation split.
print('Logistic Regression:')
print(classification_report(y_test, y_pred_log_reg))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.84      0.11      0.19       336
           1       0.33      0.95      0.49       154

    accuracy                           0.38       490
   macro avg       0.59      0.53      0.34       490
weighted avg       0.68      0.38      0.29       490



In [None]:
# Predict the validation labels with the tuned k-nearest-neighbours model.
y_pred_knn = knears_neighbors.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the k-nearest-neighbours model on the validation split.
print('K Nearest Neighbor:')
print(classification_report(y_test, y_pred_knn))



K Nearest Neighbor:
              precision    recall  f1-score   support

           0       0.95      0.56      0.71       336
           1       0.49      0.94      0.65       154

    accuracy                           0.68       490
   macro avg       0.72      0.75      0.68       490
weighted avg       0.81      0.68      0.69       490



In [None]:
# Predict the validation labels with the tuned support vector classifier.
y_pred_svm = svc.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the support vector classifier on the validation split.
print('SVM:')
print(classification_report(y_test, y_pred_svm))




SVM:
              precision    recall  f1-score   support

           0       0.95      0.21      0.35       336
           1       0.36      0.97      0.53       154

    accuracy                           0.45       490
   macro avg       0.65      0.59      0.44       490
weighted avg       0.76      0.45      0.41       490



In [None]:
# Predict the validation labels with the tuned decision tree.
y_pred_dt = tree_clf.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the decision tree on the validation split.
print('Decision Tree:')
print(classification_report(y_test, y_pred_dt))

Decision Tree:
              precision    recall  f1-score   support

           0       0.98      0.12      0.21       336
           1       0.34      0.99      0.51       154

    accuracy                           0.39       490
   macro avg       0.66      0.56      0.36       490
weighted avg       0.78      0.39      0.30       490



## As we can see, linear models perform badly on this dataset - the non-linear algorithm KNN performs well on the dataset and the Decision Tree model overfits the data

### We try non-linear models like XGBoost and Random Forests, which can also reduce bias and variance

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted classifier with 20 trees, a binary logistic objective and a fixed seed;
# the listed metrics are the evaluation metrics configured on the model.
xgb_model = xgb.XGBClassifier(objective="binary:logistic", n_estimators=20, random_state=42, eval_metric=["auc", "error", "error@0.6"])



In [None]:
# Train the XGBoost model on the scaled training data.
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric=['auc', 'error', 'error@0.6'], gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [None]:
# Predict the validation labels with the XGBoost model.
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the XGBoost model on the validation split.
print('XGB Tree:')
print(classification_report(y_test, y_pred_xgb))

XGB Tree:
              precision    recall  f1-score   support

           0       0.98      0.55      0.70       336
           1       0.50      0.97      0.66       154

    accuracy                           0.68       490
   macro avg       0.74      0.76      0.68       490
weighted avg       0.83      0.68      0.69       490



In [None]:
# Random forest with 100 trees, seeded so the reported metrics are reproducible
# (the same seed is used for the XGBoost model in this notebook).
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf_rf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred_rf = clf_rf.predict(X_test)
 

In [None]:
# Per-class precision, recall and F1 of the random forest on the validation split.
print('Random Forest:')
print(classification_report(y_test, y_pred_rf))

Random Forest:
              precision    recall  f1-score   support

           0       0.97      0.57      0.72       336
           1       0.51      0.97      0.66       154

    accuracy                           0.69       490
   macro avg       0.74      0.77      0.69       490
weighted avg       0.83      0.69      0.70       490



### We clearly see that the accuracy goes up but is still relatively low ~ 70%

#### We can do 2 things 
- Use oversampling techniques such as SMOTE, as the dataset is imbalanced - more "good" than "bad" samples
- Get more data. We can't do that as this is a test, but we can try SMOTE

In [None]:
from collections import Counter
# NOTE(review): make_classification is imported but not used in the cells visible here.
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 
# Seeded SMOTE over-sampler applied to the training split only.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
# Show how many samples each class has after resampling.
print('Resampled dataset shape %s' % Counter(y_res))


Resampled dataset shape Counter({1: 2398, 0: 2398})


In [None]:
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
# Refits the existing clf_rf object on the SMOTE-resampled data, replacing its earlier fit.
clf_rf.fit(X_res, y_res)
 
# performing predictions on the test dataset
y_pred_smote_rf = clf_rf.predict(X_test)
 

In [None]:
# Per-class precision, recall and F1 of the random forest trained on SMOTE-resampled data.
print('Random Forest:')
print(classification_report(y_test, y_pred_smote_rf))

Random Forest:
              precision    recall  f1-score   support

           0       0.78      0.96      0.86       336
           1       0.81      0.40      0.54       154

    accuracy                           0.78       490
   macro avg       0.79      0.68      0.70       490
weighted avg       0.79      0.78      0.76       490



# The overall accuracy goes up with SMOTE