# Exploratory Notebook on Training Data

Goal: Try to find coorelations:

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
csv_path = os.path.join("..","data","train.csv")
train_data = pd.read_csv(csv_path)

In [3]:
train_data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


- All predictions came out to be 0. Hence the 90% mark.

https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

## Will need to Undersample 0's or Oversample 1's:
- Do not need all 200,000 rows if samples some are similar.
- Start by eliminating random entries and see if that makes a difference
- Create an algo to eliminate similar samples... maybe quantize samples and remove any duplicates


## Start with random undersampling

In [4]:
train_data_0 = train_data.loc[train_data.target == 0]
train_data_1 = train_data.loc[train_data.target == 1]

In [5]:
print(f"Length of target 0 samples: {len(train_data_0)}")
print(f"Length of target 1 samples: {len(train_data_1)}")

Length of target 0 samples: 179902
Length of target 1 samples: 20098


In [6]:
train_data_0.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [7]:
train_data_0_downsampled = train_data_0.sample(n=len(train_data_1))

In [8]:
# N = 5000 # x2 How many samples you want to try
train_data_undersampled = pd.concat([train_data_0_downsampled, train_data_1])

### Now try the algorithm again to see if the prediction shows different results. 

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [10]:
# Split into X and y
X = train_data_undersampled.drop(columns=["ID_code", "target"])
y = train_data_undersampled["target"].values.reshape(-1, 1)

In [11]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
steps = [("min_max_scaler", MinMaxScaler()), ("SVM", SVC())]
pipeline = Pipeline(steps)

In [13]:
parameteres = {'SVM__C': [10, 100, 500],
              'SVM__gamma': [0.001, 0.01]}

# C is not working for 1, try higher
# .00001 is not working well... need to try higher values:

In [14]:
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5, verbose=3)

In [15]:
grid.fit(X_train, y_train.ravel())

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] SVM__C=10, SVM__gamma=0.001 .....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  SVM__C=10, SVM__gamma=0.001, score=0.7681592039800995, total= 2.4min
[CV] SVM__C=10, SVM__gamma=0.001 .....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.8min remaining:    0.0s


[CV]  SVM__C=10, SVM__gamma=0.001, score=0.7849087893864013, total= 2.4min
[CV] SVM__C=10, SVM__gamma=0.001 .....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.6min remaining:    0.0s


[CV]  SVM__C=10, SVM__gamma=0.001, score=0.7752529441034998, total= 2.4min
[CV] SVM__C=10, SVM__gamma=0.001 .....................................
[CV]  SVM__C=10, SVM__gamma=0.001, score=0.773262564272682, total= 2.4min
[CV] SVM__C=10, SVM__gamma=0.001 .....................................
[CV]  SVM__C=10, SVM__gamma=0.001, score=0.777077458948416, total= 2.4min
[CV] SVM__C=10, SVM__gamma=0.01 ......................................
[CV]  SVM__C=10, SVM__gamma=0.01, score=0.7922056384742951, total= 2.1min
[CV] SVM__C=10, SVM__gamma=0.01 ......................................
[CV]  SVM__C=10, SVM__gamma=0.01, score=0.8041459369817578, total= 2.1min
[CV] SVM__C=10, SVM__gamma=0.01 ......................................
[CV]  SVM__C=10, SVM__gamma=0.01, score=0.790678387792337, total= 2.1min
[CV] SVM__C=10, SVM__gamma=0.01 ......................................
[CV]  SVM__C=10, SVM__gamma=0.01, score=0.7939956875103665, total= 2.1min
[CV] SVM__C=10, SVM__gamma=0.01 ........................

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 475.8min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'SVM__C': [10, 100, 500], 'SVM__gamma': [0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [16]:
print(f"Score = {grid.score(X_test,y_test.ravel())}")
print(f"Best Parameters: {grid.best_params_}")
print(f"Best Score: {grid.best_score_}")

Score = 0.795601552393273
Best Parameters: {'SVM__C': 10, 'SVM__gamma': 0.01}
Best Score: 0.7953361860218264


In [17]:
predictions = grid.predict(X_test)

In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_test.ravel(), predictions.ravel()))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80      4996
           1       0.80      0.79      0.80      5053

   micro avg       0.80      0.80      0.80     10049
   macro avg       0.80      0.80      0.80     10049
weighted avg       0.80      0.80      0.80     10049



In [19]:
# Score = 0.7909090909090909
# Best Parameters: {'SVM__C': 100, 'SVM__gamma': 0.01}