# Santander Customer Transaction Prediction - Group 7

Alexandra Ionascu, Shruti Bajpai, Laura El Aoufir

In [1]:
# import the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

from pandas                 import DataFrame
from pandas                 import Series
from pandas                 import read_csv
from pandas                 import get_dummies
from numpy                  import array
from numpy                  import random
from numpy                  import where
from numpy                  import nan
from scipy.stats.mstats     import winsorize
from sklearn.linear_model   import LogisticRegression
from sklearn.tree           import DecisionTreeClassifier
from sklearn.ensemble       import RandomForestClassifier
from sklearn.ensemble       import GradientBoostingClassifier
from sklearn.svm            import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors      import KNeighborsClassifier
from sklearn.metrics        import accuracy_score
from sklearn.metrics        import auc
from sklearn.metrics        import roc_auc_score
from scipy.stats            import pearsonr
from sklearn.metrics        import roc_curve
from matplotlib             import pyplot
from sklearn.model_selection  import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD


!pip install pandasql
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())  # Allow sqldf to access global environment



In [2]:
# Pr. Houdart functions

from pandas             import DataFrame
from pandas             import Series
from pandas             import read_csv
from numpy              import array
from numpy              import random


def roc (dataSet: DataFrame, actuals: str, probability: str) -> DataFrame:

    (fpr,tpr,threshold) = roc_curve(array(dataSet[actuals]), array(dataSet[probability]), pos_label = 1)

    returnData = DataFrame(tpr)
    returnData.columns = ["True positive rate"]
    returnData["False positive rate"] = DataFrame(fpr)

    return returnData

def lift (dataSet: DataFrame, actuals: str, probability: str, precision: int = 20) -> DataFrame:

    summary = cumulativeResponse(dataSet = dataSet, actuals = actuals, probability = probability, precision = precision)

    summary["Lift"] = summary["Cumulative response"] / Series(summary["Average response"]).max()
    summary["Base"] = summary["Average response"] / Series(summary["Average response"]).max()

    return summary[["Quantile","Lift","Base"]]

def cumulativeResponse (dataSet: DataFrame, actuals: str, probability: str, precision: int = 20) -> DataFrame:

    internalSet = equifrequentBinning (dataSet = dataSet[[actuals, probability]], byColumn = probability, into = precision)

    internalSet["Quantile"] = internalSet[probability + "_bin"] / precision
    internalSet["obs"]      = 1

    summary = internalSet[["Quantile", actuals, "obs"]].groupby(["Quantile"], as_index = False).sum().sort_values(by = "Quantile", ascending = False)

    summary["cumulativeTarget"]     = Series(summary[actuals]).cumsum(skipna = False)
    summary["cumulativeAll"]        = Series(summary["obs"]).cumsum(skipna = False)
    summary["Cumulative response"]  = summary["cumulativeTarget"] / summary["cumulativeAll"]
    summary["Average response"]     = Series(summary["cumulativeTarget"]).max() / Series(summary["cumulativeAll"]).max()

    return summary[["Quantile","Cumulative response","Average response"]]

def cumulativeGains (dataSet: DataFrame, actuals: str, probability: str, precision: int = 20) -> DataFrame:

    internalSet = equifrequentBinning (dataSet = dataSet[[actuals, probability]], byColumn = probability, into = precision)

    internalSet["Quantile"] = internalSet[probability + "_bin"] / precision
    internalSet["obs"]      = 1

    summary = internalSet[["Quantile", actuals, "obs"]].groupby(["Quantile"], as_index = False).sum().sort_values(by = "Quantile", ascending = False)

    summary["cumulativeTarget"]     = Series(summary[actuals]).cumsum(skipna = False)
    summary["cumulativeAll"]        = Series(summary["obs"]).cumsum(skipna = False)
    summary["Cumulative gains"]     = summary["cumulativeTarget"] / Series(summary["cumulativeTarget"]).max()
    summary["Base"]                 = summary["Quantile"]

    return summary[["Quantile","Cumulative gains","Base"]]

def equifrequentBinning (dataSet: DataFrame, byColumn: str, into: int) -> DataFrame:

    internalSet = dataSet

    quanitles = []

    for i in range(into):
        quanitles.append(1 / into * (i))

    quantile = internalSet.quantile(quanitles, axis = 0)[byColumn].to_dict()

    internalSet["Bin"] = 0

    for q in quantile:
        upperBound = quantile[q]
        internalSet.loc[internalSet[byColumn] >= upperBound, byColumn + "_bin"] = int(q * into +1)

    return internalSet

def partition (dataFrame : DataFrame, splitStrategy: [float]) -> [DataFrame]:

    def assignPartition (toDataFrame: DataFrame, lowerBound: float, upperBound: float, index: int) -> int:
        if toDataFrame["random"] >= lowerBound * observations and toDataFrame["random"] < upperBound * observations:
            return index
        else:
            return int(toDataFrame["Split"])

    if type(splitStrategy) != list:
        raise KeyError("Split strategy must be an array of floating point values.")
    elif sum(splitStrategy) != 1:
        raise ValueError("Split strategy must sum to 1.")
    else:
        observations = dataFrame.shape[0]
        partitions   = len(splitStrategy)

        cumulativeSplit = 0

        data = dataFrame.copy()
        data["random"] = random.permutation(observations)
        data["Split"]  = 0

    for index, split in enumerate(splitStrategy):
        lowerSplit = cumulativeSplit
        upperSplit = cumulativeSplit + split + 1
        cumulativeSplit += split
        data["Split"] = data.apply(lambda x: assignPartition(x,lowerSplit,upperSplit,index+1), axis = 1)

    partitions = []

    for i in range(len(splitStrategy)):
        partitions.append(data.loc[data["Split"] == i+1].drop(["Split","random"], axis = 1).reset_index(drop = True))

    return partitions


In [3]:
# options to display full output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# I. Reading In Data

In [3]:
# load the data
train = pd.read_csv('train.csv', sep=',')
test = pd.read_csv('test.csv', sep=',')
sample = pd.read_csv('sample_submission.csv', sep=',')

In [4]:
# check train
train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [5]:
# check train size
train.shape

(200000, 202)

In [18]:
# check train info
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB


In [19]:
# check train columns
train.columns

Index(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195', 'var_196', 'var_197', 'var_198', 'var_199'], dtype='object', length=202)

In [20]:
# check train data types
train.dtypes

ID_code     object
target       int64
var_0      float64
var_1      float64
var_2      float64
var_3      float64
var_4      float64
var_5      float64
var_6      float64
var_7      float64
var_8      float64
var_9      float64
var_10     float64
var_11     float64
var_12     float64
var_13     float64
var_14     float64
var_15     float64
var_16     float64
var_17     float64
var_18     float64
var_19     float64
var_20     float64
var_21     float64
var_22     float64
var_23     float64
var_24     float64
var_25     float64
var_26     float64
var_27     float64
var_28     float64
var_29     float64
var_30     float64
var_31     float64
var_32     float64
var_33     float64
var_34     float64
var_35     float64
var_36     float64
var_37     float64
var_38     float64
var_39     float64
var_40     float64
var_41     float64
var_42     float64
var_43     float64
var_44     float64
var_45     float64
var_46     float64
var_47     float64
var_48     float64
var_49     float64
var_50     f

In [21]:
# check train missing values
train.isnull().sum()

ID_code    0
target     0
var_0      0
var_1      0
var_2      0
var_3      0
var_4      0
var_5      0
var_6      0
var_7      0
var_8      0
var_9      0
var_10     0
var_11     0
var_12     0
var_13     0
var_14     0
var_15     0
var_16     0
var_17     0
var_18     0
var_19     0
var_20     0
var_21     0
var_22     0
var_23     0
var_24     0
var_25     0
var_26     0
var_27     0
var_28     0
var_29     0
var_30     0
var_31     0
var_32     0
var_33     0
var_34     0
var_35     0
var_36     0
var_37     0
var_38     0
var_39     0
var_40     0
var_41     0
var_42     0
var_43     0
var_44     0
var_45     0
var_46     0
var_47     0
var_48     0
var_49     0
var_50     0
var_51     0
var_52     0
var_53     0
var_54     0
var_55     0
var_56     0
var_57     0
var_58     0
var_59     0
var_60     0
var_61     0
var_62     0
var_63     0
var_64     0
var_65     0
var_66     0
var_67     0
var_68     0
var_69     0
var_70     0
var_71     0
var_72     0
var_73     0
var_74     0

In [22]:
# check train number of unique values per column
train.nunique()

ID_code    200000
target          2
var_0       94672
var_1      108932
var_2       86555
var_3       74597
var_4       63515
var_5      141029
var_6       38599
var_7      103063
var_8       98617
var_9       49417
var_10     128764
var_11     130193
var_12       9561
var_13     115181
var_14      79122
var_15      19810
var_16      86918
var_17     137823
var_18     139515
var_19     144180
var_20     127764
var_21     140062
var_22      90660
var_23      24913
var_24     105101
var_25      14853
var_26     127089
var_27      60185
var_28      35859
var_29      88339
var_30     145977
var_31      77388
var_32      85964
var_33     112239
var_34      25164
var_35     122384
var_36      96404
var_37      79040
var_38     115366
var_39     112674
var_40     141878
var_41     131896
var_42      31592
var_43      15188
var_44     127702
var_45     169968
var_46      93450
var_47     154781
var_48     152039
var_49     140641
var_50      32308
var_51     143455
var_52     121313
var_53    

### Train: Remove outliers and standardized

In [6]:
# winsorizing to take out outliers
for col in train.drop(['ID_code','target'],axis=1).columns:
    train[col] = winsorize(array(train[col]), limits=[0.00,0.01])

In [7]:
# standardization with min max scale
from sklearn import preprocessing

dont_include_cols = ['ID_code','target']

x = train.drop(dont_include_cols, axis=1).values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
train_st = pd.DataFrame(x_scaled)

In [8]:
# rename columns
train_st.columns = train.drop(dont_include_cols, axis=1).columns
train_st['target'] = train['target']
train_st['ID_code'] = train['ID_code']

### Test: Remove outliers and standardized

In [26]:
test.nunique()

ID_code    200000
var_0       65580
var_1       71661
var_2       61865
var_3       56507
var_4       49995
var_5       83228
var_6       33273
var_7       69487
var_8       67521
var_9       41583
var_10      79221
var_11      79749
var_12       9121
var_13      74037
var_14      58951
var_15      18253
var_16      61906
var_17      82518
var_18      82682
var_19      84370
var_20      78645
var_21      82738
var_22      63855
var_23      22619
var_24      70202
var_25      13728
var_26      78260
var_27      48428
var_28      31321
var_29      62618
var_30      84985
var_31      57146
var_32      61890
var_33      73157
var_34      22954
var_35      76756
var_36      66309
var_37      58742
var_38      74294
var_39      73292
var_40      83405
var_41      80327
var_42      28163
var_43      14288
var_44      78457
var_45      92058
var_46      65189
var_47      87427
var_48      86929
var_49      82973
var_50      28412
var_51      83881
var_52      76266
var_53      29631
var_54    

In [9]:
# winsorizing to take out outliers
for col in test.drop(['ID_code'],axis=1).columns:
    test[col] = winsorize(array(test[col]), limits=[0.00,0.01])

In [10]:
# standardization with min max scale
from sklearn import preprocessing

dont_include_cols = ['ID_code']

x = test.drop(dont_include_cols, axis=1).values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
test_st = pd.DataFrame(x_scaled)

In [11]:
# rename columns
test_st.columns = test.drop(dont_include_cols, axis=1).columns
test_st['ID_code'] = test['ID_code']

## Modelisation

In [30]:
train_st.columns.to_list()

['var_0',
 'var_1',
 'var_2',
 'var_3',
 'var_4',
 'var_5',
 'var_6',
 'var_7',
 'var_8',
 'var_9',
 'var_10',
 'var_11',
 'var_12',
 'var_13',
 'var_14',
 'var_15',
 'var_16',
 'var_17',
 'var_18',
 'var_19',
 'var_20',
 'var_21',
 'var_22',
 'var_23',
 'var_24',
 'var_25',
 'var_26',
 'var_27',
 'var_28',
 'var_29',
 'var_30',
 'var_31',
 'var_32',
 'var_33',
 'var_34',
 'var_35',
 'var_36',
 'var_37',
 'var_38',
 'var_39',
 'var_40',
 'var_41',
 'var_42',
 'var_43',
 'var_44',
 'var_45',
 'var_46',
 'var_47',
 'var_48',
 'var_49',
 'var_50',
 'var_51',
 'var_52',
 'var_53',
 'var_54',
 'var_55',
 'var_56',
 'var_57',
 'var_58',
 'var_59',
 'var_60',
 'var_61',
 'var_62',
 'var_63',
 'var_64',
 'var_65',
 'var_66',
 'var_67',
 'var_68',
 'var_69',
 'var_70',
 'var_71',
 'var_72',
 'var_73',
 'var_74',
 'var_75',
 'var_76',
 'var_77',
 'var_78',
 'var_79',
 'var_80',
 'var_81',
 'var_82',
 'var_83',
 'var_84',
 'var_85',
 'var_86',
 'var_87',
 'var_88',
 'var_89',
 'var_90',
 'var_91'

In [12]:
# pearson correlation to select the features (sign = 0.005
selectedFeatures = []
target = "target"

trainingSet_columns = train_st.drop(['target','ID_code'],axis=1).columns.to_list()

for column in trainingSet_columns:
        (pearson,pvalue) = pearsonr(train_st[column],train_st[target])
        print(f"{column} - p-value = {pvalue} - selected : {1 if pvalue < 0.05 else 0}")
        if pvalue < 0.05:
            selectedFeatures.append(column)

var_0 - p-value = 7.6429054662627255e-121 - selected : 1
var_1 - p-value = 2.6993783820423167e-111 - selected : 1
var_2 - p-value = 5.020262235878718e-137 - selected : 1
var_3 - p-value = 1.307363971085223e-06 - selected : 1
var_4 - p-value = 1.026604141928873e-06 - selected : 1
var_5 - p-value = 5.1233323926587986e-43 - selected : 1
var_6 - p-value = 8.783748987303271e-195 - selected : 1
var_7 - p-value = 0.17147548510010868 - selected : 0
var_8 - p-value = 1.6177855821477955e-18 - selected : 1
var_9 - p-value = 9.427348235719134e-82 - selected : 1
var_10 - p-value = 0.347537734860646 - selected : 0
var_11 - p-value = 1.7088149397247163e-24 - selected : 1
var_12 - p-value = 5.700002761674831e-214 - selected : 1
var_13 - p-value = 2.946740027691815e-135 - selected : 1
var_14 - p-value = 0.005189733791129805 - selected : 1
var_15 - p-value = 1.552183788251263e-14 - selected : 1
var_16 - p-value = 0.0002917524031947955 - selected : 1
var_17 - p-value = 0.6987301345084505 - selected : 0
v

var_162 - p-value = 1.1833517466506236e-47 - selected : 1
var_163 - p-value = 2.6913691188184742e-45 - selected : 1
var_164 - p-value = 1.360258900648927e-74 - selected : 1
var_165 - p-value = 9.374909533978049e-139 - selected : 1
var_166 - p-value = 1.0287488836159223e-148 - selected : 1
var_167 - p-value = 3.9006273072893835e-43 - selected : 1
var_168 - p-value = 3.7955515917734415e-10 - selected : 1
var_169 - p-value = 2.4150657185979448e-104 - selected : 1
var_170 - p-value = 9.113883261581193e-100 - selected : 1
var_171 - p-value = 4.4729653673018545e-11 - selected : 1
var_172 - p-value = 3.379461219992791e-65 - selected : 1
var_173 - p-value = 3.8329974428562924e-79 - selected : 1
var_174 - p-value = 1.7933796898589801e-168 - selected : 1
var_175 - p-value = 4.887649269662728e-22 - selected : 1
var_176 - p-value = 0.000865187638027705 - selected : 1
var_177 - p-value = 1.493599665363021e-61 - selected : 1
var_178 - p-value = 9.11303093974387e-19 - selected : 1
var_179 - p-value =

In [13]:
# LASSO REGRESSION
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

trainingSet_columns = train_st.drop(['target','ID_code'],axis=1).columns.to_list()
target = "target"

scaler = StandardScaler()
scaler.fit(train_st[trainingSet_columns].fillna(0))

sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l2'))
sel_.fit(scaler.transform(train_st[trainingSet_columns].fillna(0)), train_st[target])

SelectFromModel(estimator=LogisticRegression(C=1))

In [14]:
selected_feat = train_st[trainingSet_columns].columns[(sel_.get_support())]
print('total features: {}'.format((train_st[trainingSet_columns].shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 200
selected features: 97
features with coefficients shrank to zero: 0


In [26]:
basetable = pd.DataFrame(train_st[selectedFeatures], train_st[target])
basetable.shape

(200000, 181)

In [27]:
basetable.to_csv("basetable.csv")

In [34]:
# selectedFeatures = selected_feat

In [29]:
# check length of columns
print(len(trainingSet_columns))
print(len(selectedFeatures))

200
181


In [31]:
# instanciate the models
logistic     = LogisticRegression()
tree         = DecisionTreeClassifier()
randomForest = RandomForestClassifier()
boostedTree  = GradientBoostingClassifier()
neuralNet    = MLPClassifier()
neighbors    = KNeighborsClassifier()

In [32]:
# create a dict to loop through the models later on
models = {"logistic"     :logistic,
          "tree"         :tree,
          "boostedTree"  :boostedTree }

In [14]:
train['target'].value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [16]:
20098/200000

0.10049

## Validation set

In [17]:
# create train and test set partition
pre_train, valid = train_test_split(train_st, test_size=0.2, random_state=42)

In [65]:
pre_train.to_csv("basetable.csv")

In [48]:
# fit the models
for model in models:
    models[model].fit(pre_train[selectedFeatures],pre_train[target])
    
    print(f"{model} has been trained successfully")

logistic has been trained successfully
tree has been trained successfully
boostedTree has been trained successfully


## Accuracy and AUC function definition


In [51]:
# compute AUC and accuracy scores

performances = {}

for model in models:
    predictions   = models[model].predict(valid[selectedFeatures])
    probabilities = DataFrame(models[model].predict_proba(valid[selectedFeatures]))[1]
    accuracy      = accuracy_score(valid[target],predictions)
    auc           = roc_auc_score(array(valid[target]),array(probabilities))
    
    performances[model] = {"Accuracy":accuracy,"AUC":auc}

In [59]:
# get predict on kaggle test set
test_st['proba_lr'] = DataFrame(models['logistic'].predict(test_st[selectedFeatures]))
test_st['proba_lr'].value_counts()

0    193394
1      6606
Name: proba_lr, dtype: int64

In [60]:
# get predict on kaggle test set
test_st['proba_boosted'] = DataFrame(models['boostedTree'].predict(test_st[selectedFeatures]))
test_st['proba_boosted'].value_counts()

0    199347
1       653
Name: proba_boosted, dtype: int64

In [62]:
# get predict on kaggle test set
test_st['proba_tree'] = DataFrame(models['tree'].predict(test_st[selectedFeatures]))
test_st['proba_tree'].value_counts()

0    174323
1     25677
Name: proba_tree, dtype: int64

In [52]:
# format to dataframe
pd.DataFrame(performances)

Unnamed: 0,logistic,tree,boostedTree
Accuracy,0.912925,0.833475,0.900275
AUC,0.859806,0.560622,0.83339


In [63]:
# save predictions to csv
proba_cols = [col for col in test_st.columns if 'proba' in col]
for col in proba_cols:
    predictions_csv = test_st[['ID_code',col]]
    predictions_csv.rename({col:'target'},axis=1,inplace=True)
    predictions_csv.to_csv('predictions_' + col + '_csv.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


### Cross validation setup

In [42]:
# instanciate number of folds
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [43]:
# split the train data
kf.split(pre_train)

<generator object _BaseKFold.split at 0x0000014936C4BCF0>

In [44]:
# calculate AUC score for all models' fold
from sklearn.model_selection import cross_val_score

for model in models:
    print(model, cross_val_score(models[model],valid[selectedFeatures],valid[target],scoring="roc_auc"))

logistic [0.84881009 0.86134826 0.85585673 0.8633326  0.85106172]
tree [0.54863456 0.55397978 0.55396865 0.55341395 0.55069808]
boostedTree [0.8182349  0.82954765 0.82124175 0.82144405 0.81591531]


In [45]:
# logistic_cv = [0.84880839, 0.86134809, 0.85585673, 0.8633326, 0.85106172]
# tree_cv = [0.55477188, 0.54090193, 0.551789, 0.5460799, 0.55073205]
# boostedTree_cv = [0.81819749, 0.82954765, 0.82119992, 0.82144405, 0.81591531]

In [46]:
# # compute mean and standard deviation

# import statistics as stat

# results_cv = {"logistic_cv":logistic_cv,
#               "tree_cv":tree_cv,
# #               "randomForest_cv":randomForest_cv,
#               "boostedTree_cv":boostedTree_cv,
# #               "neuralNet_cv":neuralNet_cv,
# #               "neighbors_cv":neighbors_cv
#              }

# for key, model in results_cv.items():
#     print(key, "---mean",round(sum(model) / len(model),3), "---stddev", round(stat.stdev(model),3), "---variance", round(np.var(model),5))

logistic_cv ---mean 0.856 ---stddev 0.006 ---variance 3e-05
tree_cv ---mean 0.549 ---stddev 0.005 ---variance 2e-05
boostedTree_cv ---mean 0.821 ---stddev 0.005 ---variance 2e-05


### Decision tree with grid search and oversampling

In [18]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(train_st[selectedFeatures], train_st[target])

In [19]:
#train test split for oversampling
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [20]:
from sklearn.model_selection import GridSearchCV

dtree_model = DecisionTreeClassifier()
param_grid = {'max_depth': np.arange(3, 10)}

dtree_gs = GridSearchCV(dtree_model, param_grid, cv=5)
dtree_model = dtree_gs.fit(pre_train[selectedFeatures], pre_train[target])


In [21]:
dtree_predictions = dtree_model.predict(valid[selectedFeatures])
dtree_score = dtree_model.score(valid[selectedFeatures], valid[target])
roc = roc_auc_score(valid[target], dtree_model.predict_proba(valid[selectedFeatures])[:,1])
print(dtree_gs, dtree_score,roc)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': array([3, 4, 5, 6, 7, 8, 9])}) 0.897625 0.6067134650790798


In [23]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(dtree_model, X_val,y_val, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

[0.63832152 0.64188438 0.62979433 0.62076153 0.63229572 0.62729294
 0.64188438 0.62576431 0.63715953 0.6459144 ]


In [22]:
predition_tree = dtree_model.predict(test_st[selectedFeatures])
test_st['pred_tree']  = predition_tree
pd.DataFrame(test_st[['ID_code','pred_tree']]).to_csv("decision_tree_smote.csv")

### Logistic regression + Oversampling

In [15]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(train_st[selectedFeatures], train_st[target])

In [16]:
#train test split for oversampling
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [18]:
lg =  LogisticRegression()
lg_model_smote = lg.fit(X_train, y_train)
lg_predictions_smote = lg_model_smote.predict(X_test)
roc = roc_auc_score(y_test, lg_model_smote.predict_proba(X_test)[:,1])
lg_score = lg_model_smote.score(X_test, y_test)
print(lg_model_smote, lg_score, roc)

LogisticRegression() 0.7988632731618516 0.8779990731317955


In [25]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(lg_model_smote, X_val,y_val, scoring='accuracy', cv=cv, n_jobs=-1)

In [26]:
print(scores)

[0.79824927 0.81183991 0.79571984 0.7979433  0.79655364 0.79474708
 0.79446915 0.78474152 0.79655364 0.80600334]


In [23]:
### Predict on kaggdle test set
result_test_log_smote = lg_model_smote.predict(test_st[selectedFeatures])
test['prdictions'] = result_test_log_smote
result_test_log_smote_df = pd.DataFrame(test[['ID_code','prdictions']])
result_test_log_smote_df
result_test_log_smote_df.to_csv("Lasso_Logistic_smote_Laura.csv")

### Random forest with grid search

In [None]:
from sklearn.ensemble import RandomForestClassifier


rfc_model = RandomForestClassifier(oob_score = True) 
param_grid = {'n_estimators': [50, 100, 200, 400], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10,40,50,70,100],
             'min_samples_split': [2, 5, 10],'min_samples_leaf' : [1, 2, 4]}
rfc_gs = GridSearchCV(estimator=rfc_model, param_grid=param_grid, cv= 5)
rfc_model = rfc_gs.fit(X_train, y_train)
rfc_predictions = rfc_model.predict(X_test)
roc_rfc = roc_auc_score(y_test, rfc_model.predict_proba(X_test)[:,1])
rfc_score = rfc_model.score(X_test, y_test)

print("Accuracy: ", rfc_score, "AUC:",  roc_rfc)

In [None]:
rfc_result = lg_model.predict(test_filtered)
test['prdictions_rfc'] = rfc_result
rfc_result_df = pd.DataFrame(test[['ID_code','prdictions_rfc']])
rfc_result_df.to_csv("Rfc.csv")
