In [0]:
import pandas as pd
import numpy as np
from google.colab import drive
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder 
import os

***Week1-4***

In [3]:

# Mount Google Drive into the Colab filesystem at /content/drive.
# force_remount=True asks Colab to mount again even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Make the Drive root the working directory so the relative CSV paths used below resolve.
# NOTE(review): this absolute path is Colab-specific; the notebook will not run elsewhere as-is.
os.chdir("/content/drive/My Drive")

In [0]:
# Load the two raw training tables from the current working directory.
transaction_train, identity_train = (
    pd.read_csv(csv_name)
    for csv_name in ('train_transaction.csv', 'train_identity.csv')
)

***A) Data preprocessing***

Let's join transaction_train and identity_train.

In [0]:
# Left-join the identity table onto the transactions via the shared TransactionID key:
# every transaction row is kept, and identity columns are NaN where no identity record exists.
dataset = transaction_train.join(
    identity_train.set_index('TransactionID'),
    on='TransactionID',
    how='left',
)

In [10]:
# (rows, columns) of the joined table and of each of its two source tables.
tuple(frame.shape for frame in (dataset, transaction_train, identity_train))

((590540, 434), (590540, 394), (144233, 41))

Let's take a look at the first rows.

In [11]:
# Preview the first five rows of the joined dataset.
dataset.head(n=5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,...,0.0,70787.0,,,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


We do not know exactly what each column describes. After reading the discussions on the Kaggle website, we found the following information.

TransactionDT: timedelta from a given reference datetime (not an actual timestamp)

TransactionAmt: the transaction amount in USD

ProductCD: Product code for each transaction

card1 - card6: Information about payment card 

addr: Address for purchaser and recipient

addr1 as billing region

addr2 as billing country

dist: distance
"Distances between (not limited to) billing address, mailing address, zip code, IP address, phone area, etc."


P_ and (R_) emaildomain: Purchaser and recipient email domain 

C1-C14: Counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.

D1-D15: Timedelta, such as days between previous transaction, etc.

M1-M9: Match, such as names on card and address, etc.

Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

id_01 - id_11: Numerical features for identity

isFraud = 1 means a fraudulent transaction; isFraud = 0 means a non-fraudulent transaction.


Let us look at the number of rows and columns.

In [12]:
# Report the dimensions of the joined dataset as a (rows, columns) tuple.
n_rows, n_cols = dataset.shape
print("The shape of data :", (n_rows, n_cols))

The shape of data : (590540, 434)


The joined dataset has 434 columns (including the TransactionID key and the isFraud target) and 590540 rows.
What are the column names?

Let's check missing value.

In [13]:
# Number of missing (NaN) entries in each column.
dataset.isna().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
id_36             449555
id_37             449555
id_38             449555
DeviceType        449730
DeviceInfo        471874
Length: 434, dtype: int64

We remove every feature for which at least 75% of the observations are NaN.

In [0]:
# Drop every column whose share of missing values is at least 75%.
# The per-column NaN ratio is computed once and all sparse columns are removed in a
# single call, instead of rebuilding the whole DataFrame once per dropped column.
missing_ratio = dataset.isnull().mean()
sparse_columns = missing_ratio[missing_ratio >= 0.75].index
dataset = dataset.drop(columns=sparse_columns)

Let's extract the target variable (isFraud).

In [0]:
# Target vector: the isFraud label of every transaction.
y = dataset['isFraud']

In [16]:
# Relative frequency of each class (normalize=True returns proportions instead of counts).
y.value_counts(normalize= True)

0    0.96501
1    0.03499
Name: isFraud, dtype: float64

About 96.5% of the transactions are non-fraudulent and about 3.5% are fraudulent, so the classes are highly imbalanced.

Now, we drop isFraud and TransactionID from the dataset.

In [0]:
# Remove the label and the row identifier so that only predictor columns remain.
dataset = dataset.drop(columns=['isFraud', 'TransactionID'])

In [18]:
# Report the (rows, columns) dimensions left after the column drops above.
remaining_shape = dataset.shape
print("The shape of trainig data after dropping features:", remaining_shape)

The shape of trainig data after dropping features: (590540, 224)


 Data cleaning

Let's replace the missing values. We use the datacleaner library for the numerical features: its autoclean function replaces the NaN values of each numerical column with that column's median.

Its documentation can be found here:
https://github.com/rhiever/datacleaner

In [19]:
!pip install datacleaner



In [0]:
from datacleaner import autoclean

We create a mask for categorical features

In [21]:
# Columns whose dtype is `object` are treated as the categorical features.
mask1 = dataset.dtypes == object
categoricalColumns = dataset.columns[mask1]
# Sub-frame restricted to those columns, followed by a preview of its first rows.
categoricalFeatures = dataset.loc[:, categoricalColumns]
categoricalFeatures.head(n=5)

Unnamed: 0,ProductCD,card4,card6,P_emaildomain,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,W,discover,credit,,T,T,T,M2,F,T,,,
1,W,mastercard,credit,gmail.com,,,,M0,T,T,,,
2,W,visa,debit,outlook.com,T,T,T,M0,F,F,F,F,F
3,W,mastercard,debit,yahoo.com,,,,M0,T,F,,,
4,H,mastercard,credit,gmail.com,,,,,,,,,


We create a mask for numerical features

In [22]:
# Every column whose dtype is not `object` is treated as a numerical feature.
mask2 = dataset.dtypes != object
numericalColumns = dataset.columns[mask2]
# Take an explicit copy: this frame is modified in place by the cleaning step that
# follows, and mutating a selection of `dataset` triggers pandas' SettingWithCopyWarning.
numericalFeatures = dataset[numericalColumns].copy()
numericalFeatures.head()

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D10,D11,D15,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,86400,68.5,13926,,150.0,142.0,315.0,87.0,19.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,13.0,13.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,0.0,,0.0,,,,,,,,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,0.0,315.0,315.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,84.0,,111.0,,,,,,,,,,...,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0,38.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Impute the numerical features with datacleaner's autoclean and rebind the name to its result.
numericalFeatures = autoclean(input_dataframe=numericalFeatures)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [24]:
# Preview the first five rows after imputation.
numericalFeatures.head(n=5)

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D10,D11,D15,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,86400,68.5,13926,361.0,150.0,142.0,315.0,87.0,19.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,97.0,13.0,26.0,10.0,13.0,13.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,8.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,97.0,8.0,0.0,10.0,0.0,43.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,97.0,8.0,0.0,10.0,0.0,315.0,315.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,8.0,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,84.0,43.0,111.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0,38.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,8.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,97.0,8.0,26.0,10.0,15.0,43.0,52.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Standardization of numerical features.

In [0]:
# Z-score standardization: centre every numerical column on 0 and scale it to unit variance.
# A constant column has a standard deviation of 0, which would turn the whole column into
# NaN (0/0); its divisor is replaced by 1 so that such a column becomes all zeros instead.
# NOTE(review): the mean/std are computed on the full dataset, i.e. before the train/test
# split made later in the notebook, so test-set statistics leak into the training features.
numericalFeatures = (
    (numericalFeatures - numericalFeatures.mean())
    / numericalFeatures.std().replace(0, 1)
)

In [27]:
# Preview the first five rows after standardization.
numericalFeatures.head(n=5)

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D10,D11,D15,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,-1.577985,-0.278167,0.821695,-0.009783,-0.281424,-1.396379,0.243321,0.069833,-0.138579,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.053938,-0.208711,-0.054826,-0.087363,-0.047034,-0.243806,-0.147246,-0.508743,-0.285238,-0.132212,-0.480224,-0.242242,-0.556176,-0.585028,-0.768856,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0.198101,0.005694,-0.26163,-0.355724,-0.164931,-0.332323,-0.427193,-0.393307,-0.133971,-0.040597,-0.063552,-0.045782,-0.033239,-0.05521,-0.100639,-0.141693,-0.094117,-0.099554,-0.156979,-0.163187,-0.52244,-0.454029,-0.500162,-0.002603,-0.059496,-0.066411,-0.076246,-0.094585,-0.334844,-0.041046,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.032814,-0.058049,-0.055287,-0.088855,-0.074142
1,-1.577985,-0.443327,-1.457557,0.26481,-0.281424,-2.368252,0.347542,0.069833,-0.183967,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.053938,-0.268682,-0.054826,-0.097964,-0.047034,-0.243806,-0.147246,-0.597605,-0.285238,-0.23735,-0.633627,-0.242242,-0.630751,-0.377609,-0.768856,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0.198101,0.005694,-0.26163,-0.355724,-0.164931,-0.332323,-0.427193,-0.393307,-0.133971,-0.040597,-0.063552,-0.045782,-0.058539,-0.05521,-0.100639,-0.141693,-0.094117,-0.099554,-0.156979,-0.163187,-0.52244,-0.454029,-0.500162,-0.002603,-0.059496,-0.093051,-0.076246,-0.094585,-0.334844,-0.041046,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
2,-1.57797,-0.317889,-1.068262,0.813996,-0.281424,-0.813254,0.399653,0.069833,0.967245,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.053938,-0.208711,-0.054826,-0.097964,-0.047034,-0.243806,-0.147246,-0.597605,-0.285238,-0.23735,-0.633627,-0.242242,-0.630751,1.502993,0.880013,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0.198101,0.005694,-0.26163,-0.355724,-0.164931,-0.332323,-0.427193,-0.393307,-0.133971,-0.040597,-0.063552,-0.045782,-0.058539,-0.05521,-0.100639,-0.141693,-0.094117,-0.099554,-0.156979,-0.163187,-0.52244,-0.454029,-0.500162,-0.002603,-0.059496,-0.093051,-0.076246,-0.094585,-0.334844,-0.041046,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
3,-1.577964,-0.35552,1.679857,1.305709,-0.281424,-2.0038,1.92129,0.069833,-0.183967,-0.090533,-0.066398,-0.037493,-0.059438,-0.21606,-0.070916,-0.046146,-0.053938,-0.208711,-0.054826,-0.097964,-0.047034,-0.058284,-0.147246,0.113291,-0.172822,-0.405571,-0.079017,-0.399848,-0.148879,-0.377609,-0.187826,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,-0.887761,-0.636544,-0.26163,2.690805,-0.164931,3.374863,-0.427193,-0.393307,-0.133971,-0.040597,-0.063552,0.002789,0.902852,0.869194,-0.100639,-0.141693,-0.094117,-0.099554,-0.156979,-0.163187,-0.52244,-0.454029,-0.500162,-0.002603,-0.038209,0.307227,0.229853,-0.094585,0.668046,-0.041046,0.556723,-0.222385,-0.249222,-0.229148,-0.026351,0.290552,0.224769,-0.055287,-0.088855,-0.074142
4,-1.577962,-0.35552,-1.102132,0.967258,-0.281424,-2.368252,1.337648,0.069833,-0.183967,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.043454,-0.268682,-0.044364,-0.097964,-0.047034,-0.243806,-0.147246,-0.597605,-0.285238,-0.23735,-0.480224,-0.242242,-0.544703,-0.377609,-0.496662,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0.198101,0.005694,-0.26163,-0.355724,-0.164931,-0.332323,-0.427193,-0.393307,-0.133971,-0.040597,-0.063552,-0.045782,-0.058539,-0.05521,-0.100639,-0.141693,-0.094117,-0.099554,-0.156979,-0.163187,1.552744,1.149554,1.39294,-0.002603,-0.059496,-0.093051,-0.076246,-0.094585,-0.334844,-0.041046,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142


In [0]:
def encoding(categorical=None, numerical=None):
  """One-hot encode the categorical features and append them to the numerical ones.

  Parameters
  ----------
  categorical : pandas.DataFrame, optional
      Frame whose columns are all one-hot encoded. Defaults to the notebook-level
      ``categoricalFeatures`` so that the existing ``encoding()`` call keeps working.
  numerical : pandas.DataFrame, optional
      Frame the dummy columns are joined onto (aligned on the index). Defaults to
      the notebook-level ``numericalFeatures``.

  Returns
  -------
  pandas.DataFrame
      The numerical columns followed by one indicator column per
      (categorical column, observed value) pair.
  """
  # Fall back to the notebook globals only when the caller passes nothing; explicit
  # arguments make the function usable (and testable) without hidden state.
  if categorical is None:
    categorical = categoricalFeatures
  if numerical is None:
    numerical = numericalFeatures
  # get_dummies does not create an indicator for missing values by default, so a row
  # with a NaN category gets 0 in every indicator column of that feature.
  dummies = pd.get_dummies(data=categorical, columns=categorical.columns)
  return numerical.join(dummies)
  

In [0]:
# Build the final feature matrix: the numerical features joined with the one-hot encoded categorical ones.
datasetEncoded = encoding()

In [30]:
# (rows, columns) of the encoded feature matrix.
datasetEncoded.shape

(590540, 302)

In [31]:
# Preview the first five rows of the encoded feature matrix.
datasetEncoded.head(n=5)

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D10,D11,D15,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,P_emaildomain_protonmail.com,P_emaildomain_ptd.net,P_emaildomain_q.com,P_emaildomain_roadrunner.com,P_emaildomain_rocketmail.com,P_emaildomain_sbcglobal.net,P_emaildomain_sc.rr.com,P_emaildomain_servicios-ta.com,P_emaildomain_suddenlink.net,P_emaildomain_twc.com,P_emaildomain_verizon.net,P_emaildomain_web.de,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com,M1_F,M1_T,M2_F,M2_T,M3_F,M3_T,M4_M0,M4_M1,M4_M2,M5_F,M5_T,M6_F,M6_T,M7_F,M7_T,M8_F,M8_T,M9_F,M9_T
0,-1.577985,-0.278167,0.821695,-0.009783,-0.281424,-1.396379,0.243321,0.069833,-0.138579,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.053938,-0.208711,-0.054826,-0.087363,-0.047034,-0.243806,-0.147246,-0.508743,-0.285238,-0.132212,-0.480224,-0.242242,-0.556176,-0.585028,-0.768856,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0
1,-1.577985,-0.443327,-1.457557,0.26481,-0.281424,-2.368252,0.347542,0.069833,-0.183967,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.053938,-0.268682,-0.054826,-0.097964,-0.047034,-0.243806,-0.147246,-0.597605,-0.285238,-0.23735,-0.633627,-0.242242,-0.630751,-0.377609,-0.768856,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0
2,-1.57797,-0.317889,-1.068262,0.813996,-0.281424,-0.813254,0.399653,0.069833,0.967245,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.053938,-0.208711,-0.054826,-0.097964,-0.047034,-0.243806,-0.147246,-0.597605,-0.285238,-0.23735,-0.633627,-0.242242,-0.630751,1.502993,0.880013,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0
3,-1.577964,-0.35552,1.679857,1.305709,-0.281424,-2.0038,1.92129,0.069833,-0.183967,-0.090533,-0.066398,-0.037493,-0.059438,-0.21606,-0.070916,-0.046146,-0.053938,-0.208711,-0.054826,-0.097964,-0.047034,-0.058284,-0.147246,0.113291,-0.172822,-0.405571,-0.079017,-0.399848,-0.148879,-0.377609,-0.187826,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0
4,-1.577962,-0.35552,-1.102132,0.967258,-0.281424,-2.368252,1.337648,0.069833,-0.183967,-0.09802,-0.09226,-0.037493,-0.059438,-0.21606,-0.112869,-0.046146,-0.043454,-0.268682,-0.044364,-0.097964,-0.047034,-0.243806,-0.147246,-0.597605,-0.285238,-0.23735,-0.480224,-0.242242,-0.544703,-0.377609,-0.496662,0.005365,-0.135535,-0.174217,0.246322,0.184755,-0.137377,-0.171279,-0.107532,-0.132158,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


***Week5-Today***

***B) Training and prediction***

Let's define an auc function that returns the ROC AUC score.

In [0]:
from sklearn.metrics import roc_auc_score


def auc(y_true, y_pred):
    """Compute the area under the ROC curve.

    Thin wrapper that forwards both arguments, unchanged, to
    ``sklearn.metrics.roc_auc_score`` and returns its result.

    Example: ``auc(y_true, y_prediction)``
    """
    score = roc_auc_score(y_true, y_pred)
    return score

***Hyperparameter tuning for KNN***

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [0]:
# Fraction of the rows held out for testing.
p = 0.3
# stratify=y keeps the fraud / non-fraud proportions (about 3.5% fraud) the same in the
# training and test partitions; a purely random split of such an imbalanced target can
# shift the class ratio between the two sets. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    datasetEncoded, y, test_size=p, random_state=42, stratify=y
)

In [0]:
# `auc(y_true, y_pred)` is already defined earlier in this notebook, in the cell that
# imports `roc_auc_score`. The identical second definition that used to live here was
# removed so that the metric has a single definition to maintain.

In [0]:
# Candidate neighbourhood sizes to be explored by the grid search.
parameters = dict(n_neighbors=[2, 3, 5])
# k-nearest-neighbours classifier constructed with no arguments, i.e. library defaults.
model = KNeighborsClassifier()

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
# 3-fold cross-validated grid search over `parameters`.
# Candidates are ranked by ROC AUC rather than by the default accuracy: with about 96.5%
# non-fraud rows, always predicting "not fraud" already reaches ~96.5% accuracy, so
# accuracy cannot discriminate between candidates. AUC is also this notebook's metric.
grid_search = GridSearchCV(model, parameters, scoring="roc_auc", cv=3)

***Hyperparameter tuning for GradientBoostingClassifier***

In [0]:
from sklearn.ensemble import GradientBoostingClassifier


In [0]:
model = GradientBoostingClassifier()
# Number of boosting stages to try: 1000, 1100, 1200, 1300, 1400.
# The grid key must be `n_estimators`: `n_neighbors` is a k-NN parameter that
# GradientBoostingClassifier does not have, so fitting the search would raise an
# "invalid parameter" error.
parameters = {'n_estimators': list(range(1000, 1500, 100))}
# 3-fold cross-validated search ranked by ROC AUC, the notebook's evaluation metric;
# the default accuracy score is uninformative on a target with ~3.5% positives.
grid_search = GridSearchCV(model, parameters, scoring="roc_auc", cv=3)