In [None]:
# Mount Google Drive at /content/drive; the data path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

## Signate MUFG 데이터 분석 프로젝트

- 1. Preprocessing

- 2. EDA

- 3. Modeling

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

import missingno as msno
import warnings

# Use the white Plotly template for every Plotly figure created below.
pio.templates.default = 'plotly_white'
# Silence all Python warnings for the rest of the notebook (deprecation notices are hidden too).
warnings.filterwarnings('ignore')

In [None]:
path = '/content/drive/MyDrive/Signate/2023.08.30 MUFG/data/'

In [None]:
train_df = pd.read_csv(path + 'merge_train_df.csv')
test_df = pd.read_csv(path + 'merge_test_df.csv')

In [None]:
train_df.drop(['Unnamed: 0'] , axis = 1 , inplace = True)
test_df.drop(['Unnamed: 0'] , axis = 1 , inplace = True)

In [None]:
target = train_df['is_fraud?']
train_df.drop(['is_fraud?'] , axis = 1 , inplace = True)

### 1. Preprocessing

In [None]:
train_df.info()

In [None]:
train_df.iloc[: , 0 : 10].head(3)

In [None]:
train_df.iloc[: , 11 : 20].head(3)

In [None]:
train_df.iloc[: , 21 : 30].head(3)

In [None]:
train_df.iloc[: , 31 : 34].head(3)

In [None]:
# 1) amount

data = train_df['amount']
data = data.str.replace('$' , '').str.replace(',' , '').astype(float)

train_df['amount'] = data

In [None]:
test_df['amount'] = test_df['amount'].str.replace('$' , '').str.replace(',' , '').astype(float)

In [None]:
# 2) credit_limit

data = train_df['credit_limit']
data = data.str.replace('$' , '').str.replace(',' , '').astype(int)

train_df['credit_limit'] = data

In [None]:
test_df['credit_limit'] = test_df['credit_limit'].str.replace('$' , '').str.replace(',' , '').astype(int)

In [None]:
# 3) per_capita_income_zipcode

data = train_df['per_capita_income_zipcode']
data = data.str.replace('$' , '').str.replace(',' , '').astype(int)

train_df['per_capita_income_zipcode'] = data

In [None]:
test_df['per_capita_income_zipcode'] = test_df['per_capita_income_zipcode'].str.replace('$' , '').str.replace(',' , '').astype(int)

In [None]:
# 4) total_debt

data = train_df['total_debt']
data = data.str.replace('$' , '').str.replace(',' , '').astype(int)

train_df['total_debt'] = data

In [None]:
test_df['total_debt'] = test_df['total_debt'].str.replace('$' , '').str.replace(',' , '').astype(int)

In [None]:
# 4-2 yearly_income_person

data = train_df['yearly_income_person']
data = data.str.replace('$' , '').str.replace(',' , '').astype(int)

train_df['yearly_income_person'] = data

In [None]:
test_df['yearly_income_person'] = test_df['yearly_income_person'].str.replace('$' , '').str.replace(',' , '').astype(int)

In [None]:
# 5) expires

# Remove the '/' separator once; the first two characters become Month and
# the remaining characters become Year.
expires_compact = train_df['expires'].str.replace('/' , '')

data = pd.DataFrame({
    'Month' : expires_compact.str.slice(0 , 2) ,
    'Year' : expires_compact.str.slice(2)
})
data['is_fraud?'] = target

# 2021 , 2022 , 2023

year_2021 , year_2022 , year_2023 = (
    data[data['Year'] == label] for label in ('2021' , '2022' , '2023')
)

In [None]:
def percentage_calculation(df , cols):
    """Print, for each month, the share of rows whose ``is_fraud?`` equals 1.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``'Month'`` and ``'is_fraud?'``.
    cols : iterable
        Month values to report, compared against ``df['Month']``.

    Returns
    -------
    None
        The percentages (rounded to two decimals) are printed, one per month.
        A month with no rows in ``df`` is reported as ``n/a`` instead of
        raising ``ZeroDivisionError``.
    """
    is_fraud = df['is_fraud?'] == 1

    for month in cols:
        in_month = df['Month'] == month
        total = int(in_month.sum())

        if total == 0:
            # Nothing to divide by: report it explicitly rather than crash.
            print(f'{month} Percentage (1) : n/a (no rows)\n')
            continue

        fraud_rows = int((in_month & is_fraud).sum())
        percentage = round((fraud_rows / total) * 100 , 2)
        print(f'{month} Percentage (1) : {percentage}%\n')

# 년도 시각화

count = year_2021.groupby(['Month'])['is_fraud?'].count()

fig = px.histogram(
    x = year_2021['Month'].sort_values() ,
    color = year_2021['is_fraud?'] ,
    text_auto = True
)

fig.update_xaxes(title = 'Month')

fig.update_layout(
    title = '2021 expires count' ,
    title_font_size = 20
)

fig.show()

In [None]:
percentage_calculation(year_2021 , sorted(year_2021['Month'].unique()))

In [None]:
count = year_2022.groupby(['Month'])['is_fraud?'].count()

fig = px.histogram(
    x = year_2022['Month'].sort_values() ,
    color = year_2022['is_fraud?'] ,
    text_auto = True
)

fig.update_xaxes(title = 'Month')

fig.update_layout(
    title = '2022 expires count' ,
    title_font_size = 20
)

fig.show()

In [None]:
percentage_calculation(year_2022 , sorted(year_2022['Month'].unique()))

In [None]:
count = year_2023.groupby(['Month'])['is_fraud?'].count()

fig = px.histogram(
    x = year_2023['Month'].sort_values() ,
    color = year_2023['is_fraud?'] ,
    text_auto = True
)

fig.update_xaxes(title = 'Month')

fig.update_layout(
    title = '2023 expires count' ,
    title_font_size = 20
)

fig.show()

In [None]:
percentage_calculation(year_2023 , sorted(year_2023['Month'].unique()))

In [None]:
# Rebuild the Month / Year view of 'expires' and plot the number of rows per year.
expires_compact = train_df['expires'].str.replace('/' , '')

data = pd.DataFrame({
    'Month' : expires_compact.str.slice(0 , 2) ,
    'Year' : expires_compact.str.slice(2)
})
data['is_fraud?'] = target

# count() tallies the non-null labels in each year, i.e. the rows per year.
year_data = data.groupby('Year')['is_fraud?'].count().reset_index()

fig = px.line(year_data , x = 'Year' , y = 'is_fraud?')
fig.update_layout(title = 'Year is_fraud? count Line plot' , title_font_size = 20)
fig.show()

In [None]:
train_df['expires'] = train_df['expires'].str.replace('/' , '').str[2:]
train_df['expires'] = train_df['expires'].astype('int')

In [None]:
test_df['expires'] = test_df['expires'].str.replace('/' , '').str[2:]
test_df['expires'] = test_df['expires'].astype('int')

In [None]:
# 6) acct_open_date

train_df['acct_open_date'].head(10)

data = pd.DataFrame({
    'Month' : train_df['acct_open_date'].str[:2] ,
    'Year' : train_df['acct_open_date'].str[3:]
})

data['is_fraud?'] = target
data

In [None]:
data.groupby(['Year'])['is_fraud?'].count().sort_values(ascending = False)

year_2007 = data[data['Year'] == '2007']
year_2005 = data[data['Year'] == '2005']
year_2010 = data[data['Year'] == '2010']

In [None]:
fig = px.histogram(
    x = year_2005['Month'].sort_values() ,
    color = year_2005['is_fraud?'] ,
    text_auto = True
)

fig.update_xaxes(title = 'Month')
fig.update_layout(
    title = '2005 open date' ,
    title_font_size = 20
)

fig.show()

In [None]:
fig = px.histogram(
    x = year_2007['Month'].sort_values() ,
    color = year_2007['is_fraud?'] ,
    text_auto = True
)

fig.update_xaxes(title = 'Month')
fig.update_layout(
    title = '2007 open date' ,
    title_font_size = 20
)

fig.show()

In [None]:
fig = px.histogram(
    x = year_2010['Month'].sort_values() ,
    color = year_2010['is_fraud?'] ,
    text_auto = True
)

fig.update_xaxes(title = 'Month')
fig.update_layout(
    title = '2010 open date' ,
    title_font_size = 20
)

fig.show()

In [None]:
# Rebuild the Month / Year view of 'acct_open_date' and plot the number of rows per year.
open_compact = train_df['acct_open_date'].str.replace('/' , '')

data = pd.DataFrame({
    'Month' : open_compact.str.slice(0 , 2) ,
    'Year' : open_compact.str.slice(2)
})
data['is_fraud?'] = target

# count() tallies the non-null labels in each year, i.e. the rows per year.
year_data = data.groupby('Year')['is_fraud?'].count().reset_index()

fig = px.line(year_data , x = 'Year' , y = 'is_fraud?')
fig.update_layout(title = 'Year is_fraud? count Line plot' , title_font_size = 20)
fig.show()

In [None]:
train_df['acct_open_date'] = train_df['acct_open_date'].str.replace('/' , '').str[2:]
train_df['acct_open_date'] = train_df['acct_open_date'].astype('int')

In [None]:
test_df['acct_open_date'] = test_df['acct_open_date'].str.replace('/' , '').str[2:]
test_df['acct_open_date'] = test_df['acct_open_date'].astype('int')

In [None]:
train_df.info()

In [None]:
object_train_df = train_df.select_dtypes('object')

unique_arr = []
for col in object_train_df.columns:
    unique_arr.append(len(object_train_df[col].unique()))

data = pd.DataFrame({
    "cols" : object_train_df.columns ,
    "unique" : unique_arr
})

data = data.sort_values(by = ['unique'] , ascending = False)

data

In [None]:
object_train_df.head(10)

In [None]:
object_train_df['city_true'] = (object_train_df['merchant_city'] == object_train_df['city'])
train_df['city_true'] = (train_df['merchant_city'] == train_df['city'])
test_df['city_true'] = (test_df['merchant_city'] == test_df['city'])

In [None]:
object_train_df['state_true'] = (object_train_df['merchant_state'] == object_train_df['state'])
train_df['state_true'] = (train_df['merchant_state'] == train_df['state'])
test_df['state_true'] = (test_df['merchant_state'] == test_df['state'])

In [None]:
object_train_df.drop(['merchant_city' , 'merchant_state'] , axis = 1 , inplace = True)
train_df.drop(['merchant_city' , 'merchant_state'] , axis = 1 , inplace = True)
test_df.drop(['merchant_city' , 'merchant_state'] , axis = 1 , inplace = True)

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Fit each column's mapping once, on the train and test values together, and
# reuse it for every frame. Fitting separately per frame can give the same
# label different integer codes when the frames hold different category sets.
label_columns = ['errors?' , 'has_chip' , 'gender' , 'city_true' , 'state_true']

for column in label_columns:
    encoder.fit(pd.concat([train_df[column] , test_df[column]] , ignore_index = True))

    object_train_df[column] = encoder.transform(object_train_df[column])
    train_df[column] = encoder.transform(train_df[column])
    test_df[column] = encoder.transform(test_df[column])

In [None]:
# use_chip / card_type: one label mapping per column, fitted on the train and
# test values together so a category has the same code in every frame.
for column in ['use_chip' , 'card_type']:
    encoder.fit(pd.concat([train_df[column] , test_df[column]] , ignore_index = True))

    object_train_df[column] = encoder.transform(object_train_df[column])
    train_df[column] = encoder.transform(train_df[column])
    test_df[column] = encoder.transform(test_df[column])

# card_brand: one-hot encode, then drop the source column.
object_train_df = pd.concat(
    [object_train_df , pd.get_dummies(object_train_df['card_brand'])] , axis = 1
).drop(columns = ['card_brand'])

train_brand = pd.get_dummies(train_df['card_brand'])
# Give the test frame exactly the dummy columns of the train frame: a brand
# missing from test becomes an all-zero column, a brand unseen in train is dropped.
test_brand = pd.get_dummies(test_df['card_brand']).reindex(columns = train_brand.columns , fill_value = 0)

train_df = pd.concat([train_df , train_brand] , axis = 1).drop(columns = ['card_brand'])
test_df = pd.concat([test_df , test_brand] , axis = 1).drop(columns = ['card_brand'])

In [None]:
# Install category_encoders into the runtime; the next cell imports TargetEncoder from it.
!pip install category_encoders

In [None]:
from category_encoders.target_encoder import TargetEncoder

# Rebinds `encoder` (a LabelEncoder in the earlier cells) to a TargetEncoder with default settings.
encoder = TargetEncoder()

In [None]:
import re

# Remove every non-digit character from 'address' and store what is left as an integer.
_non_digit = re.compile(r'\D')

for frame in (object_train_df , train_df , test_df):
    digits_only = frame['address'].apply(lambda text : _non_digit.sub('' , text))
    frame['address'] = digits_only.astype(int)

In [None]:
object_train_df['city'] = encoder.fit_transform(object_train_df['city'] , target)
object_train_df['state'] = encoder.fit_transform(object_train_df['state'] , target)

# Target-encode city / state / address: fit on the training rows, then apply
# the fitted mapping to the test rows.
# NOTE(review): the encoder is fitted on every training row before the
# hold-out split made later in the notebook, so validation scores can look
# better than they should. Confirm this is acceptable.
for column in ['city' , 'state' , 'address']:
    # `cols` is named explicitly. With the default (cols = None) only string
    # columns are encoded, and 'address' is an integer by now, so it would
    # come back unchanged.
    column_encoder = TargetEncoder(cols = [column])

    train_df[column] = column_encoder.fit_transform(train_df[[column]] , target)[column].to_numpy()
    test_df[column] = column_encoder.transform(test_df[[column]])[column].to_numpy()

### 2. EDA

In [None]:
msno.bar(train_df)

train_df['zip'] = train_df['zip'].fillna(train_df['zip'].mean())

In [None]:
msno.bar(test_df)

test_df['zip'] = test_df['zip'].fillna(test_df['zip'].mean())

In [None]:
train_df.info()

In [None]:
# Correlate numeric / boolean columns only. DataFrame.corr() raises on
# non-numeric columns in recent pandas, where older versions skipped them.
numeric_view = train_df.select_dtypes(include = ['number' , 'bool'])

fig , ax = plt.subplots(figsize = (20 , 20))

sns.heatmap(numeric_view.corr() , cmap = 'YlGnBu' , annot = True , fmt = '.1f' , ax = ax);

In [None]:
# The same columns are removed from the train and the test frame.
dropped_columns = ['zip' , 'zipcode' , 'longitude' , 'birth_year' , 'per_capita_income_zipcode' , 'Mastercard']

for frame in (train_df , test_df):
    frame.drop(columns = dropped_columns , inplace = True)

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic minority rows are the same on every run.
smote = SMOTE(random_state = 42)

# NOTE(review): the resampled frame is what train_test_split receives later,
# so the hold-out split will contain synthetic rows. Confirm this is intended.
train_df , target = smote.fit_resample(train_df , target)

In [None]:
train_df.iloc[: , 0:10]

In [None]:
train_df.iloc[: , 10:20]

In [None]:
train_df.iloc[: , 20:32]

In [None]:
count_columns = ['expires' , 'acct_open_date' , 'year_pin_last_changed' , 'current_age' , 'retirement_age']
data = train_df[count_columns]

fig , ax = plt.subplots(ncols = 3 , nrows = 2 , figsize = (15 , 8))

# One count plot per column, filling the 2 x 3 grid row by row; with five
# columns the last panel stays empty.
for column , axis in zip(count_columns , ax.ravel()):
    fig = sns.countplot(data = data , x = column , ax = axis)
    axis.tick_params(labelrotation = 45)

In [None]:
# binning

def _bin_with_train_edges(column , q):
    """Quantile-bin ``column`` in train_df and apply the same edges to test_df.

    Parameters
    ----------
    column : str
        Column present in both ``train_df`` and ``test_df``.
    q : int
        Number of quantile bins; the bins are labelled ``0 .. q - 1``.

    The bin edges come from the training column only, so one label covers the
    same value range in both frames. Binning the test column with its own
    quantiles would not guarantee that.
    """
    labels = list(range(q))

    train_binned , edges = pd.qcut(train_df[column] , q = q , labels = labels , retbins = True)

    # Open the outer edges so test values outside the training range still
    # fall into the first / last bin instead of becoming NaN.
    edges[0] , edges[-1] = -np.inf , np.inf

    train_df[column] = train_binned
    test_df[column] = pd.cut(test_df[column] , bins = edges , labels = labels)

# 1) expires  2) acct_open_date  3) year_pin_last_changed  4) current_age  5) retirement_age
binning_plan = [
    ('expires' , 3) ,
    ('acct_open_date' , 3) ,
    ('year_pin_last_changed' , 3) ,
    ('current_age' , 5) ,
    ('retirement_age' , 3)
]

for column , q in binning_plan:
    _bin_with_train_edges(column , q)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
columns = ['amount' , 'mcc' , 'credit_limit' , 'latitude' , 'yearly_income_person' , 'total_debt' , 'fico_score']

# The identifier columns are removed from both frames before scaling.
train_df.drop(['user_id' , 'card_id' , 'merchant_id'] , axis = 1 , inplace = True)
test_df.drop(['user_id' , 'card_id' , 'merchant_id'] , axis = 1 , inplace = True)

train_df[columns] = scaler.fit_transform(train_df[columns])
# Reuse the statistics learned from the training data. Refitting on the test
# frame would put the two frames on different scales.
test_df[columns] = scaler.transform(test_df[columns])

In [None]:
# Re-wrap both objects with the DataFrame constructor.
train_df , test_df = pd.DataFrame(train_df) , pd.DataFrame(test_df)

### 3. Modeling

In [None]:
# 70 / 30 hold-out split with a fixed seed.
X_train , X_test , Y_train , Y_test = train_test_split(
    train_df ,
    target ,
    test_size = 0.3 ,
    random_state = 42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier , ExtraTreesClassifier , VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , cross_val_score , learning_curve , StratifiedKFold
from sklearn.metrics import accuracy_score , f1_score , precision_score , recall_score

In [None]:
kfold = StratifiedKFold(n_splits = 10)

In [None]:
data = pd.concat([X_train , Y_train] , axis = 1)

sampling_data = data.sample(n = 10000)

sampleY = sampling_data['is_fraud?']
sampleX = sampling_data.drop(['is_fraud?'] , axis = 1)

In [None]:
random_state = 42

# (display name , estimator) pairs, all scored with the shared `kfold` splitter.
candidates = [
    ('SVC' , SVC(random_state = random_state)) ,
    ('DecisionTree' , DecisionTreeClassifier(random_state = random_state)) ,
    ('AdaBoost' , AdaBoostClassifier(random_state = random_state)) ,
    ('RandomForest' , RandomForestClassifier(random_state = random_state)) ,
    ('ExtraTrees' , ExtraTreesClassifier(random_state = random_state)) ,
    ('GradientBoosting' , GradientBoostingClassifier(random_state = random_state)) ,
    ('MultipleLayerPerceptron' , MLPClassifier(random_state = random_state)) ,
    ('KNeighboors' , KNeighborsClassifier()) ,
    ('LogisticRegression' , LogisticRegression(random_state = random_state))
]
classifiers = [pair[1] for pair in candidates]

# Per-fold accuracy of every candidate on the sampled training data.
cv_results = [
    cross_val_score(classifier , sampleX , y = sampleY , scoring = 'accuracy' , cv = kfold , n_jobs = 4)
    for classifier in classifiers
]

cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

cv_res = pd.DataFrame({
    "CrossValMeans" : cv_means ,
    "CrossValerrors" : cv_std ,
    "Algorithm" : [pair[0] for pair in candidates]
})

# Horizontal bars of the mean accuracy with the fold standard deviation as error bars.
g = sns.barplot(x = 'CrossValMeans' , y = 'Algorithm' , data = cv_res , palette = 'Set3' , orient = 'h' , xerr = cv_std)
plt.xlabel('Mean Accuracy')
plt.title('Cross validation scores')

In [None]:
DTC = DecisionTreeClassifier()

adaDTC = AdaBoostClassifier(DTC , random_state = 7)

ada_param_grid = {
    'base_estimator__criterion' : ['gini' , 'entropy'] ,
    'base_estimator__splitter' : ['best' , 'random'] ,
    'algorithm' : ['SAMME' , 'SAMME.R'] ,
    'n_estimators' : [1 , 3] ,
    'learning_rate' : [0.0001 , 0.001 , 0.01 , 0.1 , 0.2 , 0.3 , 1.5]
}

gsadaDTC = GridSearchCV(adaDTC , param_grid = ada_param_grid , cv = kfold , scoring = 'accuracy' ,
                        n_jobs = 4 , verbose = 1)

gsadaDTC.fit(sampleX , sampleY)

ada_best = gsadaDTC.best_estimator_

In [None]:
gsadaDTC.best_score_

In [None]:
ExtC = ExtraTreesClassifier()

ex_param_grid = {
    'max_depth' : [None] ,
    'max_features' : [1 , 3 , 10] ,
    'min_samples_split' : [2 , 3 , 10] ,
    'min_samples_leaf' : [1 , 3 , 10] ,
    'bootstrap' : [False] ,
    'n_estimators' : [100 , 300] ,
    'criterion' : ['gini']
}

gsExtC = GridSearchCV(ExtC , param_grid = ex_param_grid , cv = kfold , scoring = 'accuracy' ,
                      n_jobs = 4 , verbose = 1)

gsExtC.fit(sampleX , sampleY)

ExtC_best = gsExtC.best_estimator_

In [None]:
gsExtC.best_score_

In [None]:
RFC = RandomForestClassifier()

rf_param_grid = {
    'max_depth' : [None] ,
    'max_features' : [1 , 3 , 10] ,
    'min_samples_split' : [2 , 3 , 10] ,
    'min_samples_leaf' : [1 , 3 , 10] ,
    'bootstrap' : [False] ,
    'n_estimators' : [100 , 300] ,
    'criterion' : ['gini']
}

gsRFC = GridSearchCV(RFC , param_grid = rf_param_grid , cv = kfold , scoring = 'accuracy' ,
                     n_jobs = 4 , verbose = 1)

gsRFC.fit(sampleX , sampleY)

RFC_best = gsRFC.best_estimator_

In [None]:
gsRFC.best_score_

In [None]:
GBC = GradientBoostingClassifier()

gb_param_grid = {
    'loss' : ['deviance'] ,
    'n_estimators' : [100 , 200 , 300] ,
    'learning_rate' : [0.1 , 0.05 , 0.01] ,
    'max_depth' : [4 , 8] ,
    'min_samples_leaf' : [100 , 150] ,
    'max_features' : [0.3 , 0.1]
}

gsGBC = GridSearchCV(GBC , param_grid = gb_param_grid , cv = kfold , scoring = 'accuracy' ,
                     n_jobs = 4 , verbose = 1)

gsGBC.fit(sampleX , sampleY)

GBC_best = gsGBC.best_estimator_

In [None]:
gsGBC.best_score_

In [None]:
def plot_learning_curve(estimator , title , X , y , ylim = None , cv = None ,
                        n_jobs = -1 , train_sizes = np.linspace(.1 , 1.0 , 5)):
    """Draw training and cross-validation score curves for ``estimator``.

    Parameters
    ----------
    estimator : estimator passed to ``learning_curve``.
    title : str
        Figure title.
    X , y : data and labels passed to ``learning_curve``.
    ylim : tuple, optional
        When given, unpacked into ``plt.ylim``.
    cv , n_jobs , train_sizes :
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)

    if ylim is not None:
        plt.ylim(*ylim)

    plt.xlabel('Training examples')
    plt.ylabel('Score')

    train_sizes , train_scores , test_scores = learning_curve(
        estimator , X , y , cv = cv , n_jobs = n_jobs , train_sizes = train_sizes
    )

    # (mean , std over the folds , colour , legend label) for each curve.
    curves = [
        (np.mean(scores , axis = 1) , np.std(scores , axis = 1) , colour , label)
        for scores , colour , label in (
            (train_scores , 'r' , 'Training score') ,
            (test_scores , 'g' , 'Cross-validation score')
        )
    ]

    plt.grid()

    # Shaded band of one standard deviation around each mean curve.
    for mean , std , colour , label in curves:
        plt.fill_between(train_sizes , mean - std , mean + std , alpha = 0.1 , color = colour)

    # The mean curves are drawn after both bands.
    for mean , std , colour , label in curves:
        plt.plot(train_sizes , mean , 'o-' , color = colour , label = label)

    plt.legend(loc = 'best')

    return plt

# Learning curves of the four tuned models on the sampled training data.
g = plot_learning_curve(gsRFC.best_estimator_ , 'RF learning curves' , sampleX , sampleY , cv = kfold)
g = plot_learning_curve(gsExtC.best_estimator_ , 'ExtraTrees learning curves' , sampleX , sampleY , cv = kfold)
g = plot_learning_curve(gsadaDTC.best_estimator_ , 'AdaBoost learning curves' , sampleX , sampleY , cv = kfold)
g = plot_learning_curve(gsGBC.best_estimator_ , 'GradientBoosting learning curves' , sampleX , sampleY , cv = kfold)

In [None]:
# Soft-voting ensemble of the four tuned models, fitted on the sampled training data.
ensemble_members = [
    ('rfc' , RFC_best) ,
    ('extc' , ExtC_best) ,
    ('adac' , ada_best) ,
    ('gbc' , GBC_best)
]

votingC = VotingClassifier(estimators = ensemble_members , voting = 'soft' , n_jobs = 4)
votingC = votingC.fit(sampleX , sampleY)

In [None]:
prediction = votingC.predict(X_test)

In [None]:
print(f"r-squared score : {votingC.score(X_train , Y_train)}")
print(f"r-squared score (test) : {votingC.score(X_test , Y_test)}")

In [None]:
def metrics(y_true , pred):
    """Print accuracy, F1, precision and recall of ``pred`` against ``y_true``.

    Each score is printed on its own line, with a blank line between
    consecutive scores. Nothing is returned.
    """
    report = (
        ('Accuracy' , accuracy_score) ,
        ('f1' , f1_score) ,
        ('precision' , precision_score) ,
        ('recall' , recall_score)
    )
    last = len(report) - 1

    for position , (label , scorer) in enumerate(report):
        # Every line except the final one carries an extra trailing newline.
        spacer = '' if position == last else '\n'
        print(f'{label} : {scorer(y_true , pred)}{spacer}')

In [None]:
# Print accuracy / f1 / precision / recall of the ensemble's predictions on the hold-out split.
metrics(Y_test , prediction)

### Test

In [None]:
test_df.head(10)

In [None]:
prediction = votingC.predict(test_df)

In [None]:
submission = pd.read_csv(path + "sample_submit.csv" , header = None)

In [None]:
submission[1] = prediction

In [None]:
submission.to_csv(path + 'submission.csv' , header = None , index = False)

In [None]:
submission