# DATA PREPARATION

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv('train.csv')

In [6]:
df.shape

(1372, 5)

In [7]:
df.head()

Unnamed: 0,variance,skewness,curtosis,entropy,target
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [8]:
df.tail()

Unnamed: 0,variance,skewness,curtosis,entropy,target
1367,0.40614,1.3492,-1.4501,-0.55949,1
1368,-1.3887,-4.8773,6.4774,0.34179,1
1369,-3.7503,-13.4586,17.5932,-2.7771,1
1370,-3.5637,-8.3827,12.393,-1.2823,1
1371,-2.5419,-0.65804,2.6842,1.1952,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   variance  1372 non-null   float64
 1   skewness  1372 non-null   float64
 2   curtosis  1372 non-null   float64
 3   entropy   1372 non-null   float64
 4   target    1372 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [10]:
df.isnull().sum()

variance    0
skewness    0
curtosis    0
entropy     0
target      0
dtype: int64

In [12]:
x = df.drop(columns = ['target'])
y = df.target

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=36)

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_new = scaler.fit_transform(X_train)

# EDA 

In [22]:
df.describe()

Unnamed: 0,variance,skewness,curtosis,entropy,target
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,0.433735,1.922353,1.397627,-1.191657,0.444606
std,2.842763,5.869047,4.31003,2.101013,0.497103
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.773,-1.7082,-1.574975,-2.41345,0.0
50%,0.49618,2.31965,0.61663,-0.58665,0.0
75%,2.821475,6.814625,3.17925,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [23]:
df.duplicated().sum()

24

In [24]:
df.nunique()

variance    1338
skewness    1256
curtosis    1270
entropy     1156
target         2
dtype: int64

In [25]:
df.target.value_counts()

0    762
1    610
Name: target, dtype: int64

In [27]:
df.corr(method = 'pearson')

Unnamed: 0,variance,skewness,curtosis,entropy,target
variance,1.0,0.264026,-0.38085,0.276817,-0.724843
skewness,0.264026,1.0,-0.786895,-0.526321,-0.444688
curtosis,-0.38085,-0.786895,1.0,0.318841,0.155883
entropy,0.276817,-0.526321,0.318841,1.0,-0.023424
target,-0.724843,-0.444688,0.155883,-0.023424,1.0


# DATA MODELLING

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [60]:
models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(),AdaBoostClassifier(n_estimators=300000,learning_rate=100000),GradientBoostingClassifier(n_estimators=10000,max_depth=3,learning_rate=9000,random_state=36)]

In [54]:
for model in models:
    model.fit(X_train_new,y_train)
    print(f'Accuracy score of {model} is : {model.score(X_test,y_test)}')   

  sample_weight *= np.exp(
  model.fit(X_train_new,y_train)


Accuracy score of LogisticRegression() is : 0.8188405797101449
Accuracy score of DecisionTreeClassifier() is : 0.7971014492753623
Accuracy score of RandomForestClassifier() is : 0.7971014492753623
Accuracy score of AdaBoostClassifier(learning_rate=100000, n_estimators=300000) is : 0.8188405797101449
Accuracy score of GradientBoostingClassifier(learning_rate=9000, n_estimators=10000,
                           random_state=36) is : 0.8333333333333334




In [77]:
# HyperParameter Tuning
from sklearn.model_selection import GridSearchCV

param = {'n_estimators':range(10,100,10)}

classifier = GridSearchCV(GradientBoostingClassifier(),param,cv=5)

In [78]:
classifier.fit(x,y)

In [79]:
classifier.best_score_

0.9948958195089581

In [80]:
classifier.best_params_

{'n_estimators': 90}

In [81]:
predictor = GradientBoostingClassifier(n_estimators = 90)

In [84]:
predictor.fit(X_train,y_train)

In [85]:
predictor.score(X_test,y_test)

0.9927536231884058

In [86]:
import pickle 

with open('BanknoteAuthentication','wb') as f:
    pickle.dump(predictor,f)