# Importing libraries and setup

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import random 
from sklearn.model_selection import train_test_split

random.seed(1234)
data = pd.read_csv("card.csv",skiprows=[1],index_col=[0])

## Data Exploration

In [2]:
data = pd.read_csv("card.csv",skiprows=[1],index_col=[0])

In [None]:
#First 5 rows
data.head()

In [None]:
#Last 5 rows
data.tail()

In [6]:
data.shape

(30000, 24)

Data has 30000 rows and 24 cols 

In [None]:
#Summary statistics
data.describe()

In [4]:
#Check if any data is missing
data.isnull().any().sum()

0

There is no missing values

## Data pre-processing

In [3]:
#Creating X & Y dataframes
# Train test split

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1234)


In [4]:
#Scale the data 

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Feature selection 

In [None]:
!pip install mlxtend
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 1234)
classifier.fit(X_train, y_train)

In [7]:
#forward selection
sfs1 = sfs(classifier, 
           k_features='best', 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   33.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  23 out of  23 | elapsed: 13.5min finished

[2022-11-07 18:37:03] Features: 1/23 -- score: 0.8210222222222222[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 67.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed: 76.3min finished

[2022-11-07 19:53:20] Features: 2/23 -- score: 0.8220444444444445[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   27.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:  9.0min finished

[2022-11-07 20:02:22] Features: 3/23 -- score: 0.8227111111111111[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [8]:
sfs1.k_feature_idx_

(0, 1, 2, 3, 4, 5, 6, 7, 9, 10)

Column 0,1,2,3,4,5,6,7,9,10 are the best features

In [9]:
sfs1.k_score_

0.8280888888888889

In [15]:
#Filter the data that contain the best features
new_data = data.loc[:,['X1','X2','X3','X4','X5','X6','X7','X9','X10','Y']]

In [22]:
X_new = new_data.iloc[:, :-1].values
y_new = new_data.iloc[:, -1].values

X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, y_new, test_size = 0.25, random_state = 1234)

In [None]:
#Scale the data 

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_new_train = sc.fit_transform(X_new_train)
X_new_test = sc.transform(X_new_test)

## Model selection & Evaluation

### Support vector machine

In [24]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 1234)
classifier.fit(X_new_train, y_new_train)

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_new = classifier.predict(X_new_test)
cm_new = confusion_matrix(y_new_test, y_pred_new)
print(cm_new)
accuracy_score(y_new_test, y_pred_new)

[[5566  234]
 [1133  567]]


0.8177333333333333

81.773%

In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 1234)
classifier.fit(X_new_train, y_new_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred_new = classifier.predict(X_new_test)
cm_new = confusion_matrix(y_new_test, y_pred_new)
print(cm_new)
accuracy_score(y_new_test, y_pred_new)

### Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 1234)
classifier.fit(X_new_train, y_new_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_new_test)
cm = confusion_matrix(y_new_test, y_pred)
print(cm)
accuracy_score(y_new_test, y_pred)

### Decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 1234)
classifier.fit(X_new_train, y_new_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_new_test)
cm = confusion_matrix(y_new_test, y_pred)
print(cm)
accuracy_score(y_new_test, y_pred)

### K nearest neigbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_new_train, y_new_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_new_test)
cm = confusion_matrix(y_new_test, y_pred)
print(cm)
accuracy_score(y_new_test, y_pred)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_new_train, y_bew_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_new_test)
cm = confusion_matrix(y_new_test, y_pred)
print(cm)
accuracy_score(y_new_test, y_pred)

## Extra code for ROC, PCA

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc,RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score


In [None]:
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=classifier.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
gmeans = np.sqrt(tpr * (1-fpr))

In [None]:
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 10)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
