In [28]:
import pandas as pd
import numpy as np

data = {
    "Refund":["Yes","Yes","No","No","Yes","No","No","Yes","No"],
    "Status":["Single","Single","Single","Single","Single","Married","Married","Married","Single"],
    "Tax Income":[125,80,75,65,60,120,80,90,95],
    "Cheat":["No","Yes","No","Yes","No","Yes","No","Yes","Yes"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Refund,Status,Tax Income,Cheat
0,Yes,Single,125,No
1,Yes,Single,80,Yes
2,No,Single,75,No
3,No,Single,65,Yes
4,Yes,Single,60,No
5,No,Married,120,Yes
6,No,Married,80,No
7,Yes,Married,90,Yes
8,No,Single,95,Yes


In [29]:
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]
print(X.head(1))
print(Y.head(1))

  Refund  Status  Tax Income
0    Yes  Single         125
0    No
Name: Cheat, dtype: object


## vectorization

In [30]:
X.Refund = X.Refund.map({"Yes":1, "No":2})
X.Status = X.Status.map({"Single":1, "Married":2})

In [31]:
X.head(1)

Unnamed: 0,Refund,Status,Tax Income
0,1,1,125


In [27]:
Y = pd.get_dummies(Y)
Y.head(2)

Unnamed: 0,No,Yes
0,1,0
1,0,1


In [6]:
from sklearn.model_selection import train_test_split
#split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1, stratify=Y)




from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 3)
# Fit the classifier to the data
knn.fit(X_train,y_train)
Y_predict = knn.predict(X_test)


predict = pd.DataFrame(data=Y_predict, columns=["No","Yes"])
y_test = pd.DataFrame(data=y_test, columns=["No","Yes"])

def abc(a, b):
    if a == 0:
        return 2
    else:
        return 1
    
predict['p'] = predict[["No","Yes"]].apply(lambda x: abc(*x), axis=1)
y_test['p']=y_test[["No","Yes"]].apply(lambda x: abc(*x), axis=1)

In [7]:
predict

Unnamed: 0,No,Yes,p
0,1,0,1
1,0,1,2
2,0,1,2


In [8]:
y_test

Unnamed: 0,No,Yes,p
2,1,0,1
1,0,1,2
7,0,1,2


In [9]:
from sklearn.metrics import f1_score

In [10]:
f1_score(y_test.p.values, predict.p.values, average='micro')

1.0

# K-NN with 3-Fold Cross Validation

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=3)
#train model with cv of 5 
cv_scores = cross_val_score(knn_cv, X, Y, cv=3)
cv_scores1 = cross_val_score(knn_cv, X, Y, cv=3,scoring='f1_macro')
#print each cv score (accuracy) and average them
print("Accuracy")
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
print("F1-Score")
print(cv_scores1)
print('cv_scores mean:{}'.format(np.mean(cv_scores1)))

Accuracy
[0.66666667 0.66666667 0.66666667]
cv_scores mean:0.6666666666666666
F1-Score
[0.66666667 0.66666667 0.4       ]
cv_scores mean:0.5777777777777778


  'precision', 'predicted', average, warn_for)


# Naive Bayes Classification using Scikit-learn

In [32]:
from sklearn import preprocessing
#creating labelEncoder
le = preprocessing.LabelEncoder()
label = le.fit_transform(Y)
label

array([0, 1, 0, 1, 0, 1, 0, 1, 1])

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
#create a new KNN model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
gnb = GaussianNB(X, label)

cv_scores = cross_val_score(knn_cv, X, label, cv=3)
cv_scores1 = cross_val_score(knn_cv, X, label, cv=3,scoring='f1_macro')
#print each cv score (accuracy) and average them
print("Accuracy")
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
print("F1-Score")
print(cv_scores1)
print('cv_scores mean:{}'.format(np.mean(cv_scores1)))


Accuracy
[0.5        0.33333333 0.5       ]
cv_scores mean:0.4444444444444444
F1-Score
[0.5        0.25       0.33333333]
cv_scores mean:0.3611111111111111


  'precision', 'predicted', average, warn_for)


# CART

In [34]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

cv_scores = cross_val_score(clf, X, label, cv=3)
cv_scores1 = cross_val_score(clf, X, label, cv=3,scoring='f1_macro')
#print each cv score (accuracy) and average them
print("Accuracy")
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
print("F1-Score")
print(cv_scores1)
print('F1_scores mean:{}'.format(np.mean(cv_scores1)))


Accuracy
[0.25 0.   0.5 ]
cv_scores mean:0.25
F1-Score
[0.2        0.         0.33333333]
F1_scores mean:0.17777777777777778


  'precision', 'predicted', average, warn_for)
