In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree #https://scikit-learn.org/stable/modules/tree.html
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis #https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html
import math

In [2]:
file_raw_data = "citrus.csv"

raw_data = pd.read_csv(file_raw_data)

raw_data

Unnamed: 0,name,diameter,weight,red,green,blue
0,orange,2.96,86.76,172,85,2
1,orange,3.91,88.05,166,78,3
2,orange,4.42,95.17,156,81,2
3,orange,4.47,95.60,163,81,4
4,orange,4.48,95.76,161,72,9
...,...,...,...,...,...,...
9995,grapefruit,15.35,253.89,149,77,20
9996,grapefruit,15.41,254.67,148,68,7
9997,grapefruit,15.59,256.50,168,82,20
9998,grapefruit,15.92,260.14,142,72,11


In [3]:
x = raw_data.iloc[:,1:5] #Gives us all our X. Column 1 is the W in f(x) = w1x1 + w2x2 + wnxn

x

Unnamed: 0,diameter,weight,red,green
0,2.96,86.76,172,85
1,3.91,88.05,166,78
2,4.42,95.17,156,81
3,4.47,95.60,163,81
4,4.48,95.76,161,72
...,...,...,...,...
9995,15.35,253.89,149,77
9996,15.41,254.67,148,68
9997,15.59,256.50,168,82
9998,15.92,260.14,142,72


In [4]:
num_features = len(x.columns) #To check how many columns (attributes) we have, in this case diameter, weight, red, green, and blue.

print("Number of Features: {}".format(num_features))

Number of Features: 4


In [5]:
x_mean = x.mean()
x_std = x.std()
x_standardized = (x - x_mean)/x_std

x_standardized

Unnamed: 0,diameter,weight,red,green
0,-3.601770,-3.022403,1.739891,0.767771
1,-3.114051,-2.978243,1.164790,0.169912
2,-2.852223,-2.734509,0.206289,0.426137
3,-2.826554,-2.719789,0.877240,0.426137
4,-2.821420,-2.714312,0.685539,-0.342539
...,...,...,...,...
9995,2.759110,2.698853,-0.464662,0.084503
9996,2.789913,2.725554,-0.560512,-0.684174
9997,2.882323,2.788199,1.356490,0.511546
9998,3.051741,2.912805,-1.135613,-0.342539


In [6]:
# Scaling things from 0 - 1
x_normalized = (x - x.min()) / (x.max() - x.min())

x_normalized

Unnamed: 0,diameter,weight,red,green
0,0.000000,0.000000,0.740260,0.635294
1,0.070423,0.007382,0.662338,0.552941
2,0.108228,0.048126,0.532468,0.588235
3,0.111935,0.050587,0.623377,0.588235
4,0.112676,0.051502,0.597403,0.482353
...,...,...,...,...
9995,0.918458,0.956395,0.441558,0.541176
9996,0.922906,0.960858,0.428571,0.435294
9997,0.936249,0.971330,0.688312,0.600000
9998,0.960712,0.992160,0.350649,0.482353


In [7]:
y = raw_data['name'].replace(['orange'], 0).replace(['grapefruit'], 1) 
#y is our answer/classification, so it's not part of how many dimensions we have in this table

# Can use column name (string) to call column. Source: https://dataindependent.com/pandas/keyerror-pandas-how-to-fix/

# y = y.values

y

0       0
1       0
2       0
3       0
4       0
       ..
9995    1
9996    1
9997    1
9998    1
9999    1
Name: name, Length: 10000, dtype: int64

In [8]:
num_orange = len(raw_data[raw_data.iloc[:,0] == 'orange'])
num_grapefruit = len(raw_data[raw_data.iloc[:,0] == 'grapefruit'])

print("num_orange: {}".format(num_orange))
print("num_grapefruit: {}".format(num_grapefruit))

num_orange: 5000
num_grapefruit: 5000


In [9]:
df = x_normalized.copy()
df['y'] = y

df

Unnamed: 0,diameter,weight,red,green,y
0,0.000000,0.000000,0.740260,0.635294,0
1,0.070423,0.007382,0.662338,0.552941,0
2,0.108228,0.048126,0.532468,0.588235,0
3,0.111935,0.050587,0.623377,0.588235,0
4,0.112676,0.051502,0.597403,0.482353,0
...,...,...,...,...,...
9995,0.918458,0.956395,0.441558,0.541176,1
9996,0.922906,0.960858,0.428571,0.435294,1
9997,0.936249,0.971330,0.688312,0.600000,1
9998,0.960712,0.992160,0.350649,0.482353,1


In [10]:
def partition_dataset(df, num_a=20, num_b=20, val_a=1, val_b=0):
    df_a = df[df.iloc[:,-1] == val_a].sample(num_a)
    df_b = df[df.iloc[:,-1] == val_b].sample(num_b)
    
    df.drop(df_a.index, inplace=True)
    df.drop(df_b.index, inplace=True)
    
    frames = [df_a, df_b]
    df_validation = pd.concat(frames)
    
    return df, df_validation

training, validation = partition_dataset(df, num_a=50, num_b=50)

In [11]:
training

Unnamed: 0,diameter,weight,red,green,y
0,0.000000,0.000000,0.740260,0.635294,0
1,0.070423,0.007382,0.662338,0.552941,0
2,0.108228,0.048126,0.532468,0.588235,0
3,0.111935,0.050587,0.623377,0.588235,0
4,0.112676,0.051502,0.597403,0.482353,0
...,...,...,...,...,...
9995,0.918458,0.956395,0.441558,0.541176,1
9996,0.922906,0.960858,0.428571,0.435294,1
9997,0.936249,0.971330,0.688312,0.600000,1
9998,0.960712,0.992160,0.350649,0.482353,1


In [12]:
validation

Unnamed: 0,diameter,weight,red,green,y
9873,0.808006,0.847554,0.389610,0.400000,1
5946,0.551520,0.534478,0.675325,0.270588,1
8160,0.661231,0.670787,0.272727,0.447059,1
9541,0.753892,0.778598,0.584416,0.400000,1
8791,0.696071,0.708898,0.363636,0.482353,1
...,...,...,...,...,...
4617,0.542624,0.530987,0.623377,0.705882,0
3019,0.433655,0.405036,0.584416,0.552941,0
4718,0.558191,0.546838,0.480519,0.494118,0
1806,0.377317,0.340143,0.727273,0.505882,0


In [13]:
#NOTE: KNN means K (Parameter) Nearest Neighbors
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

# from sklearn.neighbors import KNeighborsClassifier

x_training = training.iloc[:,:-1].values #.values converts the x values into an array (no more columns and rows)
x_validation = validation.iloc[:,:-1].values

y_training = training['y'].values
y_validation = validation['y'].values

model = KNeighborsClassifier(n_neighbors = 10)

model.fit(x_training, y_training)

predictions = model.predict(x_validation)

predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [14]:
y_validation

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [15]:
# KNN

cm = confusion_matrix(y_validation, predictions)
tp = cm[0][0]
tn = cm[1][1]
fp = cm[0][1] # Predicted something as a 1 but the answer is 0.
fn = cm[1][0] # Predicted something as a 0 (negative) but the answer is 1 (postive).

print("True Positive {}".format(tp))
print("True Negative {}".format(tn))
print("False Positive {}".format(fp))
print("False Negative {}".format(fn))

f1 = tp / (tp + (0.5 * (fp + fn)))
mcc = ((tn * tp) - (fn * fp)) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

print("F1: {}".format(f1)) # percentage from 0-1 (only considers the true positive)
print("MCC: {}".format(mcc)) # + score = model can work well even with bias

True Positive 46
True Negative 45
False Positive 4
False Negative 5
F1: 0.9108910891089109
MCC: 0.8201640492164058


In [16]:
#Naive Bayes A = y, B = x
# from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(x_training, y_training)

predictions = model.predict(x_validation)

predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [17]:
# GAUSSIAN

cm = confusion_matrix(y_validation, predictions)
tp = cm[0][0]
tn = cm[1][1]
fp = cm[0][1] # Predicted something as a 1 but the answer is 0.
fn = cm[1][0] # Predicted something as a 0 (negative) but the answer is 1 (postive).

print("True Positive {}".format(tp))
print("True Negative {}".format(tn))
print("False Positive {}".format(fp))
print("False Negative {}".format(fn))

f1 = tp / (tp + (0.5 * (fp + fn)))
mcc = ((tn * tp) - (fn * fp)) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

print("F1: {}".format(f1)) # percentage from 0-1 (only considers the true positive)
print("MCC: {}".format(mcc)) # + score = model can work well even with bias

True Positive 47
True Negative 45
False Positive 3
False Negative 5
F1: 0.9215686274509803
MCC: 0.8406728074767074


In [18]:
# from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(x_training, y_training)

predictions = model.predict(x_validation)

predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [19]:
# LOGISTIC REGRESSION

cm = confusion_matrix(y_validation, predictions)
tp = cm[0][0]
tn = cm[1][1]
fp = cm[0][1] # Predicted something as a 1 but the answer is 0.
fn = cm[1][0] # Predicted something as a 0 (negative) but the answer is 1 (postive).

print("True Positive {}".format(tp))
print("True Negative {}".format(tn))
print("False Positive {}".format(fp))
print("False Negative {}".format(fn))

f1 = tp / (tp + (0.5 * (fp + fn)))
mcc = ((tn * tp) - (fn * fp)) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

print("F1: {}".format(f1)) # percentage from 0-1 (only considers the true positive)
print("MCC: {}".format(mcc)) # + score = model can work well even with bias

True Positive 47
True Negative 44
False Positive 3
False Negative 6
F1: 0.912621359223301
MCC: 0.8214799971933825


In [20]:
#from sklearn import tree

model = tree.DecisionTreeClassifier()

model.fit(x_training, y_training)

predictions = model.predict(x_validation)

predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [21]:
# DECISION TREE

cm = confusion_matrix(y_validation, predictions)
tp = cm[0][0]
tn = cm[1][1]
fp = cm[0][1] # Predicted something as a 1 but the answer is 0.
fn = cm[1][0] # Predicted something as a 0 (negative) but the answer is 1 (postive).

print("True Positive {}".format(tp))
print("True Negative {}".format(tn))
print("False Positive {}".format(fp))
print("False Negative {}".format(fn))

f1 = tp / (tp + (0.5 * (fp + fn)))
mcc = ((tn * tp) - (fn * fp)) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

print("F1: {}".format(f1)) # percentage from 0-1 (only considers the true positive)
print("MCC: {}".format(mcc)) # + score = model can work well even with bias

True Positive 48
True Negative 48
False Positive 2
False Negative 2
F1: 0.96
MCC: 0.92


In [22]:
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()

model.fit(x_training, y_training)

predictions = model.predict(x_validation)

predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [23]:
#QUADRATIC DISCRIMINANT ANALYSIS

cm = confusion_matrix(y_validation, predictions)
tp = cm[0][0]
tn = cm[1][1]
fp = cm[0][1] # Predicted something as a 1 but the answer is 0.
fn = cm[1][0] # Predicted something as a 0 (negative) but the answer is 1 (postive).

print("True Positive {}".format(tp))
print("True Negative {}".format(tn))
print("False Positive {}".format(fp))
print("False Negative {}".format(fn))

f1 = tp / (tp + (0.5 * (fp + fn)))
mcc = ((tn * tp) - (fn * fp)) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

print("F1: {}".format(f1)) # percentage from 0-1 (only considers the true positive)
print("MCC: {}".format(mcc)) # + score = model can work well even with bias

True Positive 47
True Negative 50
False Positive 3
False Negative 0
F1: 0.9690721649484536
MCC: 0.9416965821485117
