In [1]:
import pandas as pd

In [2]:
# Location of the tab-separated fruit dataset. This is an absolute, machine-specific
# path, so it is the one value to edit when running the notebook elsewhere.
DATA_PATH = "c:/Users/alex/ml/fruit_data_with_colors.txt"

fruit = pd.read_table(DATA_PATH)
fruit.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Pick the features and the target by column name instead of by position, so the
# selection stays correct even if the column order of the input file changes.
FEATURE_COLUMNS = ["mass", "width", "height", "color_score"]
X, y = fruit[FEATURE_COLUMNS], fruit["fruit_name"]

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4321)

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
# Learn the scaling parameters from the training split only; fit() returns the
# scaler itself, so construction and fitting can be chained.
sc = StandardScaler().fit(X_train)

# Apply the training-derived scaling to both splits (the names are reused on
# purpose: every later cell works on the scaled arrays).
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [8]:
# Train and score a k-nearest-neighbours classifier for every k from 1 to 8.
for i in range(1, 9):
    print(i, ":")

    # fit() returns the estimator, so the classifier can be built and trained in one step.
    cf = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    y_pred = cf.predict(X_test)

    # Report the confusion matrix and the accuracy (as a percentage) on the test split.
    print(metrics.confusion_matrix(y_test, y_pred))
    print("kNN %):", 100 * metrics.accuracy_score(y_test, y_pred))

1 :
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
kNN %): 100.0
2 :
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
kNN %): 100.0
3 :
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
kNN %): 100.0
4 :
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [0 0 0 6]]
kNN %): 100.0
5 :
[[4 0 0 3]
 [0 2 0 0]
 [0 3 0 0]
 [0 0 0 6]]
kNN %): 66.66666666666666
6 :
[[4 0 0 3]
 [0 2 0 0]
 [0 3 0 0]
 [0 1 0 5]]
kNN %): 61.111111111111114
7 :
[[2 0 0 5]
 [0 2 0 0]
 [0 2 0 1]
 [0 0 0 6]]
kNN %): 55.55555555555556
8 :
[[2 0 0 5]
 [0 2 0 0]
 [0 2 0 1]
 [0 2 0 4]]
kNN %): 44.44444444444444


In [9]:
from sklearn.naive_bayes import GaussianNB

In [10]:
# Reference model: scikit-learn's Gaussian naive Bayes on the same train/test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Print the predicted labels too: the recorded output of this cell shows them, and it
# lets them be compared element by element with the hand-written implementation below.
print(y_pred)
print(metrics.confusion_matrix(y_test, y_pred))
print("GNB %):", metrics.accuracy_score(y_test, y_pred)*100)

['apple' 'orange' 'apple' 'apple' 'apple' 'mandarin' 'apple' 'lemon'
 'orange' 'orange' 'apple' 'orange' 'apple' 'lemon' 'mandarin' 'apple'
 'apple' 'mandarin']
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [2 0 0 4]]
GNB %): 88.88888888888889


In [17]:
import numpy as np
import math
import sys

In [20]:
def separate_by_class(X, y):
    """Group the rows of ``X`` by their label in ``y``.

    Args:
        X: Sequence of samples, indexable by integer position (``X[0]``, ``X[1]``, ...).
        y: Iterable of labels; the label at position ``k`` belongs to ``X[k]``.

    Returns:
        dict mapping each distinct label to the list of its rows, in the order
        the rows appear in ``X``.
    """
    grouped = {}
    for position, label in enumerate(y):
        sample = X[position]
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(label, []).append(sample)
    return grouped

def individual_distribution(X):
    """Summarise every column of ``X``.

    Args:
        X: Iterable of rows (each row an iterable of numeric feature values).

    Returns:
        list with one ``(mean, standard deviation, number of values)`` tuple per
        column. The standard deviation is ``np.std`` with its default settings.
    """
    summaries = []
    # zip(*X) transposes the rows so that each item is one feature column.
    for column in zip(*X):
        summaries.append((np.mean(column), np.std(column), len(column)))
    return summaries

def class_distributions(X, y):
    """Compute the per-feature statistics of every class.

    Args:
        X: Training samples, passed on to ``separate_by_class``.
        y: Labels aligned with ``X``.

    Returns:
        dict mapping each label to the list produced by
        ``individual_distribution`` for the rows carrying that label.
    """
    return {
        label: individual_distribution(samples)
        for label, samples in separate_by_class(X, y).items()
    }

def calc_probability(x, mean, stdev, min_stdev=1e-9):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the Gaussian.
        stdev: Standard deviation of the Gaussian.
        min_stdev: Lower bound applied to ``stdev``. A feature that is constant
            within a class has a standard deviation of 0, which would otherwise
            cause a division by zero (or NaN with numpy floats).

    Returns:
        The density value. It is never 0: if the density underflows to 0 the
        smallest positive normalised float is returned instead, so the caller
        can safely take its logarithm.
    """
    # Floor the spread so that degenerate (zero-variance) features stay usable.
    stdev = max(stdev, min_stdev)
    exponent = math.exp(-((x-mean)**2/(2*stdev**2)))
    probability = (1/(math.sqrt(2*math.pi)*stdev))*exponent
    if probability == 0:
        # Avoid log(0) downstream.
        probability = sys.float_info.min
    return probability

def class_probability(distributions, row):
    """Compute an unnormalised log-probability of ``row`` for every class.

    Args:
        distributions: dict mapping each label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Feature values of the sample, in the same order as the tuples.

    Returns:
        dict mapping each label to the log of the class prior plus the sum of
        the log densities of the individual features.
    """
    # The sample count stored with the first feature is used as the class size.
    total_rows = sum([distributions[label][0][2] for label in distributions])
    log_scores = {}
    for label, feature_stats in distributions.items():
        # Start from the log prior: class size divided by the total number of rows.
        log_scores[label] = math.log(feature_stats[0][2]/float(total_rows))
        for position, (mean, stdev, _) in enumerate(feature_stats):
            log_scores[label] += math.log(calc_probability(row[position], mean, stdev))
    return log_scores

def predict(distributions, row):
    """Return the label whose score from ``class_probability`` is highest.

    Args:
        distributions: Per-class feature statistics, passed to ``class_probability``.
        row: Feature values of the sample to classify.

    Returns:
        The best-scoring label, or ``None`` when there are no classes. On a tie
        the label encountered first is kept, because only a strictly larger
        score replaces the current winner.
    """
    scores = class_probability(distributions, row)
    winner = None
    winner_score = -math.inf
    for label in scores:
        score = scores[label]
        # Keep the current winner unless none is set yet or this score is strictly better.
        if winner is not None and not (winner_score < score):
            continue
        winner, winner_score = label, score
    return winner

def get_predictions(distributions, X_test):
    """Classify every row of ``X_test``.

    Args:
        distributions: Per-class feature statistics, passed to ``predict``.
        X_test: Iterable of samples to classify.

    Returns:
        list of predicted labels, one per row and in the same order.
    """
    return [predict(distributions, sample) for sample in X_test]

# Fit the hand-written Gaussian naive Bayes on the training split, then classify the test split.
distributions = class_distributions(X_train, y_train)
y_pred = get_predictions(distributions, X_test)

# Show the raw predictions, followed by the confusion matrix and the accuracy as a percentage.
print(y_pred)
print(metrics.confusion_matrix(y_test, y_pred))
print("Coded GNB %):", 100 * metrics.accuracy_score(y_test, y_pred))

['apple', 'orange', 'apple', 'apple', 'apple', 'mandarin', 'apple', 'lemon', 'orange', 'orange', 'apple', 'orange', 'apple', 'lemon', 'mandarin', 'apple', 'apple', 'mandarin']
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [2 0 0 4]]
Coded GNB %): 88.88888888888889
