# Classification Time!

In [2]:
def calculateBayesianProbability(Cov1, Cov2, Query, Mean1, Mean2,
                                 Num1, Num2):
    """Return class 1's share of the weighted densities of two classes at Query.

    Each class contributes ``Num * calculatePDF(Cov, Query, Mean)``; the result
    is the first contribution divided by the sum of both.

    Parameters
    ----------
    Cov1, Cov2 : covariance arguments forwarded to ``calculatePDF``.
    Query : point at which both densities are evaluated.
    Mean1, Mean2 : mean arguments forwarded to ``calculatePDF``.
    Num1, Num2 : multiplicative weights for each class
        (presumably per-class sample counts acting as priors -- TODO confirm).

    Returns
    -------
    The ratio ``weighted_1 / (weighted_1 + weighted_2)``.
    """
    weighted_first = Num1 * calculatePDF(Cov1, Query, Mean1)
    weighted_second = Num2 * calculatePDF(Cov2, Query, Mean2)

    # Normalise by the total so the two classes' shares sum to one.
    return weighted_first / (weighted_first + weighted_second)

def calculatePDF(Cov, Query, Mean):
    """Evaluate a multivariate Gaussian density at a single query point.

    Parameters
    ----------
    Cov : array-like, shape (d, d)
        Covariance matrix. It must be invertible with a positive determinant.
    Query : array-like with d elements
        Point at which the density is evaluated (row or column layout accepted).
    Mean : array-like with d elements
        Mean of the distribution.

    Returns
    -------
    float
        The density value at ``Query``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``Cov`` is singular or not square.
    """
    covariance = np.atleast_2d(np.asarray(Cov, dtype=float))
    # Flatten both vectors so ndarray rows/columns and np.matrix inputs all
    # reduce to the same 1-D difference vector.
    offset = (np.asarray(Query, dtype=float).ravel()
              - np.asarray(Mean, dtype=float).ravel())
    dimension = offset.size

    # Use true matrix products here: element-wise `*` on ndarrays would
    # broadcast into a d x d array instead of the scalar quadratic form.
    quadratic_form = offset @ np.linalg.inv(covariance) @ offset

    # (2*pi)^d generalises the hard-coded 2*pi, which is only correct for d == 2.
    normaliser = np.sqrt((2 * np.pi) ** dimension * np.linalg.det(covariance))
    return float(np.exp(-0.5 * quadratic_form) / normaliser)


In [3]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd

def generate_classifier_nb(file):
    """Train a Gaussian naive Bayes model on part of a labelled CSV.

    The first column of each row is used as the class label and the remaining
    columns as the feature vector. Rows are grouped by label in file order;
    within each label the leading rows go to the training set and the rest are
    held out.

    Parameters
    ----------
    file : path (or anything ``pd.read_csv`` accepts) of the labelled CSV.

    Returns
    -------
    tuple
        ``(fitted GaussianNB, held-out feature array, held-out labels (list),
        number of training rows)``.
    """
    frame = pd.read_csv(file)

    # Bucket the feature vectors by label, preserving first-seen label order
    # and the original row order inside each bucket.
    rows_by_label = {}
    for _, record in frame.iterrows():
        values = list(record)
        rows_by_label.setdefault(values[0], []).append(values[1:])

    train_rows, train_labels = [], []
    held_out_rows, held_out_labels = [], []
    for label, rows in rows_by_label.items():
        # NOTE(review): the cutoff is int(70% of the class) minus one, so each
        # class trains on one row fewer than a plain 70% split -- confirm intent.
        cutoff = max(int(len(rows) * .7) - 1, 0)

        train_rows.extend(rows[:cutoff])
        train_labels.extend([label] * cutoff)

        held_out_rows.extend(rows[cutoff:])
        held_out_labels.extend([label] * (len(rows) - cutoff))

    model = GaussianNB().fit(np.array(train_rows), np.array(train_labels))

    return model, np.array(held_out_rows), held_out_labels, len(train_labels)

def generate_test_set (file):
    """Load a labelled CSV and split it into labels and feature vectors.

    The first column of every row is taken as the label; all remaining columns
    form that row's feature vector.

    Parameters
    ----------
    file : path (or anything ``pd.read_csv`` accepts) of the CSV to load.

    Returns
    -------
    tuple of np.ndarray
        ``(labels, features)`` in file order.
    """
    frame = pd.read_csv(file)

    # Materialise each row as a plain list so label and features can be sliced.
    records = [list(record) for _, record in frame.iterrows()]

    labels = [values[0] for values in records]
    feature_rows = [values[1:] for values in records]
    return np.array(labels), np.array(feature_rows)
            


## Classifying 

In [8]:
# Baseline experiments: train on each CSV, then report accuracy on the rows
# that generate_classifier_nb holds out.
report_format = ("\tNumber of mislabeled points out of a total %d points : %d\n\tRatio correct:%f\n\tTrainingSetSize : %i")

experiments = [
    # grand piano + bass guitar classifier
    ("GrandPiano vs Bass Guitar", "./feature_vectors_2_instruments.csv"),
    # three pianos
    ("GrandPiano vs Rhodes vs Clav", "./feature_vectors_3_instruments.csv"),
    # all 4
    ("GrandPiano vs Rhodes vs Clav vs Bass", "./feature_vectors_4_instruments.csv"),
    # 5 instruments
    ("GrandPiano vs Rhodes vs Clav vs Bass vs Clarinet", "./feature_vectors_5_instruments.csv"),
    # Clarinet vs Grand Piano
    ("Clarinet vs Grand Piano", "./clar_vs_piano.csv"),
]

trained = []
for banner, csv_path in experiments:
    print (banner)
    classifier, testset, target, numTraining = generate_classifier_nb(csv_path)
    predictions = classifier.predict(testset)
    print(report_format
          % (testset.shape[0],(target != predictions).sum(),(target == predictions).sum()/testset.shape[0], numTraining))
    trained.append(classifier)

# Keep each fitted model available under its own name.
(two_instrument_classifier,
 three_pianos_classifier,
 four_instrument_classifier,
 five_instrument_classifier,
 clar_vs_piano) = trained

GrandPiano vs Bass Guitar
	Number of mislabeled points out of a total 674 points : 0
	Ratio correct:1.000000
	TrainingSetSize : 1566
GrandPiano vs Rhodes vs Clav
	Number of mislabeled points out of a total 1011 points : 115
	Ratio correct:0.886251
	TrainingSetSize : 2349
GrandPiano vs Rhodes vs Clav vs Bass
	Number of mislabeled points out of a total 1348 points : 118
	Ratio correct:0.912463
	TrainingSetSize : 3132
GrandPiano vs Rhodes vs Clav vs Bass vs Clarinet
	Number of mislabeled points out of a total 831 points : 82
	Ratio correct:0.901324
	TrainingSetSize : 1921
Clarinet vs Grand Piano
	Number of mislabeled points out of a total 252 points : 12
	Ratio correct:0.952381
	TrainingSetSize : 580


In [3]:
# Shifted-data experiments: train on each CSV, then report accuracy on the rows
# that generate_classifier_nb holds out.
report_format = ("\tNumber of mislabeled points out of a total %d points : %d\n\tRatio correct:%f\n\tTrainingSetSize : %i")

experiments = [
    # grand piano + bass guitar classifier
    ("GrandPiano vs Bass Guitar Shifted", "./feature_vectors_2_instruments_shift.csv"),
    # three pianos
    ("GrandPiano vs Rhodes vs Clav Shifted", "./feature_vectors_3_instruments_shift.csv"),
    # all 4
    ("GrandPiano vs Rhodes vs Clav vs Bass Shifted", "./feature_vectors_4_instruments_shift.csv"),
    # Clarinet vs Grand shifted
    ("GrandPiano vs Clarinet Shifted", "./clar_vs_piano_shift.csv"),
    # Clarinet vs Rhodes shifted
    ("Rhodes vs Clarinet Shifted", "./clar_vs_rhodes_shift.csv"),
    # Violin vs Grand shifted
    ("Violin vs Grand Shifted", "./violin_vs_grand_shift.csv"),
]

trained = []
for banner, csv_path in experiments:
    print (banner)
    classifier, testset, target, numTraining = generate_classifier_nb(csv_path)
    predictions = classifier.predict(testset)
    print(report_format
          % (testset.shape[0],(target != predictions).sum(),(target == predictions).sum()/testset.shape[0], numTraining))
    trained.append(classifier)

# Keep each fitted model available under its own name; the "Game" cell reads
# grand_vs_rhodes_vs_clav_vs_bass.
(two_instrument_classifier,
 grand_vs_rhodes_vs_clav_shift,
 grand_vs_rhodes_vs_clav_vs_bass,
 grand_vs_clar_shift,
 rhodes_vs_clar,
 violin_vs_grand_shifted) = trained

GrandPiano vs Bass Guitar Shifted
	Number of mislabeled points out of a total 2690 points : 4
	Ratio correct:0.998513
	TrainingSetSize : 6270
GrandPiano vs Rhodes vs Clav Shifted
	Number of mislabeled points out of a total 4035 points : 228
	Ratio correct:0.943494
	TrainingSetSize : 9405
GrandPiano vs Rhodes vs Clav vs Bass Shifted
	Number of mislabeled points out of a total 5380 points : 235
	Ratio correct:0.956320
	TrainingSetSize : 12540
GrandPiano vs Clarinet Shifted
	Number of mislabeled points out of a total 1250 points : 50
	Ratio correct:0.960000
	TrainingSetSize : 2910
Rhodes vs Clarinet Shifted
	Number of mislabeled points out of a total 1250 points : 178
	Ratio correct:0.857600
	TrainingSetSize : 2910
Violin vs Grand Shifted
	Number of mislabeled points out of a total 1202 points : 2
	Ratio correct:0.998336
	TrainingSetSize : 2798


In [11]:
# Game: score an already-fitted classifier on a separate test CSV.
# Depends on grand_vs_rhodes_vs_clav_vs_bass existing in the kernel
# (it is assigned by the shifted-experiments cell).
print ("Game")
target, testset = generate_test_set("./game_test_set.csv")

# Sanity check: labels and feature rows should have matching lengths.
for loaded in (target, testset):
    print (len(loaded))

predictions = grand_vs_rhodes_vs_clav_vs_bass.predict(testset)

total_points = testset.shape[0]
mislabeled = (target != predictions).sum()
ratio_correct = (target == predictions).sum()/testset.shape[0]

print("\tNumber of mislabeled points out of a total %d points : %d\n\tRatio correct:%f"
      % (total_points, mislabeled, ratio_correct))

Game
16
16
	Number of mislabeled points out of a total 16 points : 2
	Ratio correct:0.875000


In [7]:
# Clarinet vs violin with the "sum heuristic" feature files, at two sample sizes.
report_format = ("\tNumber of mislabeled points out of a total %d points : %d\n\tRatio correct:%f\n\tTrainingSetSize : %i")

experiments = [
    # Clarinet vs Violin Less Samples
    ("Clarinet vs Violin Less Samples Sum Heuristic", "./clar_vs_violin_less_samples_sum_heuristic.csv"),
    # Clarinet vs Violin More Samples
    ("Clarinet vs Violin More Samples Sum Heuristic", "./clar_vs_violin_more_samples_sum_heuristic.csv"),
]

trained = []
for banner, csv_path in experiments:
    print (banner)
    classifier, testset, target, numTraining = generate_classifier_nb(csv_path)
    predictions = classifier.predict(testset)
    print(report_format
          % (testset.shape[0],(target != predictions).sum(),(target == predictions).sum()/testset.shape[0], numTraining))
    trained.append(classifier)

# Keep each fitted model available under its own name.
clar_vs_piano_sum_less, clar_vs_piano_sum_more = trained

Clarinet vs Violin Less Samples Sum Heuristic
	Number of mislabeled points out of a total 55 points : 19
	Ratio correct:0.654545
	TrainingSetSize : 119
Clarinet vs Violin More Samples Sum Heuristic
	Number of mislabeled points out of a total 917 points : 211
	Ratio correct:0.769902
	TrainingSetSize : 2128


In [6]:
# Clarinet vs violin at two sample sizes.
report_format = ("\tNumber of mislabeled points out of a total %d points : %d\n\tRatio correct:%f\n\tTrainingSetSize : %i")

experiments = [
    # Clarinet vs Violin Less Samples
    ("Clarinet vs Violin Less Samples", "./clar_vs_violin_less_samples.csv"),
    # Clarinet vs Violin More Samples
    ("Clarinet vs Violin More Samples", "./clar_vs_violin_more_samples.csv"),
]

trained = []
for banner, csv_path in experiments:
    print (banner)
    classifier, testset, target, numTraining = generate_classifier_nb(csv_path)
    predictions = classifier.predict(testset)
    print(report_format
          % (testset.shape[0],(target != predictions).sum(),(target == predictions).sum()/testset.shape[0], numTraining))
    trained.append(classifier)

# Keep each fitted model available under its own name.
clar_vs_vio_less, clar_vs_vio_more = trained

Clarinet vs Violin Less Samples
	Number of mislabeled points out of a total 55 points : 20
	Ratio correct:0.636364
	TrainingSetSize : 121
Clarinet vs Violin More Samples
	Number of mislabeled points out of a total 927 points : 237
	Ratio correct:0.744337
	TrainingSetSize : 2153
