In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def calc_prob_target_class(data, target):
    """Return the empirical prior probability of every class in ``data[target]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to estimate the priors from.
    target : str
        Name of the column holding the class label.

    Returns
    -------
    list of float
        ``P(label)`` for each label, in the order produced by
        ``data[target].unique()`` (order of first appearance). Empty when
        ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        return []

    column = data[target]
    # p(label) = rows carrying that label / total rows. The boolean mask has to
    # be summed: its length is always ``len(data)``, which would make every
    # prior equal to 1.
    return [float((column == label).sum() / total) for label in column.unique()]

In [3]:
# Gaussian likelihood of a single feature value given a class: the mean and
# standard deviation are estimated from the rows of `data` whose target equals `label`.

def calc_prob_x_given_y(data, target, feature_name, feature_value, label):
    """Evaluate the Gaussian density of ``feature_value`` for one class.

    The mean and the (pandas default, sample) standard deviation of
    ``data[feature_name]`` are taken over the rows where
    ``data[target] == label``; the normal density with those parameters is
    then evaluated at ``feature_value`` and returned.
    """
    in_class = data[target] == label
    values = data.loc[in_class, feature_name]
    mu, sigma = values.mean(), values.std()

    norm = 1 / (np.sqrt(2 * np.pi) * sigma)
    exponent = -((feature_value - mu) ** 2 / (2 * sigma ** 2))
    return norm * np.exp(exponent)

In [4]:
def predict(data, X, y, features=None):
    """Classify every sample in ``X`` with a Gaussian naive Bayes model.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain the feature columns and the target column.
    X : iterable of sequences
        Samples to classify. Entry ``j`` of a sample is matched with
        ``features[j]``; any further entries are ignored.
    y : str
        Name of the target column in ``data``.
    features : sequence of str, optional
        Feature column names. When omitted, every column of ``data`` except
        the last two is used.

    Returns
    -------
    numpy.ndarray
        One predicted class label per sample. The values are labels taken
        from ``data[y]``, not positions in the label list.

    Notes
    -----
    A class whose feature has zero or undefined standard deviation (for
    example a class with a single training row) produces NaN scores.
    """
    if features is None:
        features = list(data.columns)[:-2]
    else:
        features = list(features)
    n_features = len(features)

    labels = list(data[y].unique())
    n_labels = len(labels)

    # Fit once: per-class prior, mean and standard deviation. These do not
    # depend on the sample being classified, so they stay out of the loop.
    log_prior = np.empty(n_labels)
    mean = np.empty((n_labels, n_features))
    std = np.empty((n_labels, n_features))
    for i, label in enumerate(labels):
        rows = data.loc[data[y] == label, features].astype(float)
        log_prior[i] = np.log(len(rows) / len(data))
        mean[i] = rows.mean().to_numpy()
        std[i] = rows.std().to_numpy()  # pandas default: sample std (ddof=1)

    # Work with log densities: summing logs avoids the underflow that a long
    # product of small densities runs into.
    log_norm = -0.5 * np.log(2 * np.pi) - np.log(std)
    two_var = 2 * std ** 2

    y_pred = []
    for x in X:
        xv = np.array([x[j] for j in range(n_features)], dtype=float)
        log_likelihood = (log_norm - (xv - mean) ** 2 / two_var).sum(axis=1)
        best = int(np.argmax(log_likelihood + log_prior))
        # Map the winning position back to the actual class label.
        y_pred.append(labels[best])
    return np.array(y_pred)

In [5]:
from sklearn.model_selection import train_test_split

# Read 'Transformed.csv' and discard its 'Unnamed: 0' column
# (presumably an index that was saved with the file -- confirm).
df = pd.read_csv('Transformed.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,PC1,PC2,PC3,genre,track_name
0,-0.99045,0.999392,-0.159711,15,C'est beau de faire un Show
1,-1.209613,0.273042,-0.684229,15,Perdu d'avance (par Gad Elmaleh)
2,2.112402,0.353162,-1.866821,15,Don't Let Me Be Lonely Tonight
3,1.954505,-0.186844,0.251315,15,Dis-moi Monsieur Gordon Cooper
4,2.935588,0.391576,-1.123134,15,Ouverture


In [6]:
# Bucket the rows of `df` by the value in their second-to-last column.
vals = df.values
separated = {}
for vector in vals:
    separated.setdefault(vector[-2], []).append(vector)

# From every bucket keep only the leading 1% of rows (rounded down).
lst = []
for rows in separated.values():
    keep = int(len(rows) * 0.01)
    lst.extend(rows[:keep])

data = pd.DataFrame(lst, columns=['PC1', 'PC2', 'PC3', 'genre', 'title'])
data.shape

(2312, 5)

In [None]:
# First three columns / fourth column of the sub-sampled frame.
arr = data.values
X, y = arr[:, :3], arr[:, 3]

# NOTE(review): the split below is taken from the full `df`, not from `data`,
# and `X` / `y` are not used in this cell -- confirm which frame is intended.
train, test = train_test_split(df, test_size=0.2, random_state=4)
X_test, y_test = test.iloc[:, :-2].values, test.iloc[:, -2].values
y_pred = predict(train, X_test, 'genre')

In [None]:
from sklearn.metrics import accuracy_score

# Score the predictions against the held-out labels.
accuracy_score(y_true=y_test, y_pred=y_pred)