In [None]:
"""
    Ho va Ten: Bui Thi Thanh Xuan
    MSSV: 19110522
"""

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

In [None]:
# Load the lab dataset (lienminh.csv) from the course repository.
# This cell needs network access.
df = pd.read_csv(
    "https://raw.githubusercontent.com/huynhthanh98/ML/ML-2022/Lab04/lienminh.csv"
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,killsDiff,minionsKilledDiff,wardPlacedDiff,firstBlood,heralds,dragons,teamWins
0,3,-2,13,blue,none,none,red
1,0,-66,0,red,red,red,red
2,-4,-17,0,red,none,blue,red
3,-1,-34,28,red,blue,none,red
4,0,-15,58,red,none,red,red
5,2,4,-18,red,none,blue,blue
6,1,61,-39,blue,none,blue,blue
7,-8,52,1,red,none,red,red
8,0,-51,1,red,red,red,red
9,-1,-27,-3,blue,none,blue,blue


# Gaussian Naive Bayes Classifier

In [None]:
def cal_prior(y_train,classes):
    """
    Estimate the prior probability of each class as its relative frequency.

    Parameters:
        y_train: training labels, shape (N,)
        classes: the labels to score, e.g. array([0, 1])

    Returns:
        A list with one entry per element of `classes`, in the same order:
        the fraction of entries of `y_train` equal to that class.
    """
    n_samples = len(y_train)
    return [np.sum(y_train == label) / n_samples for label in classes]

In [None]:
def normal(x, mean, std):
    """
    Evaluate the Gaussian probability density with the given mean and
    standard deviation at x.

    Works element-wise through NumPy broadcasting when the arguments are
    arrays. `std` must be non-zero; a zero value leads to a division by zero.
    """
    scale = 1.0/(np.sqrt(2*np.pi)*std)
    exponent = (-(x-mean)**2)/(2*std**2)
    return scale*np.exp(exponent)

In [None]:
def Gaussian_NB(X_train,y_train,var_smoothing=1e-9):
    '''
        Fit a Gaussian Naive Bayes model on the training set and predict the
        label of every training sample.
        ---------------------------
        Parameters:
            X_train: numerical feature matrix of shape (N, D)
            y_train: class labels of shape (N,)
            var_smoothing: fraction of the largest feature variance that is
                added to every per-class variance (same idea as
                sklearn.naive_bayes.GaussianNB). It keeps the density finite
                when a feature is constant within a class.
        ---------------------------
        Returns:
            Y_pred: array of shape (N,) with the predicted label of each row.
                    The values are taken from y_train, so when the labels are
                    0..K-1 they coincide with the class indices.
    '''
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)

    n_samples = X_train.shape[0]
    if n_samples == 0:
        # Nothing to predict.
        return np.array([])

    classes = np.unique(y_train)

    # Variance smoothing term, shared by all classes and features.
    epsilon = var_smoothing * X_train.var(axis=0).max()
    # Last-resort floor so the variance is never exactly zero
    # (e.g. var_smoothing=0 or all features globally constant).
    floor = np.finfo(float).tiny

    # log_posterior[i, k] = log P(class k) + sum_d log N(x_id | mean_kd, var_kd).
    # Working in log-space avoids underflow of the product of densities.
    log_posterior = np.empty((n_samples, len(classes)))
    for k, label in enumerate(classes):
        X_c = X_train[y_train == label]

        # Per-class mean and (population) variance of every feature.
        mean = X_c.mean(axis=0)
        var = np.maximum(X_c.var(axis=0) + epsilon, floor)

        # Prior: relative frequency of the class.
        log_prior = np.log(X_c.shape[0] / n_samples)

        # Log-likelihood of every sample under this class.
        log_likelihood = -0.5 * np.sum(
            np.log(2 * np.pi * var) + (X_train - mean) ** 2 / var, axis=1
        )

        log_posterior[:, k] = log_prior + log_likelihood

    # Map the winning column back to the actual class label.
    return classes[np.argmax(log_posterior, axis=1)]

In [None]:
class MultiColumnLabelEncoder:
    '''
    Label-encode several DataFrame columns at once.

    Each selected column is encoded independently with a fresh LabelEncoder
    that is fitted inside `transform`, so nothing is learned or stored by
    `fit`, and two calls on different data may assign different codes.
    '''

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode (None = all columns)

    def fit(self,X,y=None):
        '''No-op kept for a scikit-learn-like interface; returns self.'''
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Returns a copy; X itself is left unchanged.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that works on both old and new versions.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        '''Equivalent to fit(X, y) followed by transform(X).'''
        return self.fit(X,y).transform(X)

In [None]:
# Integer-encode only the target column; the feature columns are left untouched.
df_enc = MultiColumnLabelEncoder(columns=['teamWins']).fit_transform(df)

# Numerical features and encoded target used by the Gaussian model.
X_train_num = df_enc.loc[:, ['killsDiff', 'minionsKilledDiff', 'wardPlacedDiff']].to_numpy()
y_train_num = df_enc['teamWins'].to_numpy()

In [None]:
# Predict on the training set itself and report the training accuracy.
y_pred_num = Gaussian_NB(X_train_num, y_train_num)
print(accuracy_score(y_true=y_train_num, y_pred=y_pred_num))

0.7082700678206296


In [None]:
from sklearn.naive_bayes import GaussianNB

# Reference implementation: scikit-learn's GaussianNB fitted on the same data.
clf = GaussianNB().fit(X_train_num, y_train_num)

# Training accuracy, to compare with the hand-written Gaussian_NB result.
print(clf.score(X_train_num, y_train_num))

0.7082700678206296


# Categorical Naive Bayes Classifier

In [None]:
# NOTE: this redefines cal_prior from the Gaussian section of the notebook.
def cal_prior(y_train,classes):
    '''
    Relative frequency of every class among the training labels.

    y_train: (N,)
    classes: array(['blue','red'])

    Returns a list aligned with `classes`.
    '''
    total = len(y_train)
    prior = []
    for label in classes:
        matches = np.sum(y_train == label)
        prior.append(matches / total)
    return prior

In [None]:
def likelihood_cat(X_train,label,y,f):
    """
    Conditional relative frequency of the category `f` given the class `label`.

    Parameters:
        X_train: one feature column, shape (N,)
        label: the class to condition on
        y: class labels, shape (N,)
        f: the category value whose frequency is measured

    Returns:
        (number of samples of class `label` whose feature equals `f`)
        divided by (number of samples of class `label`). No smoothing is applied.
    """
    in_class = (y == label)
    hits = np.sum(X_train[in_class] == f)
    return hits/np.sum(in_class)

In [None]:
def Categorical_NB(X,y):
    '''
        Fit a Categorical Naive Bayes model on (X, y) and predict the class
        of every row of X. No smoothing is applied to the frequencies.
        ---------------------------
        Parameters:
            X: categorical feature matrix of shape (N, D)
            y: categorical class labels of shape (N,)
        ---------------------------
        Returns:
            Y_pred: integer array of shape (N,). Each entry is the position of
                    the predicted class inside np.unique(y) (sorted label
                    order), not the label itself.
    '''
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] == 0:
        # Nothing to predict.
        return np.array([])

    classes = np.unique(y)
    class_masks = [y == label for label in classes]
    class_sizes = np.array([np.sum(mask) for mask in class_masks])

    # Prior: relative frequency of each class.
    prior = class_sizes / len(y)

    n_samples, n_features = X.shape

    # likelihood[i, k] accumulates P(x_i | class k) feature by feature.
    likelihood = np.ones((n_samples, len(classes)))
    for f in range(n_features):
        column = X[:, f]
        cond_prob = np.zeros((n_samples, len(classes)))
        # Each conditional probability is computed once per distinct value
        # and broadcast to all rows holding that value, instead of rescanning
        # the whole column for every row.
        for value in dict.fromkeys(column.tolist()):
            rows = column == value
            for k, mask in enumerate(class_masks):
                cond_prob[rows, k] = np.sum(rows & mask) / class_sizes[k]
        likelihood *= cond_prob

    # Posterior (up to a constant): likelihood times prior.
    post_prob = likelihood * prior

    return np.argmax(post_prob, axis=1)

In [None]:
# Categorical features and the raw (string) target for the categorical model.
X_train_cat = df.loc[:, ['firstBlood','heralds','dragons']].to_numpy()
y_train_cat = df['teamWins'].to_numpy()

In [None]:
y_pred_cat = Categorical_NB(X_train_cat, y_train_cat)
# y_pred_cat is numerical, so it is compared against the integer-encoded
# target y_train_num (defined in the Gaussian section) rather than y_train_cat.
print(accuracy_score(y_true=y_train_num, y_pred=y_pred_cat))

0.6298208320680231


In [None]:
from sklearn.naive_bayes import CategoricalNB

# scikit-learn reference: CategoricalNB needs integer-coded categories,
# so the three categorical feature columns are label-encoded first.
X_train_enc = MultiColumnLabelEncoder(columns=['firstBlood','heralds','dragons']).fit_transform(df_enc)
x_train_enc = X_train_enc.loc[:, ['firstBlood','heralds','dragons']].to_numpy()

model = CategoricalNB().fit(x_train_enc, y_train_num)
print(model.score(x_train_enc, y_train_num))

0.6298208320680231
