In [None]:
'''
Dataset : 
    project3_dataset1.txt, 
    project3_dataset2.txt
'''

## Naive Bayes

In [765]:
import math
import os

import numpy as np
import pandas as pd

# Running totals of the evaluation metrics, each starting at zero.
accuracy = 0
precision = 0
recall = 0
fMeasure = 0

# Function to read the Dataset file 
def folder(pathname):
    """Load a tab-separated, header-less dataset file into a DataFrame.

    ``pathname`` is appended verbatim to the current working directory, so it
    must begin with a path separator (for example ``'/data.txt'``).

    Returns the parsed ``pandas.DataFrame``; columns are numbered from 0.
    """
    location = os.getcwd() + pathname
    return pd.read_csv(location, sep="\t", header=None, index_col=False)

# Separating data & output labels
def separate(df):
    """Split a dataset into its feature matrix and an integer label column.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Two-dimensional dataset whose last column holds the class labels.

    Returns
    -------
    data : numpy.ndarray
        Every column except the last one.
    labels : numpy.ndarray
        The last column cast to ``int`` and shaped ``(n_rows, 1)``.
    """
    # Work on the argument itself rather than on a module-level array, so the
    # function gives the right answer for whatever dataset it is handed.
    rows = df.to_numpy() if hasattr(df, "to_numpy") else np.asarray(df)
    data = rows[:, 0:-1]
    labels = rows[:, -1].reshape((rows.shape[0], 1)).astype(int)
    return data, labels

# Splitting up the dataset into Training Dataset & Testing Dataset
def split(test_ind,cross_valid_list):
    """Pick one fold as the test set and stack the remaining folds as training data.

    ``cross_valid_list`` is a sequence of 2-D arrays (the folds) and
    ``test_ind`` the position of the fold to hold out.

    Returns ``(test, train)`` as NumPy arrays, in that order.
    """
    held_out = cross_valid_list[test_ind]
    remaining = []
    for position, fold in enumerate(cross_valid_list):
        if position != test_ind:
            remaining.append(fold)
    train = np.vstack(remaining)
    return np.asarray(held_out), np.asarray(train)

# Function to remove categorical data from the dataset
def get_nonCat_data(data, indices):
    """Return ``data`` without the columns whose positions are listed in ``indices``.

    The surviving columns keep their original left-to-right order. The number
    of columns is taken from the first row of ``data``.
    """
    kept_columns = []
    for column in range(len(data[0])):
        if column in indices:
            continue
        kept_columns.append(data[:, column])
    # The columns were collected as rows, so flip them back.
    return np.asarray(kept_columns).T

# Function to Square a numpy array
def squared(prob):
    """Return the element-wise square of ``prob`` (``prob`` multiplied by itself)."""
    operand = prob
    return np.multiply(operand, operand)
    
# Function to calculate the Posterior Probability using mean and standard deviation.
def posteriorProb(mean, stdv, test):
    """Evaluate the Gaussian likelihood of every test row.

    For each row, the normal density of every feature is evaluated with the
    given per-feature mean and standard deviation, and the per-feature
    densities are multiplied together. No class prior is applied here.

    Parameters
    ----------
    mean : array-like
        Per-feature means.
    stdv : array-like
        Per-feature standard deviations.
    test : numpy.ndarray
        Two-dimensional array; its last column is dropped before evaluation
        and every remaining column is treated as a numeric feature.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with one density product per row of ``test``.
    """
    # Everything is computed in float64. Evaluating the exponential in float32
    # flushes densities far from the mean to exactly 0, which erases the
    # difference between the two classes for such rows.
    features = np.asarray(test[:, 0:-1], dtype=np.float64)
    mean = np.asarray(mean, dtype=np.float64)
    stdv = np.asarray(stdv, dtype=np.float64)

    exponent = -np.square(features - mean) / (2.0 * np.square(stdv))
    density = np.exp(exponent) / (math.sqrt(2.0 * math.pi) * stdv)
    return np.prod(density, axis=1)

# Function to predict the class for test data using the probability.
def predict(prob_one,prob_zero,test):
    """Label each test row 1 when its class-one score is strictly larger, else 0.

    Scores are read by position for as many rows as ``test`` has; ties go to
    class 0. Returns a plain list of ints.
    """
    labels = []
    for row in range(len(test)):
        if prob_one[row] > prob_zero[row]:
            labels.append(1)
        else:
            labels.append(0)
    return labels
  
# Function to return false positives (m01), false negatives (m10), true positives (m11) and true negatives (m00)
def metric(actual,predicted):
    """Count the four confusion-matrix cells for binary labels.

    Returns ``(m11, m10, m01, m00)``: true positives, false negatives,
    false positives and true negatives, where "positive" means label 1.
    """
    true_pos = false_neg = false_pos = true_neg = 0
    for idx in range(len(actual)):
        truth = actual[idx]
        agrees = truth == predicted[idx]
        if truth == 1:
            if agrees:
                true_pos += 1
            else:
                false_neg += 1
        elif agrees:
            true_neg += 1
        else:
            false_pos += 1
    return true_pos, false_neg, false_pos, true_neg

# ---------------------------------------------------------------------------
# Private helpers shared by the three run modes below: manual single-row
# prediction, k-fold cross validation, and separate train / test files.
# ---------------------------------------------------------------------------

def _string_columns(rows):
    """Return the indices of the columns whose first-row value is a ``str``."""
    return [col for col in range(len(rows[0])) if isinstance(rows[0][col], str)]


def _category_codes(rows, columns):
    """Build ``{column: {category: code}}``; codes follow ``np.unique`` order."""
    return {
        col: {category: code for code, category in enumerate(np.unique(rows[:, col]))}
        for col in columns
    }


def _apply_codes(rows, codes, strict=True):
    """Replace the categorical values of ``rows`` (in place) by their integer codes.

    A value missing from ``codes`` raises ``ValueError`` when ``strict`` is
    true; otherwise it is replaced by ``-1``, a code no known category gets.
    """
    for col, mapping in codes.items():
        for row in range(len(rows)):
            value = rows[row][col]
            if value in mapping:
                rows[row][col] = mapping[value]
            elif strict:
                raise ValueError("Unknown category {!r} in column {}".format(value, col))
            else:
                rows[row][col] = -1


def _class_priors(train):
    """Return ``(share of label 0, share of label 1)`` from the last column of ``train``."""
    labels = list(train[:, -1])
    return float(labels.count(0)) / len(train), float(labels.count(1)) / len(train)


def _categorical_tables(train, columns):
    """Estimate per-class relative frequencies for every category of ``columns``.

    Returns ``(table_zero, table_one)`` where ``table[col][category]`` is the
    share of that class's training rows holding ``category`` in column ``col``.
    A class without training rows gets ``0.0`` instead of a division by zero.
    """
    labels = train[:, -1].astype(int)
    tables = []
    for label in (0, 1):
        class_rows = train[labels == label]
        table = {}
        for col in columns:
            observed = list(class_rows[:, col])
            table[col] = {
                category: (float(observed.count(category)) / len(class_rows)
                           if len(class_rows) else 0.0)
                for category in np.unique(train[:, col])
            }
        tables.append(table)
    return tables[0], tables[1]


def _categorical_likelihood(test, columns, table):
    """Multiply, for each test row, the class frequencies of its categorical values.

    A category that is absent from ``table`` contributes ``0.0`` rather than
    raising ``KeyError``.
    """
    likelihood = np.ones(test.shape[0])
    for row in range(len(test)):
        for col in columns:
            likelihood[row] *= table[col].get(test[row][col], 0.0)
    return likelihood


def _classify(train, test, columns):
    """Fit on ``train`` and score every row of ``test``.

    Returns ``(prob_zero, prob_one, predicted)``: the unnormalised class scores
    (prior x Gaussian likelihood x categorical likelihood) and the 0/1 labels.
    """
    prior_zero, prior_one = _class_priors(train)
    table_zero, table_one = _categorical_tables(train, columns)

    # Numeric part: drop the categorical columns; the label stays last and is
    # used to separate the two classes before being sliced off.
    train_numeric = get_nonCat_data(train, columns)
    zero = np.array([row[0:-1] for row in train_numeric if row[-1] == 0]).astype(np.float64)
    one = np.array([row[0:-1] for row in train_numeric if row[-1] == 1]).astype(np.float64)
    test_numeric = get_nonCat_data(test, columns)

    score_zero = prior_zero * np.multiply(
        posteriorProb(np.mean(zero, axis=0), np.std(zero, axis=0), test_numeric),
        _categorical_likelihood(test, columns, table_zero))
    score_one = prior_one * np.multiply(
        posteriorProb(np.mean(one, axis=0), np.std(one, axis=0), test_numeric),
        _categorical_likelihood(test, columns, table_one))
    return score_zero, score_one, predict(score_one, score_zero, test)


def _scores(test, predicted):
    """Return ``(accuracy, precision, recall, f1)``; a ratio with a zero denominator is 0."""
    m11, m10, m01, m00 = metric(test[:, -1], np.asarray(predicted))
    total = m11 + m10 + m01 + m00
    acc = float(m11 + m00) / total if total else 0
    prec = float(m11) / (m11 + m01) if (m11 + m01) else 0
    rec = float(m11) / (m11 + m10) if (m11 + m10) else 0
    f1 = float(2 * m11) / ((2 * m11) + m10 + m01) if ((2 * m11) + m10 + m01) else 0
    return acc, prec, rec, f1


# ---------------------------------------------------------------------------
# Interactive driver
# ---------------------------------------------------------------------------
dataType = input("Is text file different for Training and Test (y or n) : ")

if dataType == 'n':
    inp = input("Manual Input (yes, ignore if no) : ")
    file = input("Enter the File Name: ")
    df = folder(str('/' + file))
    df_numpy = df.to_numpy()

    # Detect string columns and replace their values by integer codes.
    Strings = _string_columns(df_numpy)
    category_codes = _category_codes(df_numpy, Strings)
    _apply_codes(df_numpy, category_codes)

    if any(inp):
        # --- Manual mode: classify one row typed in by the user -------------
        # The typed row is kept in its own variable so that `inp` still holds
        # the mode answer when the report section checks it later.
        raw_values = input("Give the input seperated by comma:")
        values = [value.strip() for value in raw_values.split(',')]
        feature_count = df_numpy.shape[1] - 1  # every column except the label
        if len(values) != feature_count:
            raise ValueError("Expected {} comma separated values, got {}".format(
                feature_count, len(values)))
        testdata = np.array(values, dtype=object).reshape(1, feature_count)
        _apply_codes(testdata, category_codes)

        test, train = testdata, df_numpy
        prior_prob_zero, prior_prob_one = _class_priors(train)
        string_prior_zero, string_prior_one = _categorical_tables(train, Strings)
        string_prob_zero = _categorical_likelihood(test, Strings, string_prior_zero)
        string_prob_one = _categorical_likelihood(test, Strings, string_prior_one)

        # Only the categorical columns contribute in this mode; the two class
        # scores are normalised so that they sum to one.
        s = (prior_prob_one * string_prob_one) + (prior_prob_zero * string_prob_zero)
        prob_zero = (prior_prob_zero * string_prob_zero)/s
        prob_one = (prior_prob_one * string_prob_one)/s

        print('Probability of 0: {}'.format(prob_zero))
        print('Probability of 1: {}'.format(prob_one))
        predicted = predict(prob_one, prob_zero, test)
        print("Predicted Label: {}".format(predicted))

    else:
        # --- Cross-validation mode -------------------------------------------
        folds = int(input("Enter the Number of folds (less than {}): ".format(df_numpy.shape[0])))
        if folds:
            crossSplits = np.array_split(df_numpy, folds)
        else:
            # Entering 0 selects one split fewer than there are rows.
            crossSplits = np.array_split(df_numpy, df_numpy.shape[0]-1)
        if len(crossSplits) < 2:
            raise ValueError("Cross validation needs at least 2 folds")

        # Each fold serves once as the test set; the metrics are summed here
        # and turned into averages by the report section.
        for fold_index in range(len(crossSplits)):
            test, train = split(fold_index, crossSplits)
            prob_zero, prob_one, predicted = _classify(train, test, Strings)
            fold_acc, fold_prec, fold_rec, fold_f1 = _scores(test, predicted)
            accuracy += fold_acc
            precision += fold_prec
            recall += fold_rec
            fMeasure += fold_f1

elif dataType == 'y':
    # --- Separate training and testing files ---------------------------------
    train_data_file = input("Enter the Training Data File Name: ")
    df_train = folder(str('/' + train_data_file))
    df_numpy_train = df_train.to_numpy()

    test_data_file = input('Enter the Testing Data File Name: ')
    df_test = folder(str('/' + test_data_file))
    df_numpy_test = df_test.to_numpy()

    # The codes are learnt from the training file and applied to both files so
    # that a category means the same number in each. Test categories that the
    # training file never contained become -1 and get zero likelihood.
    Strings = _string_columns(df_numpy_train)
    category_codes = _category_codes(df_numpy_train, Strings)
    _apply_codes(df_numpy_train, category_codes)
    _apply_codes(df_numpy_test, category_codes, strict=False)

    test, train = df_numpy_test, df_numpy_train
    prob_zero, prob_one, predicted = _classify(train, test, Strings)

    run_acc, run_prec, run_rec, run_f1 = _scores(test, predicted)
    accuracy += run_acc
    precision += run_prec
    recall += run_rec
    fMeasure += run_f1

    print("Accuracy : {}".format(accuracy*100))
    print("Precision : {}".format(precision*100))
    print("Recall : {}".format(recall*100))
    print("F1 - Measure : {}".format(fMeasure*100))

# Report for the cross-validation mode only (dataset in one file, no manual input).
if dataType == 'n' and not any(inp):
    # The summed per-fold metrics are scaled by a fixed factor of 10, which
    # yields percentages only for a 10-fold run.
    print("Accuracy : {}".format(accuracy*10))
    print("Precision : {}".format(precision*10))
    print("Recall : {}".format(recall*10))
    print("F1 - Measure : {}".format(fMeasure*10))
    
    print('\n')
    print('After Normalizing (only when fold value is high)')
    # Average over the number of splits that were actually evaluated. `folds`
    # is 0 when the default split was requested, so dividing by it would fail.
    fold_count = len(crossSplits)
    print("Accuracy : {}".format(accuracy/fold_count*100)) 
    print("Precision : {}".format(precision/fold_count*100))
    print("Recall : {}".format(recall/fold_count*100))
    print("F1 - Measure : {}".format(fMeasure/fold_count*100))

Is text file different for Training and Test (y or n) : n
Manual Input (yes, ignore if no) : yes
Enter the File Name: project3_dataset4.txt
Give the input seperated by comma:sunny,hot,high,weak
Probability of 0: [0.79541735]
Probability of 1: [0.20458265]
Predicted Label: [0]
