## Problem Statement: Name Classification using Naive Bayes

In [None]:
import os
import pandas as pd
import numpy as np

# Reading the Dataset

Adding column names because the data is missing column headers

In [None]:

# Directory holding the per-language name files, and the column names to
# assign to each frame when it is loaded.
dirpath = '../data'
columns = ['names', 'label']

# header=None states explicitly what passing `names` already implies:
# the first line of the file is data, not a header.
us = pd.read_csv(
    os.path.join(dirpath, 'us.txt'),
    header=None,
    names=columns,
)

In [None]:
# Preview the first rows of the US frame right after loading.
us.head()

Unnamed: 0,names,label
0,Timothy Moses,
1,Anna Barajas,
2,James Caldwell,
3,Mr. Michael Cole,
4,Jeffrey Collier,


### Doing the same with every language file

In [None]:
# Load the remaining language files the same way as the US file; the
# generator is unpacked in the order the language names are listed.
japan, greek, arabic = (
    pd.read_csv(os.path.join(dirpath, language + '.txt'), header=None, names=columns)
    for language in ('japan', 'greek', 'arabic')
)

### Adding labels manually to every language dataset

In [None]:
# One 'us' entry per row of the US frame.
label = ['us' for _ in range(len(us))]

In [None]:
# Overwrite the label column of the US frame with the prepared label list.
us['label'] = label

In [None]:
# Preview again to check the label column after the assignment.
us.head()

Unnamed: 0,names,label
0,Timothy Moses,us
1,Anna Barajas,us
2,James Caldwell,us
3,Mr. Michael Cole,us
4,Jeffrey Collier,us


In [None]:
# Build one constant label list per remaining language, sized to its frame.
label1, label2, label3 = (
    [tag] * len(frame)
    for tag, frame in (('japan', japan), ('greek', greek), ('arabic', arabic))
)

# Attach each list as the label column of the matching frame.
japan['label'], greek['label'], arabic['label'] = label1, label2, label3

### Combining all 4 languages to create a single dataframe

In [None]:
# Stack the four language frames row-wise and renumber the index from 0.
finalnames = pd.concat(
    [us, japan, greek, arabic],
    ignore_index=True,
    axis=0,
)

### Checking for null or irrelevant data

In [None]:
# A bare expression is only displayed when it is the last statement of a
# notebook cell, so the shape must be printed explicitly to be seen.
print(finalnames.shape)
# Per-column count of missing values.
finalnames.isnull().sum()

names    0
label    0
dtype: int64

# Splitting dataset into 70% train and 30% test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore the reported accuracy, reproducible from run to run.
train, test = train_test_split(finalnames, test_size=0.3, random_state=42)

In [None]:
# (rows, columns) of the training split.
train.shape

(3600, 2)

# Function for Naive Bayes classifier

Preprocessing function to split names into words

In [None]:
import string
def preprocess(line):
    """Split a name into its whitespace-separated words.

    Args:
        line: The raw name string.

    Returns:
        A list of the non-empty words in ``line``, in order. An empty or
        all-whitespace string gives an empty list.
    """
    # split() with no separator ignores leading/trailing whitespace and treats
    # any run of whitespace as one delimiter, so "Anna  Barajas" no longer
    # produces an empty-string token that would be counted as a word.
    return line.split()

### Function for calculating Naive Bayes probability and predicting on test dataset

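We use the parameter estimation formula from the given resources to implement the Naive Bayes classifier. As implemented below, each word probability is smoothed with a constant of 0.3: $P(w \mid c) = \dfrac{\mathrm{count}(w, c) + 0.3}{N_c + 0.3\,|V_c|}$, where $N_c$ is the total number of words seen for class $c$ and $|V_c|$ is the number of distinct words seen for class $c$.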


In [None]:
import math
def result(sentence,vals):
    """Return the most probable language label for one name.

    Each class is scored as the log2 class prior plus the sum, over the words
    of ``sentence``, of the log2 smoothed word probability
    ``(0.3 + count) / (class_word_total + 0.3 * class_vocabulary_size)``.

    Args:
        sentence: The name to classify.
        vals: A 12-item sequence laid out as
            ``[0:4]``  word -> count dicts for us, arabic, japan, greek;
            ``[4:8]``  number of training names per class, same order;
            ``[8:12]`` total number of training words per class, same order.

    Returns:
        One of ``'us'``, ``'arabic'``, ``'japan'``, ``'greek'``. On a tie the
        earliest class in that order wins.

    Raises:
        ValueError: If no class has any training data to score against.
    """
    labels = ('us', 'arabic', 'japan', 'greek')
    smoothing = 0.3

    word_counts = vals[0:4]
    name_counts = vals[4:8]
    word_totals = vals[8:12]
    total_names = sum(name_counts)

    tokens = preprocess(sentence)

    best_label = None
    best_score = -math.inf
    for label, counts, n_names, n_words in zip(
            labels, word_counts, name_counts, word_totals):
        denominator = n_words + len(counts) * smoothing
        # A class with no training names has prior 0 (log2 is undefined) and
        # one with no words has a zero denominator; such a class can never be
        # the prediction, so skip it instead of raising.
        if n_names == 0 or denominator == 0:
            continue

        # Accumulate word terms first and the prior last.
        score = 0
        for token in tokens:
            score += math.log2((smoothing + counts.get(token, 0)) / denominator)
        score += math.log2(n_names / total_names)

        # Strict comparison keeps the earliest class on ties.
        if score > best_score:
            best_label, best_score = label, score

    if best_label is None:
        raise ValueError("cannot classify: no class has any training data")
    return best_label


## Function to preprocess dataset and generate vocabulary

We use dictionaries (word → count) to build a vocabulary of the names for each language

In [None]:
def classifier(train_data, test_data):
    """Collect per-language word statistics from ``train_data`` and label ``test_data``.

    Args:
        train_data: Table with a ``names`` column (name strings) and a
            ``label`` column (language of each name).
        test_data: Table with a ``names`` column holding the names to classify.

    Returns:
        A list with one predicted label per row of ``test_data``, in row order.
    """
    languages = ('us', 'arabic', 'japan', 'greek')
    word_counts = {language: {} for language in languages}
    name_counts = dict.fromkeys(languages, 0)
    word_totals = dict.fromkeys(languages, 0)

    # Iterate over the arguments themselves; sizing the loops from the
    # module-level `train`/`test` frames made the function wrong for any
    # other input.
    for name, label in zip(train_data['names'], train_data['label']):
        tokens = preprocess(name)
        # Any label other than 'us', 'arabic' or 'greek' is counted as 'japan'.
        language = label if label in ('us', 'arabic', 'greek') else 'japan'

        name_counts[language] += 1
        word_totals[language] += len(tokens)
        counts = word_counts[language]
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    # Layout expected by result(): word-count dicts, then name counts, then
    # word totals, each in us / arabic / japan / greek order.
    vals = [word_counts[language] for language in languages]
    vals += [name_counts[language] for language in languages]
    vals += [word_totals[language] for language in languages]

    return [result(name, vals) for name in test_data['names']]
    

# Calculating accuracy of the Model

In [None]:
# Predict a label for every test row using statistics from the training rows.
results = classifier(train, test)

# Count predictions that match the true label, then report the hit rate.
correct_ct = sum(
    predicted == actual
    for predicted, actual in zip(results, test['label'])
)
print("Classification accuracy = %5.2f%%" % (100.0 * correct_ct / len(test)))


Classification accuracy = 92.50%
