In [11]:
#importing required packages
import numpy as np
import pandas as pd

In [12]:
#load train files as .csv files
train_label = pd.read_csv('train_label.csv', header = None)
train_data = pd.read_csv('train_data.csv', header = None)

#load vocabulary and map files
voc = pd.read_csv('vocabulary.txt', header= None)
map_data = pd.read_csv('map.csv')

#cleaning vocabulary dataframe
voc['wordID'] = np.arange(1, len(voc) + 1)
voc.index.names = ['wordID']
voc.index = voc.index + 1
voc['wordID'] = voc.index

#cleaning training files dataframes
train_data.columns = ['docID', 'wordID', 'count']
train_label.columns = ['categoryID_true']
map_data.columns = ['categoryID', 'categoryName']
train_label['docID'] = train_label.index

In [13]:
#obtaining the range for documents ID for each class
y = []
for i in range(1,21):
       y.append(train_label[train_label['categoryID_true'] == i].iloc[0,1])
        
y[0] = 0
y.insert(20, len(train_label))
y = pd.Series(y)

In [14]:
#assigning a new columns for train data which indicates corresponding categories
x = []
l = np.array(range(0,20))
train_data['cat'] = pd.cut(train_data['docID'], bins = y, labels = l)

#total number of words in all documents for each category
n = train_data.groupby(['cat']).sum()['count']

#wordsID occuring in each category for all documents
train_data4 = train_data.iloc[:,1:].groupby(['cat','wordID']).sum().dropna() 

In [15]:
train = train_data4.reset_index()

In [16]:
#appending words to train data which were previously absent
#assigning zero count to previously absent words 
pp = []
df = pd.DataFrame([])
for i in range (0,20):
    t = train[train['cat'] == i]
    p = pd.merge(t,voc, how = 'outer')
    p['count'].fillna(0, inplace=True)
    p['cat'].fillna(i,inplace =True)
    p = p.sort_values(by='wordID')
    p = p.reset_index(drop=True)
    p.drop(0, axis = 1, inplace =True)
    df = pd.DataFrame(p)
    pp.append(df)
p.sort_values(by = 'wordID')
master_df = pd.concat(pp)
master_df = master_df.groupby(['cat','wordID']).sum()

In [17]:
#total number of documents in each categories and calculating prior probabilities
totdoc = len(train_label)
doc_j = []
for item in range(1,21):
     x = doc_j.append(len(train_label[train_label['categoryID_true'] == item]))
        
doc_j = pd.DataFrame(doc_j)
doc_j['prior'] = np.log(doc_j/totdoc)
doc_j['# docs in cat i'] = doc_j[0]
doc_j.drop(0, axis=1, inplace=True)
doc_j = doc_j[['# docs in cat i', 'prior']]
train_label['docID'] = train_label['docID'] + 1
train_label =train_label.set_index('docID')

doc_j

Unnamed: 0,# docs in cat i,prior
0,480,-3.156025
1,581,-2.96506
2,572,-2.980672
3,587,-2.954786
4,575,-2.975441
5,592,-2.946304
6,582,-2.96334
7,592,-2.946304
8,596,-2.93957
9,594,-2.942932


In [18]:
td = master_df.groupby(level = [0]).sum()

In [19]:
#calculating Bayesian estimator and Maximum likelihood estimator 
z_BE = []
z_MLE = []
for i in range (0,20):
    BE = (1+master_df[master_df.index.get_level_values('cat') == i])/(td[td.index.get_level_values('cat') == i] + len(voc))
    MLE = master_df[master_df.index.get_level_values('cat') == i]/td[td.index.get_level_values('cat') == i]
    z_BE.append(BE)
    z_MLE.append(MLE)
    master_dfBE = np.log(pd.concat(z_BE))
    master_dfMLE = np.log(pd.concat(z_MLE))

  # Remove the CWD from sys.path while we load stuff.


In [20]:
master_dfBE = master_dfBE.reset_index()
master_dfBE.columns = [['cat', 'wordID', 'BE']]

In [21]:
master_dfMLE = master_dfMLE.reset_index()
master_dfMLE.columns = [['cat', 'wordID', 'MLE']]

In [22]:
#training the model on training data
z = pd.DataFrame()
for i in range (0,20):
    m = master_dfBE[master_dfBE['cat'] == i]
    common = pd.merge(m, train_data, on=['wordID']).dropna()
    common = common.groupby(['docID', 'wordID']).sum()
    c = pd.DataFrame(common.groupby(level = 'docID')['BE'].sum())
    c = c.reset_index(['docID'])
    c = c.drop_duplicates(subset= ['docID'], keep = 'first')
    c = c.reset_index(drop = True)
    c[str(i + 1)] = (c['BE'] + doc_j.iloc[i,1])
    c.drop('BE', axis = 1, inplace = True)
    z = pd.DataFrame(pd.concat([c,z], axis =1))
z = z.T.drop_duplicates().T
z = z.set_index('docID')
final_df = pd.DataFrame(z.apply(lambda x: x.argmax(), axis=1))
final_df.columns = ['categoryID_predict']
final_df = final_df.astype(int)

In [13]:
#overall accuracy for performance evaluation on training data
f = pd.concat([final_df,train_label], axis = 1)
(len(train_label)- len(f[f['categoryID_true'] != f['categoryID_predict']]))/len(train_label)*100

94.81764131688703

In [14]:
#constructing the required matrix for confusion matrix (1)
true = []
false = []
for i in range (1,21):
    x = len(f[(f['categoryID_predict'] == f['categoryID_true']) & (f['categoryID_true'] == i)])
    true.append(x)

ConfusionMatrix = pd.DataFrame(np.diag(true))

In [15]:
#constructing the required matrix for confusion matrix (2)
for i in range(1,21):
    for j in range (1,21):
        z = f[(f['categoryID_predict'] != f['categoryID_true']) & (f['categoryID_true'] == i) & (f['categoryID_predict'] == j)]
        if not z.empty: 
            false.append(z)
            ConfusionMatrix.iloc[i-1, j-1] = len(z)

In [16]:
ConfusionMatrix.columns = ConfusionMatrix.columns + 1
ConfusionMatrix.index = ConfusionMatrix.index + 1

In [17]:
ConfusionMatrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
1,471,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,1,1,2
2,0,537,6,15,1,11,2,1,1,0,0,2,1,0,3,1,0,0,0,0
3,1,10,510,23,0,18,2,0,0,0,0,3,1,1,0,2,0,0,1,0
4,0,12,4,547,3,5,6,0,0,0,0,2,3,0,1,1,1,1,1,0
5,1,4,2,5,549,2,0,0,2,0,0,2,1,3,1,1,0,0,2,0
6,1,12,8,4,2,557,0,0,1,1,0,1,0,0,2,1,1,0,1,0
7,1,4,0,30,6,1,473,20,1,3,3,10,13,3,1,3,5,1,4,0
8,1,0,0,2,1,2,3,570,1,1,0,1,1,1,0,1,2,0,4,1
9,1,1,0,1,1,0,4,2,579,0,0,0,0,2,0,2,2,0,1,0
10,0,3,0,1,0,1,1,2,0,579,4,0,1,1,0,0,1,0,0,0


In [18]:
#class accuracies for trained model
ConfusionMatrix2 = pd.DataFrame(ConfusionMatrix.max(axis = 1)/ConfusionMatrix.sum(axis =1))
ConfusionMatrix2.columns = ['Accuracy']
ConfusionMatrix2

Unnamed: 0,Accuracy
1,0.98125
2,0.924269
3,0.891608
4,0.931857
5,0.954783
6,0.940878
7,0.812715
8,0.962838
9,0.971477
10,0.974747


In [19]:
#loading and cleaning test data
test_label = pd.read_csv('test_label.csv', header = None)
test_data = pd.read_csv('test_data.csv', header = None)
test_label['docID'] = test_label.index + 1
test_label.columns = [['categoryID_true', 'docID']]
test_data.columns = ['docID', 'wordID', 'count']

In [20]:
#predicting the categories for each document using BE
z2 = pd.DataFrame()
for i in range (0,20):
    m2 = master_dfBE[master_dfBE['cat'] == i]
    common2 = pd.merge(m2, test_data, on=['wordID'])
    c2 = common2.groupby(['docID'])['BE'].sum()
    c2 = c2.reset_index(['docID'])
    c3 = pd.DataFrame((c2['BE'] + doc_j.iloc[i,1]))
    z2 = pd.DataFrame(pd.concat([z2,c3], axis =1))
z2.columns = np.array(range(1,21))
final_df2 = pd.DataFrame(z2.apply(lambda x: x.argmax(), axis=1))
final_df2.columns = ['categoryID_predict']
final_df2 = final_df2.astype(int)

In [21]:
#evaluate the performance on test data
f2 = pd.concat([final_df2,test_label], axis = 1)
(len(test_label)- len(f2[f2['categoryID_true'] != f2['categoryID_predict']]))/len(test_label)*100

78.73417721518987

In [22]:
#confusion matrix construction
true = []
false = []
for i in range (1,21):
    x = len(f2[(f2['categoryID_predict'] == f2['categoryID_true']) & (f2['categoryID_true'] == i)])
    true.append(x)

ConfusionMatrix_test = pd.DataFrame(np.diag(true))

In [23]:
#confusion matrix construction
for i in range(1,21):
    for j in range (1,21):
        z = f[(f['categoryID_predict'] != f['categoryID_true']) & (f['categoryID_true'] == i) & (f['categoryID_predict'] == j)]
        if not z.empty: 
            false.append(z)
            ConfusionMatrix_test.iloc[i-1, j-1] = len(z)

In [24]:
ConfusionMatrix_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,246,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,1,1,2
1,0,296,6,15,1,11,2,1,1,0,0,2,1,0,3,1,0,0,0,0
2,1,10,201,23,0,18,2,0,0,0,0,3,1,1,0,2,0,0,1,0
3,0,12,4,305,3,5,6,0,0,0,0,2,3,0,1,1,1,1,1,0
4,1,4,2,5,275,2,0,0,2,0,0,2,1,3,1,1,0,0,2,0
5,1,12,8,4,2,302,0,0,1,1,0,1,0,0,2,1,1,0,1,0
6,1,4,0,30,6,1,255,20,1,3,3,10,13,3,1,3,5,1,4,0
7,1,0,0,2,1,2,3,350,1,1,0,1,1,1,0,1,2,0,4,1
8,1,1,0,1,1,0,4,2,359,0,0,0,0,2,0,2,2,0,1,0
9,0,3,0,1,0,1,1,2,0,358,4,0,1,1,0,0,1,0,0,0


In [25]:
#class accuracies for test data
ConfusionMatrix_test2 = pd.DataFrame(ConfusionMatrix_test.max(axis = 1)/ConfusionMatrix_test.sum(axis =1))
ConfusionMatrix_test2.columns = ['Accuracy']
ConfusionMatrix_test2

Unnamed: 0,Accuracy
0,0.964706
1,0.870588
2,0.764259
3,0.884058
4,0.913621
5,0.896142
6,0.700549
7,0.94086
8,0.954787
9,0.959786


In [26]:
#evaluation of the model performance on test data using 
z3 = pd.DataFrame()
for i in range (0,20):
    m3 = master_dfMLE[master_dfMLE['cat'] == i]
    common3 = pd.merge(m3, test_data, on=['wordID'])
    c3 = common3.groupby(['docID'])['MLE'].sum()
    c3 = c3.reset_index(['docID'])
    c3 = pd.DataFrame((c3['MLE'] + doc_j.iloc[i,1]))
    z3 = pd.DataFrame(pd.concat([z3,c3], axis =1))
z3.columns = np.array(range(1,21))
final_df3 = pd.DataFrame(z3.apply(lambda x: x.argmax(), axis=1)).dropna()
final_df3.columns = ['categoryID_predict']
final_df3 = final_df3.astype(int)

In [27]:
f3 = pd.concat([final_df3,test_label], axis = 1)
(len(test_label)- len(f3[f3['categoryID_true'] != f3['categoryID_predict']]))/len(test_label)*100

5.463024650233178

In [28]:
true = []
false = []
for i in range (1,21):
    x = len(f3[(f3['categoryID_predict'] == f3['categoryID_true']) & (f3['categoryID_true'] == i)])
    true.append(x)

ConfusionMatrix_testMLE = pd.DataFrame(np.diag(true))

In [29]:
for i in range(1,21):
    for j in range (1,21):
        z = f3[(f3['categoryID_predict'] != f3['categoryID_true']) & (f3['categoryID_true'] == i) & (f3['categoryID_predict'] == j)]
        if not z.empty: 
            false.append(z)
            ConfusionMatrix_testMLE.iloc[i-1, j-1] = len(z)

In [30]:
ConfusionMatrix_testMLE

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,0,26,2,1,1,4,0,0,0,0,0,0,1,0,2,0,0,0,0,0
2,0,3,19,6,3,1,0,0,0,0,0,1,1,2,0,0,1,0,0,0
3,0,3,5,28,2,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,2,0,2,22,0,1,0,0,0,0,0,4,1,0,0,0,0,0,0
5,0,3,0,1,1,33,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6,0,4,2,3,1,1,47,2,2,0,0,0,0,0,1,0,1,0,0,1
7,0,0,0,0,0,0,1,19,2,0,0,1,2,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,1,20,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,21,3,0,0,0,0,0,0,0,0,0


In [31]:
ConfusionMatrix_testMLE2 = pd.DataFrame(ConfusionMatrix_testMLE.max(axis = 1)/ConfusionMatrix_testMLE.sum(axis =1))
ConfusionMatrix_testMLE2.columns = ['Accuracy']
ConfusionMatrix_testMLE2

Unnamed: 0,Accuracy
0,0.875
1,0.702703
2,0.513514
3,0.666667
4,0.6875
5,0.846154
6,0.723077
7,0.730769
8,0.952381
9,0.84
