<h1>Import all libraries</h1>

In [30]:
import pandas as pd
import nltk
import gensim
import re
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.metrics.scores import *
from sklearn.metrics import (f1_score,accuracy_score,precision_score,recall_score)

<h1>Split data into 60% (training), 20% (testing), 20% (validation)</h1>

In [19]:
# Fixed seed so the shuffle (and therefore the train/test/validation split and
# every metric computed from it) is the same on each Restart & Run All.
RANDOM_SEED = 42

# header=0: the first row of the CSV (itemid, title, Category, image_page) is
# used as the column names.
all_df = pd.read_csv("train.csv", header=0)

# Shuffle every row before splitting so the three subsets are random samples.
all_df = all_df.sample(frac=1, random_state=RANDOM_SEED)

# Split sizes: 60% training, 20% testing; validation takes whatever is left,
# so it also absorbs the rows lost to int() truncation.
total_num = len(all_df)
num_of_60_percent_title = int(0.6 * total_num)
num_of_20_percent_title = int(0.20 * total_num)

train_end = num_of_60_percent_title
test_end = train_end + num_of_20_percent_title

# Positional slices of the shuffled frame (original index labels are kept).
train_df = all_df.iloc[:train_end]
test_df = all_df.iloc[train_end:test_end]
val_df = all_df.iloc[test_end:]

# Every row must land in exactly one subset.
assert len(train_df) + len(test_df) + len(val_df) == total_num



<h1>Data preprocessing for training, testing, and validation data</h1>

In [27]:
# Resources shared by the preprocessing of all three splits.
stop_list = stopwords.words('english')
stemmer = PorterStemmer()

# Set copy of the stop words for constant-time membership tests
# (`stop_list` itself is left untouched).
_stop_words = set(stop_list)
# A token is kept only if it consists solely of lowercase ASCII letters.
_alpha_token = re.compile('^[a-z]+$')


def preprocess_title(title):
    """Convert one raw title string into a list of stemmed tokens.

    Steps, in order: tokenize with NLTK, lower-case, keep tokens matching
    ``^[a-z]+$``, drop English stop words, Porter-stem the remainder.

    Args:
        title: the raw title text.

    Returns:
        list of stemmed token strings (possibly empty).
    """
    lowered = (w.lower() for w in nltk.word_tokenize(title))
    return [
        stemmer.stem(w)
        for w in lowered
        if _alpha_token.search(w) and w not in _stop_words
    ]


def preprocess_split(split_df):
    """Return ``(corpus, labels)`` for one split.

    Args:
        split_df: DataFrame with a 'title' and a 'Category' column.

    Returns:
        corpus: one token list per title, in row order.
        labels: the 'Category' values, in the same row order.
    """
    split_corpus = [preprocess_title(title) for title in split_df['title']]
    split_labels = list(split_df['Category'])
    return split_corpus, split_labels


def vectorize_split(split_corpus, split_labels, vocabulary):
    """Turn a preprocessed split into the representations used for NLTK.

    Args:
        split_corpus: token lists as returned by ``preprocess_split``.
        split_labels: labels aligned with ``split_corpus``.
        vocabulary: gensim Dictionary used for ``doc2bow``.

    Returns:
        tf_vectors: ``doc2bow`` output for every document.
        data_as_dict: one ``{token_id: 1}`` dict per document (binary
            presence features; the term frequency is discarded).
        labeled_data: list of ``(feature_dict, label)`` pairs.
    """
    tf_vectors = [vocabulary.doc2bow(doc) for doc in split_corpus]
    data_as_dict = [{token_id: 1 for (token_id, _tf) in vec} for vec in tf_vectors]
    labeled_data = list(zip(data_as_dict, split_labels))
    return tf_vectors, data_as_dict, labeled_data


# --- training data ---------------------------------------------------------
train_corpus, train_labels = preprocess_split(train_df)

# The dictionary is built from the training corpus only; the test and
# validation splits are vectorized with this same dictionary.
dictionary = gensim.corpora.Dictionary(train_corpus)

_, _, labeled_training_data = vectorize_split(train_corpus, train_labels, dictionary)

# --- testing data ----------------------------------------------------------
test_corpus, test_labels = preprocess_split(test_df)
test_tf_vectors, test_data_as_dict, labeled_test_data = vectorize_split(
    test_corpus, test_labels, dictionary
)

# --- validating data -------------------------------------------------------
val_corpus, val_labels = preprocess_split(val_df)
val_tf_vectors, val_data_as_dict, labeled_val_data = vectorize_split(
    val_corpus, val_labels, dictionary
)

# The original cell left these scratch names bound to the validation split;
# keep them so any other cell that still reads them behaves as before.
corpus, labels = val_corpus, val_labels
    


<h1>Train Model</h1>

In [21]:
# Train two NLTK classifiers on the same list of (feature-dict, label) pairs.
classifier_naive = nltk.NaiveBayesClassifier.train(labeled_training_data)
# NOTE(review): trained with default settings. The output recorded under this
# cell reports "Training (100 iterations)" and a final log likelihood of nan,
# together with what look like numeric warnings — TODO confirm the training is
# numerically stable on this data before relying on the maxent scores.
classifier_maxent = nltk.MaxentClassifier.train(labeled_training_data)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -4.06044        0.000
             2          -1.75992        0.592


  exp_nf_delta = 2 ** nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)


         Final               nan        0.669


<h1>Prediction for testing</h1>

In [43]:
# Gold labels the test predictions are scored against.
goal_category = test_labels

# Classify every test document with each model, one prediction per row of test_df.
prediction_for_naive = [
    classifier_naive.classify(test_data_as_dict[row])
    for row in range(len(test_df))
]
prediction_for_maxent = [
    classifier_maxent.classify(test_data_as_dict[row])
    for row in range(len(test_df))
]
    

<h1>Test Model</h1>

In [45]:
# Score both models against `goal_category`: accuracy first, then
# macro-averaged precision, recall and F1.
macro = {'average': 'macro'}

# One row per printed line: (caption, metric function, predictions, extra kwargs).
test_report = [
    ("Accuracy for naive bayes algorithm: ", accuracy_score, prediction_for_naive, {}),
    ("Accuracy for maxent algorithm: ", accuracy_score, prediction_for_maxent, {}),
    ("Precision for naive bayes algorithm", precision_score, prediction_for_naive, macro),
    ("Precision for maxent algorithm", precision_score, prediction_for_maxent, macro),
    ("Recall for naive bayes algorithm", recall_score, prediction_for_naive, macro),
    ("Recall for maxent algorithm", recall_score, prediction_for_maxent, macro),
    ("F1 score for naive bayes algorithm", f1_score, prediction_for_naive, macro),
    ("F1 score for maxent algorithm", f1_score, prediction_for_maxent, macro),
]

for caption, metric, predicted, options in test_report:
    print(caption, metric(goal_category, predicted, **options))


Accuracy for naive bayes algorithm:  0.609677249986874
Accuracy for maxent algorithm:  0.6470451459988149
Precision for naive bayes algorithm 0.53221047910089
Precision for maxent algorithm 0.6553976966644761
Recall for naive bayes algorithm 0.6636719238012163
Recall for maxent algorithm 0.42818639941411446
F1 score for naive bayes algorithm 0.5542114758093354
F1 score for maxent algorithm 0.46781032688531593


<h1>Prediction for validation</h1>

In [None]:
# Gold labels the validation predictions are scored against.
goal_category = val_labels

# Classify every validation document with each model. Iterate over the
# validation features themselves rather than range(len(test_df)): the
# validation split receives the remainder rows of the 60/20/20 split, so it can
# be longer than the test split, and sizing the loop by the test split would
# leave the predictions shorter than `val_labels`.
prediction_for_naive = [
    classifier_naive.classify(features) for features in val_data_as_dict
]
prediction_for_maxent = [
    classifier_maxent.classify(features) for features in val_data_as_dict
]

<h1>Validating Model</h1>

In [46]:
# Score both models against `goal_category` (as set by the most recently run
# prediction cell): accuracy first, then macro-averaged precision, recall, F1.
macro = {'average': 'macro'}

# One row per printed line: (caption, metric function, predictions, extra kwargs).
validation_report = [
    ("Accuracy for naive bayes algorithm: ", accuracy_score, prediction_for_naive, {}),
    ("Accuracy for maxent algorithm: ", accuracy_score, prediction_for_maxent, {}),
    ("Precision for naive bayes algorithm", precision_score, prediction_for_naive, macro),
    ("Precision for maxent algorithm", precision_score, prediction_for_maxent, macro),
    ("Recall for naive bayes algorithm", recall_score, prediction_for_naive, macro),
    ("Recall for maxent algorithm", recall_score, prediction_for_maxent, macro),
    ("F1 score for naive bayes algorithm", f1_score, prediction_for_naive, macro),
    ("F1 score for maxent algorithm", f1_score, prediction_for_maxent, macro),
]

for caption, metric, predicted, options in validation_report:
    print(caption, metric(goal_category, predicted, **options))