# CSI5386 Natural Language Processing
## Project - Automatic Classification of Poems by Themes
### Aaditya Suri, Ranjan Goyal, Paritosh Pal Singh

In [4]:
#import data
import pandas as pd
import numpy as np

In [25]:
import os
pardir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
topics = os.path.join(pardir, "Project", "data", "topics")
anger = os.path.join(topics, "anger")
animals = os.path.join(topics, "animal")
children = os.path.join(topics, "children")
father = os.path.join(topics, "father")
friend = os.path.join(topics, "friend")
river = os.path.join(topics, "river")
romance = os.path.join(topics, "romance")
sky = os.path.join(topics, "sky")
water = os.path.join(topics, "water")
winter = os.path.join(topics, "winter")

In [28]:
def fetch_poems(path):
    poems_arr = []
    for p in os.listdir(path):
        poem_path = os.path.join(path, p)
        f = open(poem_path)
        poem_text = ""
        for i in f:
            poem_text += i.strip() + "\t"
        poems_arr.append(poem_text)
        f.close()
    return poems_arr

In [29]:
p_anger = fetch_poems(anger)
p_animal = fetch_poems(animals)
p_children = fetch_poems(children)
p_father = fetch_poems(father)
p_friend = fetch_poems(friend)
p_river = fetch_poems(river)
p_romance = fetch_poems(romance)
p_sky = fetch_poems(sky)
p_water = fetch_poems(water)
p_winter = fetch_poems(winter)

In [30]:
data = []
p_array = [p_anger, p_animal, p_children, p_father, p_friend, p_river, p_romance, p_sky, p_water, p_winter]
data_labels = []
for i in range(0, len(p_array)):
    data.extend(p_array[i])
    data_labels.extend([i]*len(p_array[i]))
data = np.array(data)
data_labels = np.array(data_labels)

In [90]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, data_labels, test_size = 0.2)

In [91]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((797,), (200,), (797,), (200,))

In [92]:
# run this in conda terminal
# conda install -c conda-forge spacy
# python -m spacy download en_core_web_sm

In [93]:
def vectors(nlp, data):
    vectors = []
    for row in data:
        row = str(row)
        doc = nlp(row)
        vectors.append(doc.vector)
    return np.array(vectors)

In [94]:
import spacy
nlp = spacy.load('en_core_web_md')
train_vectors = vectors(nlp, X_train)

In [95]:
test_vectors = vectors(nlp, X_test)

## Random Forest Classifier

In [96]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
def evaluate(y_true, y_pred):
    # print evaluation results for model
    # print confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    # accuracy
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    # precision
    print("Precision: ", precision_score(y_true, y_pred, average = 'weighted'))
    # recall
    print("Recall ", recall_score(y_true, y_pred, average = 'weighted'))
    # f-measure
    print("F1: ", f1_score(y_true, y_pred, average = 'weighted'))

In [97]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 200)
rf_model.fit(train_vectors, y_train)
y_pred_rf = rf_model.predict(test_vectors)

In [98]:
evaluate(y_test, y_pred_rf)

Confusion Matrix:
[[20  2  1  0  0  0  0  0  0  0]
 [ 4 15  1  0  1  0  3  0  0  0]
 [ 1  2 13  3  0  0  1  0  0  2]
 [ 1  0  2 10  4  0  0  0  0  1]
 [ 2  0  2  1  7  3  1  1  1  1]
 [ 1  0  0  0  0 14  1  2  1  2]
 [ 0  0  0  2  3  1  8  2  0  1]
 [ 0  0  0  0  1  1  1  9  0  2]
 [ 1  0  1  1  3  4  0  1 10  2]
 [ 0  0  0  0  2  1  1  2  0 13]]
Accuracy:  0.595
Precision:  0.6156118421052632
Recall  0.595
F1:  0.5936339546017354


## SVM Classifier

In [99]:
from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(train_vectors, y_train)
y_pred_svm = svm_model.predict(test_vectors)

In [100]:
evaluate(y_test, y_pred_svm)

Confusion Matrix:
[[18  2  1  0  1  0  0  0  0  1]
 [ 3 11  2  2  3  0  3  0  0  0]
 [ 0  0 17  0  2  0  1  0  0  2]
 [ 0  0  1  7  9  0  0  0  0  1]
 [ 1  0  2  1  8  3  2  1  0  1]
 [ 1  0  0  0  1 12  0  2  0  5]
 [ 0  0  0  1  4  1  8  1  0  2]
 [ 0  1  0  0  1  0  2  6  0  4]
 [ 0  0  1  0  3  4  2  1  6  6]
 [ 0  0  0  0  2  1  1  1  0 14]]
Accuracy:  0.535
Precision:  0.6245619675302337
Recall  0.535
F1:  0.5389408665273593


## CatBoost Classifier

In [101]:
from catboost import CatBoostClassifier
cb_model = CatBoostClassifier()
cb_model.fit(train_vectors, y_train)
y_pred_cb = cb_model.predict(test_vectors)

Learning rate set to 0.078216
0:	learn: 2.2676887	total: 130ms	remaining: 2m 10s
1:	learn: 2.2363092	total: 265ms	remaining: 2m 12s
2:	learn: 2.2009664	total: 397ms	remaining: 2m 11s
3:	learn: 2.1636493	total: 530ms	remaining: 2m 11s
4:	learn: 2.1376829	total: 657ms	remaining: 2m 10s
5:	learn: 2.1093778	total: 797ms	remaining: 2m 12s
6:	learn: 2.0815135	total: 931ms	remaining: 2m 12s
7:	learn: 2.0526458	total: 1.06s	remaining: 2m 11s
8:	learn: 2.0229530	total: 1.19s	remaining: 2m 10s
9:	learn: 1.9936711	total: 1.32s	remaining: 2m 10s
10:	learn: 1.9682733	total: 1.47s	remaining: 2m 12s
11:	learn: 1.9407877	total: 1.63s	remaining: 2m 13s
12:	learn: 1.9205791	total: 1.77s	remaining: 2m 14s
13:	learn: 1.8966448	total: 1.94s	remaining: 2m 16s
14:	learn: 1.8700209	total: 2.1s	remaining: 2m 17s
15:	learn: 1.8453529	total: 2.24s	remaining: 2m 17s
16:	learn: 1.8207772	total: 2.38s	remaining: 2m 17s
17:	learn: 1.7959569	total: 2.53s	remaining: 2m 18s
18:	learn: 1.7748922	total: 2.67s	remaining: 

159:	learn: 0.4725023	total: 23.1s	remaining: 2m 1s
160:	learn: 0.4685324	total: 23.3s	remaining: 2m 1s
161:	learn: 0.4654748	total: 23.4s	remaining: 2m 1s
162:	learn: 0.4615867	total: 23.5s	remaining: 2m
163:	learn: 0.4575537	total: 23.7s	remaining: 2m
164:	learn: 0.4543087	total: 23.8s	remaining: 2m
165:	learn: 0.4516844	total: 23.9s	remaining: 2m
166:	learn: 0.4480558	total: 24.1s	remaining: 2m
167:	learn: 0.4449699	total: 24.2s	remaining: 1m 59s
168:	learn: 0.4414010	total: 24.4s	remaining: 1m 59s
169:	learn: 0.4392561	total: 24.5s	remaining: 1m 59s
170:	learn: 0.4361948	total: 24.6s	remaining: 1m 59s
171:	learn: 0.4316876	total: 24.8s	remaining: 1m 59s
172:	learn: 0.4283400	total: 24.9s	remaining: 1m 58s
173:	learn: 0.4248920	total: 25s	remaining: 1m 58s
174:	learn: 0.4227611	total: 25.2s	remaining: 1m 58s
175:	learn: 0.4188546	total: 25.3s	remaining: 1m 58s
176:	learn: 0.4154432	total: 25.4s	remaining: 1m 58s
177:	learn: 0.4129723	total: 25.6s	remaining: 1m 58s
178:	learn: 0.4093

315:	learn: 0.1941246	total: 45.9s	remaining: 1m 39s
316:	learn: 0.1933861	total: 46s	remaining: 1m 39s
317:	learn: 0.1923208	total: 46.1s	remaining: 1m 38s
318:	learn: 0.1913711	total: 46.3s	remaining: 1m 38s
319:	learn: 0.1906072	total: 46.4s	remaining: 1m 38s
320:	learn: 0.1898614	total: 46.6s	remaining: 1m 38s
321:	learn: 0.1890851	total: 46.7s	remaining: 1m 38s
322:	learn: 0.1882602	total: 46.9s	remaining: 1m 38s
323:	learn: 0.1874566	total: 47s	remaining: 1m 38s
324:	learn: 0.1864790	total: 47.2s	remaining: 1m 37s
325:	learn: 0.1854285	total: 47.3s	remaining: 1m 37s
326:	learn: 0.1847749	total: 47.5s	remaining: 1m 37s
327:	learn: 0.1842233	total: 47.6s	remaining: 1m 37s
328:	learn: 0.1832132	total: 47.7s	remaining: 1m 37s
329:	learn: 0.1824049	total: 47.9s	remaining: 1m 37s
330:	learn: 0.1815909	total: 48s	remaining: 1m 37s
331:	learn: 0.1806537	total: 48.2s	remaining: 1m 36s
332:	learn: 0.1800414	total: 48.3s	remaining: 1m 36s
333:	learn: 0.1794306	total: 48.5s	remaining: 1m 36s

471:	learn: 0.1091946	total: 1m 7s	remaining: 1m 15s
472:	learn: 0.1087778	total: 1m 7s	remaining: 1m 15s
473:	learn: 0.1084360	total: 1m 8s	remaining: 1m 15s
474:	learn: 0.1080928	total: 1m 8s	remaining: 1m 15s
475:	learn: 0.1077454	total: 1m 8s	remaining: 1m 15s
476:	learn: 0.1073291	total: 1m 8s	remaining: 1m 14s
477:	learn: 0.1070094	total: 1m 8s	remaining: 1m 14s
478:	learn: 0.1067567	total: 1m 8s	remaining: 1m 14s
479:	learn: 0.1063527	total: 1m 8s	remaining: 1m 14s
480:	learn: 0.1059925	total: 1m 8s	remaining: 1m 14s
481:	learn: 0.1057207	total: 1m 9s	remaining: 1m 14s
482:	learn: 0.1053539	total: 1m 9s	remaining: 1m 14s
483:	learn: 0.1051913	total: 1m 9s	remaining: 1m 13s
484:	learn: 0.1049275	total: 1m 9s	remaining: 1m 13s
485:	learn: 0.1046700	total: 1m 9s	remaining: 1m 13s
486:	learn: 0.1043218	total: 1m 9s	remaining: 1m 13s
487:	learn: 0.1040950	total: 1m 9s	remaining: 1m 13s
488:	learn: 0.1040436	total: 1m 9s	remaining: 1m 13s
489:	learn: 0.1037073	total: 1m 10s	remaining:

627:	learn: 0.0745429	total: 1m 29s	remaining: 53.1s
628:	learn: 0.0743647	total: 1m 29s	remaining: 53s
629:	learn: 0.0741772	total: 1m 30s	remaining: 52.9s
630:	learn: 0.0740163	total: 1m 30s	remaining: 52.7s
631:	learn: 0.0738866	total: 1m 30s	remaining: 52.6s
632:	learn: 0.0737376	total: 1m 30s	remaining: 52.4s
633:	learn: 0.0735921	total: 1m 30s	remaining: 52.3s
634:	learn: 0.0734233	total: 1m 30s	remaining: 52.2s
635:	learn: 0.0732875	total: 1m 30s	remaining: 52s
636:	learn: 0.0731423	total: 1m 31s	remaining: 51.9s
637:	learn: 0.0729782	total: 1m 31s	remaining: 51.7s
638:	learn: 0.0727931	total: 1m 31s	remaining: 51.6s
639:	learn: 0.0726266	total: 1m 31s	remaining: 51.5s
640:	learn: 0.0724804	total: 1m 31s	remaining: 51.3s
641:	learn: 0.0724518	total: 1m 31s	remaining: 51.2s
642:	learn: 0.0724214	total: 1m 31s	remaining: 51.1s
643:	learn: 0.0722102	total: 1m 32s	remaining: 50.9s
644:	learn: 0.0720398	total: 1m 32s	remaining: 50.8s
645:	learn: 0.0718422	total: 1m 32s	remaining: 50.

783:	learn: 0.0572713	total: 1m 52s	remaining: 30.9s
784:	learn: 0.0572448	total: 1m 52s	remaining: 30.7s
785:	learn: 0.0572199	total: 1m 52s	remaining: 30.6s
786:	learn: 0.0571320	total: 1m 52s	remaining: 30.4s
787:	learn: 0.0570678	total: 1m 52s	remaining: 30.3s
788:	learn: 0.0569825	total: 1m 52s	remaining: 30.2s
789:	learn: 0.0569047	total: 1m 52s	remaining: 30s
790:	learn: 0.0567899	total: 1m 53s	remaining: 29.9s
791:	learn: 0.0566678	total: 1m 53s	remaining: 29.7s
792:	learn: 0.0565911	total: 1m 53s	remaining: 29.6s
793:	learn: 0.0565735	total: 1m 53s	remaining: 29.5s
794:	learn: 0.0565589	total: 1m 53s	remaining: 29.3s
795:	learn: 0.0564501	total: 1m 53s	remaining: 29.2s
796:	learn: 0.0563226	total: 1m 54s	remaining: 29s
797:	learn: 0.0562006	total: 1m 54s	remaining: 28.9s
798:	learn: 0.0561791	total: 1m 54s	remaining: 28.8s
799:	learn: 0.0560361	total: 1m 54s	remaining: 28.6s
800:	learn: 0.0560039	total: 1m 54s	remaining: 28.5s
801:	learn: 0.0559990	total: 1m 54s	remaining: 28.

941:	learn: 0.0466474	total: 2m 15s	remaining: 8.37s
942:	learn: 0.0466425	total: 2m 16s	remaining: 8.22s
943:	learn: 0.0466325	total: 2m 16s	remaining: 8.08s
944:	learn: 0.0465720	total: 2m 16s	remaining: 7.94s
945:	learn: 0.0465361	total: 2m 16s	remaining: 7.79s
946:	learn: 0.0465054	total: 2m 16s	remaining: 7.65s
947:	learn: 0.0464977	total: 2m 16s	remaining: 7.5s
948:	learn: 0.0464161	total: 2m 16s	remaining: 7.36s
949:	learn: 0.0463286	total: 2m 17s	remaining: 7.22s
950:	learn: 0.0462613	total: 2m 17s	remaining: 7.07s
951:	learn: 0.0461924	total: 2m 17s	remaining: 6.93s
952:	learn: 0.0461411	total: 2m 17s	remaining: 6.78s
953:	learn: 0.0460729	total: 2m 17s	remaining: 6.64s
954:	learn: 0.0460041	total: 2m 17s	remaining: 6.5s
955:	learn: 0.0459665	total: 2m 18s	remaining: 6.35s
956:	learn: 0.0459250	total: 2m 18s	remaining: 6.21s
957:	learn: 0.0458490	total: 2m 18s	remaining: 6.06s
958:	learn: 0.0457706	total: 2m 18s	remaining: 5.92s
959:	learn: 0.0456918	total: 2m 18s	remaining: 5

In [102]:
evaluate(y_test, y_pred_cb)

Confusion Matrix:
[[20  2  0  0  1  0  0  0  0  0]
 [ 3 17  0  0  2  0  2  0  0  0]
 [ 2  4 12  2  0  0  0  0  0  2]
 [ 1  0  0 13  3  0  0  0  0  1]
 [ 2  0  1  2  7  2  1  1  2  1]
 [ 1  0  0  0  0 15  1  2  2  0]
 [ 0  0  0  1  3  0 11  1  0  1]
 [ 0  2  0  0  0  0  0  9  0  3]
 [ 2  0  1  1  1  3  0  1 13  1]
 [ 0  1  0  1  0  0  1  2  0 14]]
Accuracy:  0.655
Precision:  0.666888211620283
Recall  0.655
F1:  0.6522211786240669
