# <center>Unstructured Data Analysis</center>

MASNA 2019 Students <br>
Baranova Anastasia <br>
Gutman Irina

In [29]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('wordnet')
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
import pymorphy2
from nltk.corpus import stopwords
morphA = pymorphy2.MorphAnalyzer()
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('russian') )

[nltk_data] Downloading package wordnet to /home/an/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/an/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Before classification we need to read the data scraped previously and take two columns from it: annotation and book_class (labels).

In [30]:
data=pd.read_csv('data_for_classification.csv')
data

Unnamed: 0.1,Unnamed: 0,genre,annotation,title,book_class
0,0,"['sf_social', 'sf_history']","Мир на пороховой бочке, и несколько раз за век...",Армагед-дом,other
1,1,['magician_book'],В довольно-таки мрачном фэнтезийном мире зарож...,Скрут,other
2,2,['magician_book'],Ты можешь летать на птице или нырять на оседла...,Варан,other
3,3,"['sf_space', 'narrative']","История одного батальона, отслеженная писателе...",Десант на Счастье,other
4,4,"['sf_space', 'narrative']",Долгие годы последствия Галактической войны бу...,Повторная колонизация,other
...,...,...,...,...,...
101096,116606,"['det_action', 'adv_history']","В основу романа положен реальный, хотя и окруж...",Секретный рейд адмирала Брэда,other
101097,116607,['ref_dict'],Предлагаемый немецко-русский русско-немецкий м...,"Немецко-русский, русско-немецкий мини-словарь ...",other
101098,116608,['geo_guides'],"Сочи всегда был, есть и остается главным курор...",Олимпийский Сочи,other
101099,116609,"['child_education', 'sci_linguistic']",Вслед за книгой «Рисую узоры» данное пособие п...,Пишу буквы. Для одаренных детей 5-6 лет,science


In [31]:
d = data[['annotation', 'book_class']]

In [32]:
X, y = d.annotation, d.book_class

After assigning annotations and labels, we need to normalize our texts.

In [33]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[а-яА-Яa-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[а-яА-Яa-zA-Z]\s+', ' ', document)
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()
    document = [morphA.parse(word)[0].normal_form for word in document] 

    document = ' '.join(document)
    documents.append(document)

Next we vectorize the normalized annotations and create Bag of Words.

In [None]:

vectorizer = CountVectorizer(max_features=1500, stop_words=stopwords)
X = vectorizer.fit_transform(documents).toarray()

Next we use TF-IDF in order to take into account the frequencies of words.

In [35]:

tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

Then we split our data into train and test data (80/20).

In [36]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Next we decided to use different classifiers for our purpose of creating a model which can predict the genre/topic of a book by its annotation.

### Logistic Regression

In [None]:

classifier = LogisticRegression()
classifier.fit(X_train, y_train)


In [38]:
y_pred = classifier.predict(X_test)

In [39]:
score = classifier.score(X_test, y_test)

print("Accuracy: ", score)
print(classification_report(y_test,y_pred))

Accuracy:  0.866030364472578
              precision    recall  f1-score   support

       other       0.89      0.93      0.91     14581
     science       0.80      0.70      0.74      5640

    accuracy                           0.87     20221
   macro avg       0.84      0.81      0.83     20221
weighted avg       0.86      0.87      0.86     20221



The accuracy is 0.86, which is quite good.

In [40]:
#Saving the model
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

### Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 


RandomForestClassifier(n_estimators=1000, random_state=0)

In [42]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[14045   536]
 [ 1761  3879]]
              precision    recall  f1-score   support

       other       0.89      0.96      0.92     14581
     science       0.88      0.69      0.77      5640

    accuracy                           0.89     20221
   macro avg       0.88      0.83      0.85     20221
weighted avg       0.89      0.89      0.88     20221

0.8864052222936551


Random Forest took more time but it shows a slightly better result: 0.88.

### Linear Classifiers

In [43]:
from sklearn.linear_model import SGDClassifier

classifier = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
y_pred = classifier.predict(X_test)

print("Accuracy: ", score)
print(classification_report(y_test,y_pred))

Accuracy:  0.8632609663221403
              precision    recall  f1-score   support

       other       0.88      0.93      0.91     14581
     science       0.80      0.68      0.74      5640

    accuracy                           0.86     20221
   macro avg       0.84      0.81      0.82     20221
weighted avg       0.86      0.86      0.86     20221



In [44]:
classifier = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=50)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
y_pred = classifier.predict(X_test)

print("Accuracy: ", score)
print(classification_report(y_test,y_pred))

Accuracy:  0.8653874684733692
              precision    recall  f1-score   support

       other       0.88      0.94      0.91     14581
     science       0.81      0.67      0.74      5640

    accuracy                           0.87     20221
   macro avg       0.85      0.81      0.82     20221
weighted avg       0.86      0.87      0.86     20221



In [45]:
classifier = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=50)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
y_pred = classifier.predict(X_test)

print("Accuracy: ", score)
print(classification_report(y_test,y_pred))

Accuracy:  0.8670688887789921
              precision    recall  f1-score   support

       other       0.90      0.91      0.91     14581
     science       0.77      0.75      0.76      5640

    accuracy                           0.87     20221
   macro avg       0.84      0.83      0.83     20221
weighted avg       0.87      0.87      0.87     20221



In [46]:
classifier = SGDClassifier(loss="log", penalty="elasticnet", max_iter=50)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
y_pred = classifier.predict(X_test)

print("Accuracy: ", score)
print(classification_report(y_test,y_pred))

Accuracy:  0.8523317343355917
              precision    recall  f1-score   support

       other       0.86      0.95      0.90     14581
     science       0.82      0.60      0.69      5640

    accuracy                           0.85     20221
   macro avg       0.84      0.78      0.80     20221
weighted avg       0.85      0.85      0.84     20221



### CatBoost

In [49]:

from sklearn.metrics import confusion_matrix
from catboost import CatBoostClassifier

n_list=[]
k_list=[]
F_value_list=[]
conf_mat =[]

# Finding best parametrs of the model with the weights. Weights are needed since we have unbalanced classes.
for n in range (30,60,5): #was found by experimental knowledge
    n_list.append(n)
    #
    k=100-n
    k_list.append(k)
    #
    CatBoost_model = CatBoostClassifier(iterations=250,class_weights=[n,k])
    CatBoost_model.fit(X_train,y_train,silent = True)
    y_pred_lc = CatBoost_model.predict(X_test)
    cm_CatBoost_model = confusion_matrix(y_test,y_pred_lc)
    conf_mat.append(cm_CatBoost_model)
    #
    numerator = cm_CatBoost_model[1][1]+cm_CatBoost_model[0][0] #True Posotive + False Negative
    denominator =  cm_CatBoost_model[1][1] + cm_CatBoost_model[0][1] + cm_CatBoost_model[1][0] + cm_CatBoost_model[0][0] # All
    accuracy = (numerator/denominator) * 100
    #
    numerator = cm_CatBoost_model[1][1] #True Posotive
    denominator =  cm_CatBoost_model[1][1] +  cm_CatBoost_model[0][1] # True Positive + False Positive
    prec = (numerator/denominator) * 100
    #
    numerator = cm_CatBoost_model[1][1] #True Posotive
    denominator =  cm_CatBoost_model[1][1] +  cm_CatBoost_model[1][0] # True Positive + False Negative
    rec = (numerator/denominator) * 100
    #
    F_value = 2*prec*rec/(prec+rec) # we can add a little bit of magic right here and the best n and k would change
    F_value_list.append(F_value)
    #
    print (n,k,accuracy,F_value, prec,rec)
    
best_n = n_list[F_value_list.index(max(F_value_list))]
best_k = k_list[F_value_list.index(max(F_value_list))]
best_conf_mat = conf_mat[F_value_list.index(max(F_value_list))]
print (best_n, best_k)
print (best_conf_mat)

30 70 86.617872508778 77.53238126868149 72.90755777638975 82.78368794326241
35 65 87.47836407694972 78.08930425752855 76.26774847870182 80.0
40 60 87.64156075367192 77.66955589312842 78.29219960367502 77.05673758865248
45 55 87.7355224766332 77.01149425287355 80.69153069153069 73.65248226950355
50 50 87.84432026111469 76.41527537900595 83.27059807611877 70.60283687943263
55 45 87.27065921566688 74.38805970149255 84.76190476190476 66.27659574468086
35 65
[[13177  1404]
 [ 1128  4512]]


Overall, all models showed good result (accuracy ≈0.86-0.88)