In [94]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
import json
import re
import nltk
import pickle
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
import operator
import nltk.data
from sklearn.model_selection import train_test_split
import math
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
%matplotlib inline

Open our data set

In [2]:
with open("data_raw.json") as f:
    data = json.loads(f.read())

New data set. Only negative samples

In [3]:
with open("negative.json") as n:
    negative_data = json.loads(n.read())

In [4]:
print(len(data))
print(len(negative_data))

160955
4647


In [5]:
data.extend(negative_data)

# 1 Cleaning data

From PyMystem import russian stopwords

In [117]:
m = Mystem()
stopWords = set(stopwords.words("russian"))

In [118]:
badwords = [
u'я', u'а', u'да', u'но', u'тебе', u'мне', u'ты', u'и', u'у', u'на', u'ща', u'ага',
u'так', u'там', u'какие', u'который', u'какая', u'туда', u'давай', u'короче', u'кажется', u'вообще',
u'ну', u'не', u'чет', u'неа', u'свои', u'наше', u'хотя', u'такое', u'например', u'кароч', u'как-то',
u'нам', u'хм', u'всем', u'нет', u'да', u'оно', u'своем', u'про', u'вы', u'м', u'тд',
u'вся', u'кто-то', u'что-то', u'вам', u'это', u'эта', u'эти', u'этот', u'прям', u'либо', u'как', u'мы',
u'просто', u'блин', u'очень', u'самые', u'твоем', u'ваша', u'кстати', u'вроде', u'типа', u'пока', u'ок',u'в'
,u'б',u'г',u'д',u'е',u'ж',u'з',u'й',u'к',u'л',u'ф',u'н',u'о',u'п',u'р',u'с',u'т',u'ч',u'ц',u'ч',u'ш',u'щ',u'ь'
,u'ъ',u'ы',u'э','ю']

In [119]:
for word in badwords:
    stopWords.add(word)

Text lemmatization

In [120]:
def lemma(data):
    lemmas = m.lemmatize(data)
    return ''.join(lemmas)

Remove from text stop words but we don't remove negations "не" and "ни" for future concatination it

In [121]:
def stop_remove(data, negation=False):
    for word in stopWords:
        if((word == u'не' or word == u"ни") and negation == False):
            continue
        if (word in data):
            for item in range(data.count(word)):
                data.remove(word)
    return data            

Cleaning function

In [122]:
def cleaning(data):
    cleaned_data = re.sub("[^а-яА-ЯЁё]"," ", data) # leave only russian text
    cleaned_data = lemma(cleaned_data)
    cleaned_data = cleaned_data.lower().split()
    cleaned_data = stop_remove(cleaned_data)
    return cleaned_data

Concatinate negations with words like "не хотеть" to "нехотеть"

In [123]:
def concat(data):
    check = False
    for i in range (0, len(data)-1):
        if(data[i] == u'не' or data[i] == u'ни'):
            data[i+1]=(data[i]+data[i+1])
    return data        

Count our data set

In [18]:
neutral = 0
positive = 0
negative = 0

for item in data:
    if item['manual_sentiment'] == 'neutral':
        neutral+=1
    elif item['manual_sentiment'] == 'positive':
        positive+=1
    else:
        negative+=1
        
print("Neutral", neutral)
print("Positive", positive)
print("Negative", negative)
print("All sentiment", neutral+positive+negative)

Neutral 87341
Positive 60761
Negative 17500
All sentiment 165602


Cleansing 

In [19]:
for i in range (0,len(data)):
    data[i]["text"] = cleaning(data[i]["text"])
    data[i]["text"] = concat(data[i]["text"])
    data[i]["text"] = stop_remove(data[i]["text"],True)

Remove duplicated posts. The reason that I remove all duplicates after cleaning is some cases when one text is the same like another but with few stopwords added. So I decided to clean and then remove duplicates. After this step I solved this problem with duplicates.

In [20]:
# deleting duplicates 
new_list = list()
cnt = 0
for i in data:
    if (i not in new_list):
        new_list.append(i)
    else:
        cnt+=1
print ("Duplicates",cnt)

#deleting duplicates with the same text but with different sentiment
new_list2 = list()
cnt = 0
for i in new_list:
    check = False
    for j in new_list:
        if (i["text"] == j["text"]) and ( i["manual_sentiment"] != j["manual_sentiment"] ):
            check = True
            cnt+=1
    if check == False:
        new_list2.append(i)
print("Duplicates with different sentiment",cnt)
new_list.clear()

Duplicates 47815
Duplicates with different sentiment 20668


In [21]:
# divide our data into text and sentiment (where positive = 1, neutral = 0, negative = -1)
text = []
sentiment = []
neutral = 0
positive = 0
negative = 0

for item in new_list2:
    text.append(item['text'])
    if item['manual_sentiment'] == 'neutral':
        sentiment.append(0)
        neutral+=1
    elif item['manual_sentiment'] == 'positive':
        sentiment.append(1)
        positive+=1
    else:
        sentiment.append(-1)
        negative+=1
        
print("Neutral", neutral)
print("Positive", positive)
print("Negative", negative)
print("All sentiment", neutral+positive+negative)

Neutral 54348
Positive 37136
Negative 10583
All sentiment 102067


Upload our cleaned data to json file.

In [23]:
with open ("data_prepared.json", "w", encoding='utf-8') as outfile:
    outfile.write('[')
    for i in range(0, len(new_list2)):
        json.dump(new_list2[i], outfile, ensure_ascii=False)
        if i+1 < len(new_list2):
            outfile.write(',')
        outfile.write('\n')    
    outfile.write(']')

# 2 Vectorization. 2.1 Bag of words model. 

In [20]:
word_index = dict()
#check appearence of the text then build dictionary
for sentence in new_list2:
    for word in sentence["text"]:
        if word not in word_index:
            word_index[word] = 0
        else:
            word_index[word]+=1
#sort our dictionary in descending order by its appearence             
sorted_words = sorted(word_index.items(), key=operator.itemgetter(1), reverse=True)        

#build matrices with 5000 features(dimensions)
matrices = np.zeros(shape=(len(new_list2),5000))
vocabulary = dict()

for i in range(5000):
    vocabulary[sorted_words[i][0]] = i

cnt = 0
i = 0

for sentence in new_list2:
    for word in sentence["text"]:
        if word in vocabulary:
            if matrices[i,vocabulary[word]] == 0:
                matrices[i,vocabulary[word]] = 1
            else:
                matrices[i,vocabulary[word]]+=1
        cnt+=1
    i+=1   

In [97]:
len(matrices)

102069

In [98]:
result = list()
sentiment = 0
for i in range(len(matrices)):
    if new_list2[i]["manual_sentiment"] == "neutral":
        sentiment = 0
    elif new_list2[i]["manual_sentiment"] == "negative":
        sentiment = -1
    else:
        sentiment = 1  
    result.append((matrices[i],sentiment))

In [99]:
print(result[0])

(array([ 0.,  5.,  0., ...,  0.,  0.,  0.]), 0)


In [22]:
with open('matrices.pkl', 'wb') as f:
    pickle.dump(result,f)

# 2 Vectorization. 2.2 Word2Vec algorithm

In [54]:
text = []
sentiment = []
neutral = 0
positive = 0
for item in new_list2:
    if item["manual_sentiment"] == "neutral" and neutral < 30000:
        text.append(item["text"])
        sentiment.append(0)
        neutral+=1
    elif item["manual_sentiment"] == "positive" and positive < 25000:
        text.append(item["text"])
        sentiment.append(1)
        positive+=1
    elif item["manual_sentiment"] == "negative":
        text.append(item["text"])
        sentiment.append(-1)

In [55]:
num_features = 50
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

from gensim.models import word2vec
model = word2vec.Word2Vec(text,workers=num_workers, size=num_features,\
                          min_count = min_word_count,window=context, sample=downsampling)

In [56]:
model.init_sims(replace = True)

In [57]:
model_name = "SVC_word2vec"

In [58]:
model.save(model_name)

Word2Vec shows quite good results with word similarities

In [59]:
model.most_similar('астана')

[('алматы', 0.7707394361495972),
 ('астан', 0.7488085031509399),
 ('столица', 0.7461706399917603),
 ('столичный', 0.7252397537231445),
 ('экспо', 0.7173423171043396),
 ('универсиада', 0.6010549068450928),
 ('специализированный', 0.597379207611084),
 ('выставка', 0.5972445607185364),
 ('авиарейс', 0.5960143208503723),
 ('город', 0.5849112272262573)]

Function to average all of the word vectors in a given paragraph

In [60]:
def makeFeatureVec(words,model,num_features):
    featureVec = np.zeros((num_features,), dtype='float32')
    n = 0
    index2word_set = set(model.wv.index2word)
    
    for word in words:
        if word in index2word_set:
            n+=1
            featureVec = np.add(featureVec,model[word])
    featureVec = np.divide(featureVec,n)
    return featureVec

Function that calculates the average feature vector for each one and return a 2D numpy array

In [61]:
def getAverageVec(posts, model,num_features):
    cnt = 0
    reviewFeatureVecs = np.zeros((len(posts), num_features), dtype='float32')
    
    for sentence in posts:
        reviewFeatureVecs[cnt] = makeFeatureVec(sentence,model,num_features)
        cnt+=1
    return reviewFeatureVecs    

In [62]:
DataVecs = getAverageVec(text,model,num_features)

  # Remove the CWD from sys.path while we load stuff.


Check for missing values, because it can greatly affect to our accuracy

In [63]:
np.isnan(DataVecs).any()

True

In [64]:
np.count_nonzero(np.isnan(DataVecs))

500

Change empty values to numbers

In [65]:
DataVecs = np.nan_to_num(DataVecs)

In [66]:
np.isnan(DataVecs).any()

False

# 3 Building and Investigating models

## 3.1 Support vector classifier with bag of words model

The reason that I limit my training set was huge amount of neutral texts. Then I think it "overfitted" on neutral examples, that decreased my accuracy

In [135]:
X = []
y = []
neutral = 0
positive = 0
for i in range (len(result)):
    if(result[i][1] == "neutral" and neutral < 30000):
        X.append(result[i][0])
        y.append(result[i][1])
    elif(result[i][1] == "positive" and positive < 25000):
        X.append(result[i][0])
        y.append(result[i][1])
    elif result[i][1] == "negative":
        X.append(result[i][0])
        y.append(result[i][1])    

NameError: name 'result' is not defined

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state=42)

In [None]:
SVC_BoW = SVC(decision_function_shape = "ovr", kernel='linear', gamma=0.1, C=1)

In [None]:
SVC_BoW.fit(X_train,y_train)

In [None]:
predicted = SVC_BoW.predict(X_test)
print('Accuracy:', SVC_BoW.score(X_test, y_test))
print('Metrics:', metrics.classification_report(predicted,y_test))

#### This model illustrated us pretty good model, but we need robust and complex model

## 3.2 Support vector classifier with Word2Vec algorithms

In [67]:
len(DataVecs)

65583

In [68]:
#for testing
X_train, X_test, y_train, y_test = train_test_split(DataVecs, sentiment, test_size = 0.2, random_state = 42)
SVC_W2V = SVC(decision_function_shape = "ovr", kernel='linear', gamma=0.1, C=1)
SVC_W2V.fit(X_train,y_train)

Pretty good results, before we had 30% of precision value on negative sets.

In [71]:
predicted = SVC_W2V.predict(X_test)
print("Accuracy", SVC_W2V.score(X_test,y_test))
print("Metrics", metrics.classification_report(predicted,y_test))

Accuracy 0.691621559808
Metrics              precision    recall  f1-score   support

         -1       0.51      0.71      0.60      1544
          0       0.71      0.67      0.69      6332
          1       0.75      0.71      0.73      5241

avg / total       0.70      0.69      0.69     13117



Let's brute force our parameters and train them on whole training set

In [109]:
# for training on 100% samples then we upload new testing set on 4th step
param_grid = [
  {'C': [1, 10, 100], 'kernel': ['linear']},
  {'C': [1, 10, 100], 'gamma': [0.001, 0.01,0.1], 'kernel': ['rbf']},
]
SVC_W2V = GridSearchCV(SVC(), param_grid, cv = 5)
SVC_W2V.fit(DataVecs,sentiment)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100], 'kernel': ['linear']}, {'C': [1, 10, 100], 'gamma': [0.001, 0.01, 0.1], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [110]:
print(SVC_W2V.best_estimator_)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [132]:
SVC_W2V_best = SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [133]:
SVC_W2V_best.fit(DataVecs,sentiment)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### We  see here much better results, by increasing precision value of negative samples from 30% to 51 %, even this is just randomly choosen parameters.

# 4 Uploading new testing set and make the same cleaning process

### to be sure that our model is robust we uploaded new testing data

In [114]:
with open("data_test.json") as f:
    test = json.loads(f.read())

In [115]:
positive = 0
negative = 0
neutral = 0
for item in test:
    if item["manual_sentiment"] == "neutral":
        neutral+=1
    if item["manual_sentiment"] == "positive":
        positive+=1
    if item["manual_sentiment"] == "negative":
        negative+=1 
print("Negative", negative)
print("Positive", positive)
print("Neutral", neutral)
print("Total", negative+positive+neutral)

Negative 1093
Positive 2580
Neutral 9068
Total 12741


In [124]:
for i in range (0,len(test)):
    test[i]["text"] = cleaning(test[i]["text"])
    test[i]["text"] = concat(test[i]["text"])
    test[i]["text"] = stop_remove(test[i]["text"],True)

In [125]:
# deleting duplicates 
test_list = list()
cnt = 0
for i in test:
    if (i not in test_list):
        test_list.append(i)
    else:
        cnt+=1
print ("Duplicates",cnt)

#deleting duplicates with the same text but with different sentiment
test_list2 = list()
cnt = 0
for i in test_list:
    check = False
    for j in test_list:
        if (i["text"] == j["text"]) and ( i["manual_sentiment"] != j["manual_sentiment"] ):
            check = True
            cnt+=1
    if check == False:
        test_list2.append(i)
print("Duplicates with different sentiment",cnt)
test_list.clear()

Duplicates 4623
Duplicates with different sentiment 1416


In [126]:
# divide our data into text and sentiment (where positive = 1, neutral = 0, negative = -1)
test_text = []
test_sentiment = []
neutral = 0
positive = 0
negative = 0

for item in test_list2:
    test_text.append(item['text'])
    if item['manual_sentiment'] == 'neutral':
        test_sentiment.append(0)
        neutral+=1
    elif item['manual_sentiment'] == 'positive':
        test_sentiment.append(1)
        positive+=1
    else:
        test_sentiment.append(-1)
        negative+=1
        
print("Neutral", neutral)
print("Positive", positive)
print("Negative", negative)
print("All sentiment", neutral+positive+negative)

Neutral 5052
Positive 1179
Negative 513
All sentiment 6744


In [127]:
TestDataVecs = getAverageVec(test_text,model,num_features)

In [134]:
predicted = SVC_W2V_best.predict(TestDataVecs)
print("Accuracy", SVC_W2V.score(TestDataVecs,test_sentiment))
print("Metrics", metrics.classification_report(predicted,test_sentiment))

Accuracy 0.68371886121
Metrics              precision    recall  f1-score   support

         -1       0.53      0.27      0.36       999
          0       0.70      0.86      0.77      4109
          1       0.68      0.49      0.57      1636

avg / total       0.67      0.68      0.66      6744



#### In conclusion, we see that support vector machine with word2vec model works fine, even there not so many negative samples. I need to find more training samples, especially negative samples.