# Libraries

In [2]:
import ast
import json
import operator
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from gensim.models import word2vec
from joblib import dump, load
from pymystem3 import Mystem
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Helper functions

In [3]:
def makeFeatureVec(words,model,num_features):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document. A plain string is iterated character by
        character, so pass a token list rather than raw or stringified text.
    model : trained word2vec model
        Only ``model.wv`` is used; it must support ``token in model.wv`` and
        ``model.wv[token]``.
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)``: the mean of the vectors of all
        tokens found in the vocabulary, or all zeros when none is found
        (instead of the NaN vector a 0/0 division would produce).
    """
    # Membership and lookup go through the keyed vectors directly; this avoids
    # rebuilding a set of the whole vocabulary for every document and the
    # deprecated `model[word]` access.
    keyed_vectors = model.wv
    featureVec = np.zeros((num_features,), dtype='float32')
    n = 0

    for word in words:
        if word in keyed_vectors:
            n += 1
            featureVec = np.add(featureVec, keyed_vectors[word])

    if n == 0:
        # No known token: keep the zero vector rather than dividing by zero.
        return featureVec
    return np.divide(featureVec, n)

def getAverageVec(posts, model,num_features):
    """Build a matrix with one averaged embedding row per document.

    Each item of ``posts`` is passed to ``makeFeatureVec`` and the resulting
    vector is stored in the row with the same position. The result is a
    float32 array of shape ``(len(posts), num_features)``.
    """
    averaged = np.zeros((len(posts), num_features), dtype='float32')

    for row, tokens in enumerate(posts):
        averaged[row] = makeFeatureVec(tokens, model, num_features)

    return averaged

# Data loading

In [3]:
# Load the cleaned corpus from the working directory.
# NOTE(review): `cleaned_text` comes back from CSV as a string (the repr of a
# token list, see the preview below), not as a Python list.
df = pd.read_csv('clean_text.csv')

In [4]:
# Preview the first two rows to check the available columns.
df.head(2)

Unnamed: 0,text,label,cleaned_text
0,"Когда-то я был добрым романтиком, который стре...",1,"['добрый', 'романтик', 'стремиться', 'помогать..."
1,Здраствуйте! Я каждый день просыпаюсь с мыслью...,1,"['здраствовать', 'каждый', 'просыпаться', 'мыс..."


In [5]:
# (rows, columns) before de-duplication.
df.shape

(83790, 3)

In [6]:
# Keep the first occurrence of every distinct raw `text`.
df = df.drop_duplicates(subset='text')

In [7]:
# (rows, columns) after de-duplication.
df.shape

(81632, 3)

# Vectorization

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    test_size=0.2,
    random_state=4,
)

## TF-IDF

In [9]:
# Fit TF-IDF on the training split only: at most 5000 terms, each required to
# appear in at least 10 documents.
tfidf_transformer = TfidfVectorizer(min_df=10, max_features=5000)

# `fit` returns the vectorizer itself, so `X_train_tfidf_model` is just a
# second name for the fitted `tfidf_transformer`.
X_train_tfidf_model = tfidf_transformer.fit(X_train)

X_train_tfidf_vector = X_train_tfidf_model.transform(X_train)

In [10]:
# Persist the fitted vectorizer to disk with joblib.
dump(X_train_tfidf_model, 'tfidf.joblib')

['tfidf.joblib']

In [11]:
# Transform the held-out texts with the vectorizer fitted on the training split (no refit).
X_test_tfidf_vector = X_train_tfidf_model.transform(X_test)

In [12]:
# Save the training TF-IDF matrix and the training labels as pickles.
# NOTE(review): "tfiidf" in the file name looks like a typo; it is kept because
# other code may load the file under this exact name.
for path, payload in (
    ("train_tfiidf.pkl", X_train_tfidf_vector),
    ("train_label.pkl", y_train),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)

In [13]:
# Save the test TF-IDF matrix and the test labels as pickles.
# NOTE(review): "tfiidf" in the file name looks like a typo; it is kept because
# other code may load the file under this exact name.
for path, payload in (
    ("test_tfiidf.pkl", X_test_tfidf_vector),
    ("test_label.pkl", y_test),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)

## word2vec

In [14]:
num_features = 120
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

model = word2vec.Word2Vec(X_train,workers=num_workers, size=num_features,\
                          min_count = min_word_count,window=context, sample=downsampling)
model.init_sims(replace = True)
model_name = "word2vec"
model.save(model_name)

In [15]:
DataVecs_train = getAverageVec(X_train,model,num_features)
np.isnan(DataVecs_train).any()
np.count_nonzero(np.isnan(DataVecs_train))
DataVecs_train = np.nan_to_num(DataVecs_train)
np.isnan(DataVecs_train).any()

False

In [16]:
DataVecs_test = getAverageVec(X_test,model,num_features)
np.isnan(DataVecs_test).any()
np.count_nonzero(np.isnan(DataVecs_test))
DataVecs_test = np.nan_to_num(DataVecs_test)
np.isnan(DataVecs_test).any()

False

In [17]:
# Save the averaged word2vec feature matrices for both splits as pickles.
for path, payload in (
    ("train_word2vec.pkl", DataVecs_train),
    ("test_word2vec.pkl", DataVecs_test),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)