In [31]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import markovify

In [2]:
# Russian stop-word list from the NLTK stopwords corpus; it is handed to the
# TF-IDF vectorizer when the classification pipeline is built.
stopword_language = 'russian'
stopwords_rus = stopwords.words(stopword_language)

In [3]:
def load_labeled_tweets(path, label):
    """Read the tweet text column from a ';'-separated dump and tag every row.

    Parameters
    ----------
    path : str
        CSV file to read. Only the column at position 3 is loaded and it is
        exposed under the name ``text``.
    label : str
        Value written into the ``label`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Frame with two columns, ``text`` and ``label``.
    """
    tweets = pd.read_csv(path, sep=';', usecols=[3], names=['text'])
    # A scalar is broadcast to every row; no need to build a list of repeats.
    tweets['label'] = label
    return tweets


positive = load_labeled_tweets('positive.csv', 'positive')
negative = load_labeled_tweets('negative.csv', 'negative')

# DataFrame.append was deprecated and later removed from pandas, so use
# pd.concat. ignore_index=True gives the combined frame unique row labels
# instead of repeating 0..n-1 once per source file.
df = pd.concat([positive, negative], ignore_index=True)

In [4]:
# Average number of whitespace-separated tokens per tweet.
df['text'].map(lambda tweet: len(tweet.split())).mean()

12.26488092613982

In [5]:
# Number of non-null rows per sentiment label.
rows_by_label = df.groupby(by='label')
rows_by_label.count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
negative,111923
positive,114911


In [6]:
# Hold out part of the corpus for evaluation (train_test_split's default
# proportions are kept). The seed is fixed so that the split, and every score
# derived from it, is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    df.text,
    df.label,
    random_state=42,
)

In [16]:
%%time

pp = Pipeline([('vectorizer', TfidfVectorizer(ngram_range=(1, 2), tokenizer=TweetTokenizer().tokenize, stop_words=stopwords_rus)), 
               ('classifier', 
                LogisticRegression()
               )
              ])
pp.fit(X_train, y_train)



Wall time: 13.4 s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['и', 'в', 'во', 'не', 'что', 'он',
                                             'на', 'я', 'с', 'со', 'как'...
                                 tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x0000012B11A471D0>>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual

In [27]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall trade places and "support" counts predictions instead
# of true labels.
# NOTE(review): the stored output shows 1.00 for every metric on both splits;
# presumably tokens that encode the label survive tokenisation — TODO confirm.
print(classification_report(y_train, pp.predict(X_train)))
print(classification_report(y_test, pp.predict(X_test)))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     83894
    positive       1.00      1.00      1.00     86231

    accuracy                           1.00    170125
   macro avg       1.00      1.00      1.00    170125
weighted avg       1.00      1.00      1.00    170125

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     27938
    positive       1.00      1.00      1.00     28771

    accuracy                           1.00     56709
   macro avg       1.00      1.00      1.00     56709
weighted avg       1.00      1.00      1.00     56709



In [30]:
# Class probabilities for a throwaway string, passed twice; the two rows of
# the result should be identical.
probe_texts = ['asdf'] * 2
pp.predict_proba(probe_texts)

array([[0.52999439, 0.47000561],
       [0.52999439, 0.47000561]])

In [165]:
# Build the tokenizer once and reuse it for every tweet instead of
# constructing a new TweetTokenizer per row.
tweet_tokenizer = TweetTokenizer()

# One cleaned tweet per line: lower-case, tokenise, drop every token that
# contains 'http' or '@' as well as the tokens 'rt' and ':', then re-join the
# survivors with single spaces. The result is bound to `all_tweets` exactly
# once, as a string, rather than first as a Series and then as a string.
all_tweets = '\n'.join(
    ' '.join(
        token
        for token in tweet_tokenizer.tokenize(text.lower())
        if ('http' not in token) and ('@' not in token) and (token not in ['rt', ':'])
    )
    for text in df['text']
)

In [166]:
# Markov-chain text model over the whole corpus. NewlineText treats each line
# of its input as one sentence; state_size=2 is markovify's documented default,
# spelled out here for the reader.
all_model = markovify.NewlineText(all_tweets, state_size=2)

In [167]:
%%time

generated_all = []
for i in range(10000):
    generated_all.append(all_model.make_sentence())

Wall time: 27 s


In [168]:
# Build the tokenizer once and reuse it for every tweet instead of
# constructing a new TweetTokenizer per row.
tweet_tokenizer = TweetTokenizer()

# Same cleaning as for the full corpus, restricted to positive tweets:
# lower-case, tokenise, drop tokens containing 'http' or '@' and the tokens
# 'rt' and ':', then emit one cleaned tweet per line. A single .loc lookup
# replaces the chained df[mask]['text'] indexing.
positive_tweets = '\n'.join(
    ' '.join(
        token
        for token in tweet_tokenizer.tokenize(text.lower())
        if ('http' not in token) and ('@' not in token) and (token not in ['rt', ':'])
    )
    for text in df.loc[df['label'] == 'positive', 'text']
)

In [169]:
# Size of the newline-joined corpus string, in characters.
corpus_length = len(all_tweets)
corpus_length

16473696

In [170]:
# Markov-chain text model over the positive tweets only (one tweet per line);
# state_size=2 is markovify's documented default, spelled out for the reader.
positive_model = markovify.NewlineText(positive_tweets, state_size=2)

In [171]:
%%time

generated_positive = []
for i in range(10000):
    generated_positive.append(positive_model.make_sentence())

Wall time: 21.2 s


In [172]:
# Build the tokenizer once and reuse it for every tweet instead of
# constructing a new TweetTokenizer per row.
tweet_tokenizer = TweetTokenizer()

# Same cleaning as for the full corpus, restricted to negative tweets:
# lower-case, tokenise, drop tokens containing 'http' or '@' and the tokens
# 'rt' and ':', then emit one cleaned tweet per line. A single .loc lookup
# replaces the chained df[mask]['text'] indexing.
negative_tweets = '\n'.join(
    ' '.join(
        token
        for token in tweet_tokenizer.tokenize(text.lower())
        if ('http' not in token) and ('@' not in token) and (token not in ['rt', ':'])
    )
    for text in df.loc[df['label'] == 'negative', 'text']
)

In [173]:
# Markov-chain text model over the negative tweets only (one tweet per line);
# state_size=2 is markovify's documented default, spelled out for the reader.
negative_model = markovify.NewlineText(negative_tweets, state_size=2)

In [174]:
%%time

generated_negative = []
for i in range(10000):
    generated_negative.append(negative_model.make_sentence())

Wall time: 6.94 s


In [242]:
# Show a single fresh sample from the negative-only model.
negative_sample = negative_model.make_sentence()
negative_sample

'пятница 13 , а слог думает иначе'