In [2]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline 

from nltk.stem.porter import PorterStemmer 
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

import scipy

import warnings
warnings.filterwarnings("ignore")

In [3]:
def no_stop_words(text): 
    words = [porter.stem(word) for word in text.split()] 
    nostop = [word for word in words if word not in stop]
    return " ".join(nostop)

In [4]:
import spacy

# Загрузка русской модели
nlp = spacy.load("ru_core_news_sm")

text = "Машины едут быстрее, чем другие машины."
doc = nlp(text)
lemmatized_words = [token.lemma_ for token in doc]
print(" ".join(lemmatized_words))

машина ехать быстрый , чем другие машина .


In [None]:
def lemm(text:str):
    doc = nlp(text)
    lemmatized_words = [token.lemma_ for token in doc]
    return " ".join(lemmatized_words)

In [None]:
stop = stopwords.words('russian')
porter = PorterStemmer() 

#### Датасет взят c kaggle https://www.kaggle.com/datasets/maximsuvorov/rutweetcorp
#### Содержит 200 тыс сообщений из твиттера на на русском языке

#### Импорт данных

In [5]:
df_negative = pd.read_csv('negative.csv')
df_positive = pd.read_csv('positive.csv')

In [None]:
print(df_negative.shape)
print(df_positive.shape)

In [None]:
df_negative.columns

In [None]:
df_negative

#### Объединим негативные и положительные сообщения

In [6]:
df = pd.concat([df_negative[['ttext','ttype']],df_positive[['ttext','ttype']]])
df = df.sample(frac=1)
df.index = range(1,df.shape[0]+1)
df['ttype'] = df.loc[:,'ttype'].apply(lambda t: 0 if t == -1 else 1)

In [None]:
df['ttype'].info()

In [None]:
#df['ttext'].to_csv('mydata.csv')

In [None]:
df['ttype'].value_counts()

In [7]:
def preprocess(text:str) -> str :
    #text = re.sub(r"http:\S*","",text)
    emoticons = re.findall(r"[XХ:=][3зЗD()]+", text)
    emoticons += re.findall(r"[0оОoO]_[0оОoO]", text)
    text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)
    text = re.sub(r"[XХ:][3ЗD()]+","", text)
    text = re.sub(r"[\n\r.,]"," ",text)
    text = re.sub(r"[():!;?\"|]*","",text)
    text = re.sub(r"[#@][\S]*","",text)
    text = re.sub(r"RT","",text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r" {1,}", " ",text).strip()
    return str.lower(text) +" "+" ".join(emoticons)

In [None]:
def emotions(text:str):
    emoticons = re.findall(r"[XХ:=][3зЗD()]+", text)
    emoticons += re.findall(r"[0оОoO]_[0оОoO]", text)
    return emoticons

#### На данный момент попытки добавить новые признаки, привело к тому что модель стала смотреть только на них, что ухудшило её качество. Займусь ими позже

In [21]:
arr = np.array(df['ttext'].iloc[0:100])

In [24]:
arr[23]

'аааа я еще не влезала в ленту ну ты просто более чем оригинален Х) :D'

In [8]:
df['exclamations'] = df['ttext'].apply(lambda l: len(re.findall(r"!",l)) > 0)
df['sad_bracket'] = df['ttext'].apply(lambda l: len(re.findall(r"\(",l)) > 0)
df['happy_bracket'] = df['ttext'].apply(lambda l: len(re.findall(r"\)",l))>0)
df['upper_symbols'] = df['ttext'].apply(lambda l: len(list(filter(str.isupper,l)))>0)

In [9]:
df['ttext'] = df['ttext'].apply(preprocess)
#df['ttext'] = df['ttext'].apply(no_stop_words)

In [None]:
from scipy.stats import uniform

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df[['ttext','exclamations','sad_bracket','happy_bracket','upper_symbols']], df['ttype'], test_size=0.33, random_state=42)

In [None]:
def random_search_log_reg(X_train,y_train):
    cv = TfidfVectorizer(ngram_range=(1,2))
    sparse_train = cv.fit_transform(X_train['ttext'])
    #sparse_tr = scipy.sparse.csr_matrix(X_train[['exclamations','sad_bracket','happy_bracket','upper_symbols']])
    #sparse_train = scipy.sparse.hstack([sparse_train,sparse_tr])
    
    param_grid_logit = {"C":[0.1,0.001,10,100,500],'penalty':['l1','l2']}
    logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,random_state=0)
    distributions = dict(C=uniform(loc=0, scale=4),penalty=['l2', 'l1'])
    clf = RandomizedSearchCV(logistic, distributions, random_state=0,n_jobs=-1)
    clf.fit(sparse_train,y_train)
    return clf,cv

In [11]:
def log_reg(X_train,y_train):
    cv = TfidfVectorizer(ngram_range=(1,2))
    sparse_train = cv.fit_transform(X_train['ttext'])
    #sparse_tr = scipy.sparse.csr_matrix(X_train[['exclamations','sad_bracket','happy_bracket','upper_symbols']])
    #sparse_train = scipy.sparse.hstack([sparse_train,sparse_tr])
    lg = LogisticRegression(solver='saga',random_state=0,n_jobs=-1)
    lg.fit(sparse_train,y_train)
    return lg,cv

In [12]:
clf,cv = log_reg(X_train,y_train)

In [17]:
def roc_auc_test(X_test,y_test,model,cv):
    data_clean = prepare_data(X_test,cv)
    prediction = model.predict_proba(data_clean)[:,1]
    return roc_auc_score(y_test,prediction)

In [18]:
roc_auc_test(X_test,y_test,clf,cv)

0.8380411410635925

In [None]:
words_importance(clf,cv)

In [None]:
clf,cv = random_search_log_reg(X_train,y_train)

In [None]:
print("best params",clf.best_params_)
print("best score",clf.best_score_)
md = clf.best_estimator_

In [16]:
def prepare_data(df,cv):
    #df['exclamations'] = df['ttext'].apply(lambda l: len(re.findall(r"!",l)) > 0)
    #df['sad_bracket'] = df['ttext'].apply(lambda l: len(re.findall(r"\(",l)) > 0)
    #df['happy_bracket'] = df['ttext'].apply(lambda l: len(re.findall(r"\)",l))>0)
    #df['upper_symbols'] = df['ttext'].apply(lambda l: len(list(filter(str.isupper,l)))>0)
   # df['ttext'] = df['ttext'].apply(preprocess)
    sparse_train = cv.transform(df['ttext'])
    #sparse_tr = scipy.sparse.csr_matrix(df[['exclamations','sad_bracket','happy_bracket','upper_symbols']])
    #sparse_train = scipy.sparse.hstack([sparse_train,sparse_tr])
    return sparse_train

In [None]:
grid_search = GridSearchCV(LogisticRegression(n_jobs=-1),param_grid=param_grid_logit,cv=10)

In [None]:
grid_search.fit(sparse_train,df['ttype'])

In [None]:
len(*clf.coef_)

In [None]:
print(clf.coef_[0][-2])

In [None]:
len(cv.get_feature_names_out())

In [19]:
def words_importance(model,cv):
    words = cv.get_feature_names_out()
    coefs = model.coef_[0]
    z = list(zip(words,coefs))
    z = sorted(z,key=lambda l:l[1],reverse=True)
    for el in z[:20]:
        print(el)
    print("----------------")
    for el in z[-20:-1]:
        print(el)
    return z

In [None]:
w_i = words_importance(lg,cv)

In [None]:
w_i

In [None]:
most_positive = z[:100]
most_negative = z[-100:-1]

In [None]:
from wordcloud import WordCloud

In [None]:
wordcloud_positive = WordCloud(background_color="black",
                               colormap = 'Blues',
                               max_words=200,
                               mask=None, 
                               width=1600,
                               height=1600).generate_from_frequencies(dict(most_positive))

wordcloud_negative = WordCloud(background_color="black",
                               colormap = 'Oranges',
                               max_words=200,
                               mask=None, 
                               width=1600,
                               height=1600).generate_from_frequencies(dict(most_negative))

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (20, 12))


ax[0].imshow(wordcloud_positive, interpolation='bilinear')
ax[1].imshow(wordcloud_negative, interpolation='bilinear')

ax[0].set_title('Positive',
               fontsize = 20
               )
ax[1].set_title('Negative',
               fontsize = 20
               )

ax[0].axis("off")
ax[1].axis("off")

plt.show()

#### В сообщениях было много грамматических ошибок. На данный момент видно, что признаки были отобраны не совсем удачно. Лемматизацию и стэмминг пока что не применял, со стэммингом вроде еще хуже

#### Сохраним параметры нашей модели. Хочу сделать небольшой прод в виде тг бота

In [None]:
import pickle

In [None]:
with open('model2.pkl', 'wb') as file:
    pickle.dump(lg, file)

In [None]:
with open('tfidf_vectorizer2.pkl', 'wb') as file:
    pickle.dump(cv, file)

In [None]:
nb.fit(sparse_train,df['ttype'])

In [None]:
nb.pr