In [1]:
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from numpy import mean
import numpy as np 
import pandas as pd
import re

In [2]:
train_sentiment  = pd.read_csv('D:\Datasets\Sentiment Analysis//train.tsv', sep="\t") 

Для каждой фразы в наборе тестов спрогнозируйте метку для настроения. <br>
0 - negative <br>
1 - somewhat negative <br>
2 - neutral <br>
3 - somewhat positive <br>
4 - positive <br>
classification: accuracy = 0.6

## Изучение данных

In [3]:
train_sentiment.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
train_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [5]:
train_sentiment.Sentiment.value_counts(normalize=True) * 100

2    50.994489
3    21.098936
1    17.475971
4     5.899013
0     4.531590
Name: Sentiment, dtype: float64

## Подготовка данных

In [6]:
train_sentiment = train_sentiment.sample(frac=1) # перемешиваем данные

In [7]:
train_sentiment.shape

(156060, 4)

In [8]:
train_sentiment.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
116572,116573,6216,worth the trip to the theatre,4
144514,144515,7846,the CleanFlicks version,2
40268,40269,1929,With a large cast representing a broad cross-s...,2
33742,33743,1586,wrapped up in his own idiosyncratic strain of ...,2
108353,108354,5733,Bow Wow fans,2


In [9]:
def process_message(message, lower_case = True, stem = True, stop_words = True):
    if lower_case:
        message = message.lower()
    words = word_tokenize(message)
    if stop_words:
        sw = stopwords.words('english')
        words = [word for word in words if word not in sw]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]   
    return words

train_sentiment['Phrase'] = train_sentiment['Phrase'].apply(lambda x: ' '.join(process_message(x)))

In [10]:
# после обработки остались пустые строки, лучше удалить
train_sentiment['Phrase'].replace('', np.nan, inplace=True)
train_sentiment.dropna(subset = ['Phrase'], inplace=True)

In [11]:
train_sentiment.shape

(155250, 4)

In [12]:
train_sentiment.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
116572,116573,6216,worth trip theatr,4
144514,144515,7846,cleanflick version,2
40268,40269,1929,larg cast repres broad cross-sect,2
33742,33743,1586,wrap idiosyncrat strain kitschi goodwil,2
108353,108354,5733,bow wow fan,2


In [13]:
X =  train_sentiment.Phrase
y = train_sentiment.Sentiment

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

## Кодировка данных

In [15]:
TV = TfidfVectorizer(ngram_range = (1,2), max_df = 0.95, min_df = 10,sublinear_tf = True) #  преобразование текстовых данных в числовую форму
TV_train = TV.fit_transform(X_train)
TV_test = TV.transform(X_test)

In [16]:
y_train = LabelEncoder().fit_transform(y_train)

In [17]:
smt = SMOTE() # используем, потому что дисбаланс классов
TV_train, y_train = smt.fit_resample(TV_train, y_train)

In [18]:
cv = KFold(n_splits=10, random_state=1, shuffle=True)

## Обучение модели

In [19]:
LR = LogisticRegression(penalty = 'l2', solver = 'sag').fit(TV_train, y_train)
predictions_LR = LR.predict(TV_test)

In [20]:
scores_LR = accuracy_score(y_test, predictions_LR)
scores_LR_cross = cross_val_score(LR,  TV_test, y_test, scoring = 'accuracy', cv = cv, n_jobs = -1)

print('Accuracy до кросс-валидации = {scores_LR}, после = {scores_LR_cross}.'.format(scores_LR = round(scores_LR, 2), scores_LR_cross = round(mean(scores_LR_cross),2)))

Accuracy до кросс-валидации = 0.52, после = 0.6.
