In [37]:
import gc
import os
import re
import warnings

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline

# Silence scikit-learn's ConvergenceWarning for the whole session.
# NOTE(review): this also hides a solver that stops before converging -
# prefer raising max_iter on the estimator over muting the warning.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

#### About Dataset

In [2]:
# Location of the IMDB reviews CSV. Set the IMDB_DATASET_PATH environment
# variable to run the notebook on another machine; the fallback is the
# original author's local path, kept so existing runs behave as before.
DATA_PATH = os.environ.get(
    'IMDB_DATASET_PATH',
    r'C:\Users\1NR_Operator_33\Desktop\imdb_dataset.csv',
)

df = pd.read_csv(DATA_PATH)
df.sample(10)  # Unseeded preview: the displayed rows differ between runs

Unnamed: 0,review,sentiment
29856,I saw the film in its original theatrical rele...,positive
3583,This film resembles in many ways `Enemy of the...,positive
23541,Totally forgettable movie but an unbelievable ...,negative
43628,"One of the best ""Amitabh comeback"" movies I li...",positive
19740,"I have seen films come and go in my years,and ...",positive
47578,"Sure, this movie is sappy and sweet and full o...",positive
890,Salva and his pal Bigardo have been at the mar...,positive
17929,I got this for my birthday in a box set under ...,negative
20763,I guess my husband and I are a little slow. We...,positive
2012,"This is a pleasant film, even if the premise i...",positive


In [3]:
# Summary of the raw frame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
# Report the dataset dimensions and how many distinct sentiment classes it has
print(f'Размерность датасета: {df.shape}')
class_names = df["sentiment"].unique()
print(f'Кол-во классов: {len(class_names)}')

Размерность датасета: (50000, 2)
Кол-во классов: 2


In [5]:
# Label prep: encode the sentiment strings as integers and rename the columns
# to the generic text/label pair used by the rest of the notebook.
LABEL_MAP = {'negative': 0, 'positive': 1}

# Series.map turns every value missing from the mapping into NaN without
# complaint, so reject unexpected labels up front with a readable error.
unexpected_labels = set(df['sentiment'].unique()) - set(LABEL_MAP)
if unexpected_labels:
    raise ValueError(
        f'Unexpected sentiment labels: {sorted(map(str, unexpected_labels))}'
    )

df['sentiment'] = df['sentiment'].map(LABEL_MAP)
df = df.rename(columns={'review': 'text', 'sentiment': 'label'})
df

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [6]:
# Cast to compact, explicit dtypes: pandas string dtype for the review text
# and an 8-bit integer for the binary label.
COLUMN_DTYPES = {'text': 'string', 'label': 'int8'}
df = df.astype(COLUMN_DTYPES)
df.dtypes

text     string[python]
label              int8
dtype: object

#### EDA

In [117]:
# TODO: class imbalance check (distribution of the `label` values)

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [None]:
# TODO: histogram of the review text length distribution

#### Text preprocessing

In [7]:
def symbols_remove(input_text: str) -> str:
    """Reduce a raw review to lower-case words separated by single spaces.

    Steps, in order: replace HTML line-break tags with a space, drop
    http(s) links, delete punctuation, drop every token that contains a
    digit, collapse whitespace and lower-case the result. Meant to be run
    over the whole corpus with ``Series.apply``.

    Args:
        input_text: Raw review text.

    Returns:
        The cleaned text; an empty string if nothing but noise was present.
    """
    # Line-break tags must go before punctuation is stripped, otherwise
    # "<br />" degrades into a literal "br" token in the vocabulary.
    without_breaks = re.sub(r'<br\s*/?\s*>', ' ', input_text, flags=re.IGNORECASE)

    without_links = re.sub(r'https?://\S+', '', without_breaks)  # Remove links

    # Delete everything that is neither whitespace nor a word character.
    # Apostrophes vanish here, so "don't" becomes "dont".
    without_punct = re.sub(r'[^\s\w]', '', without_links)

    words_only = re.sub(r'\w*\d+\w*', ' ', without_punct)  # Drop tokens containing digits

    # Collapsing on \s+ also removes newlines, so no separate "\n" pass is needed.
    return re.sub(r'\s+', ' ', words_only).strip().lower()

In [8]:
stemmer = SnowballStemmer('english')              # Stemmer instance shared by all calls
STOP_WORDS_ENGLISH = stopwords.words('english')   # NLTK English stop-word list
_STOP_WORDS_SET = frozenset(STOP_WORDS_ENGLISH)   # Same words, hashed for O(1) membership checks


def text_preprocessing(input_text: str) -> str:
    """Tokenize, drop stop words, stem, and join the tokens back into a string.

    Meant to be run over the whole corpus with ``Series.apply``; the result is
    a plain ``str`` again so it can be passed straight to a vectorizer.
    No spelling correction is performed.

    Args:
        input_text: Text to process.

    Returns:
        Space-separated stems of the tokens that are not stop words.
    """
    # NOTE(review): if the cleaning step run before this one strips
    # apostrophes, contracted stop words (e.g. "doesn't") arrive as "doesnt"
    # and never match the list - confirm whether that is intended.
    tokens = word_tokenize(input_text)

    # Filter stop words and stem in a single pass over the tokens.
    stems = [stemmer.stem(token) for token in tokens if token not in _STOP_WORDS_SET]

    return ' '.join(stems)

In [9]:
# Run both cleaning stages over every review: regex clean-up first, then
# tokenization / stop-word removal / stemming (each result is a plain str).
df['text'] = (
    df['text']
    .apply(symbols_remove)
    .apply(text_preprocessing)
)
gc.collect()

Unnamed: 0,text,label
48936,inspir least littl ivi benson girl orchestra p...,1
25373,hold bar cruel offens humor sure enough offend...,0
39850,excel perform still good actor around also gre...,1
14577,deepa tri brave bring subject one want talk st...,0
45135,video fantast testament insight work bill hick...,1


In [10]:
df.sample(10)  # Spot-check 10 random preprocessed rows (no seed is passed, so the rows differ per run)

Unnamed: 0,text,label
28357,intellig humor excel perform cant believ peopl...,1
1580,french either make promarxist film antimarxist...,0
5099,saw movi comedi central time movi pretti good ...,1
30566,imagin clich ridden bmovi horror plot add plot...,0
8392,your one peopl doesnt realli like scifi someti...,0
18013,pleas wast six hour life watch fact good refle...,0
26680,oh yes agre other describ appal act four hour ...,0
38096,wonder bbc televis product movi other written ...,1
43821,prais _atlantis_the_lost_empire_ disney advent...,0
13028,realli good flick awesom humor jim verney know...,1


In [16]:
# Hold out 20% of the reviews for the final evaluation.
# RANDOM_STATE is the single seed for every shuffled split in the notebook.
RANDOM_STATE = 50
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

#### Model

In [48]:
# TF-IDF features feeding a logistic-regression classifier. The step names
# are what the grid search uses to address parameters (`vectorizer__*`, `lr__*`).
# max_iter is raised above scikit-learn's default of 100: this notebook
# silences ConvergenceWarning, so a solver that stops early would otherwise
# go unnoticed and the model would be evaluated unconverged.
text_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000)),
])

In [49]:
# Hyper-parameter grid: unigrams vs. unigrams + bigrams for the vectorizer,
# and several values of C for the logistic regression.
parametrs = dict(
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    lr__C=[5, 15, 50, 100],
)

# Shuffled 5-fold splitter used for cross-validation inside the grid search
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Exhaustive search over the grid; each candidate is scored by F1 on the folds
grid_search = GridSearchCV(
    estimator=text_pipeline,
    param_grid=parametrs,
    scoring='f1',
    cv=kf,
    verbose=1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
# Mean cross-validated score of the best candidate and the parameters that
# produced it. Fitted attributes carry a trailing underscore: `best_score_`
# (plain `best_score` raises AttributeError).
print(f'GRID SEARCH BEST SCORE: {grid_search.best_score_}')
print(f'GRID SEARCH BEST PARAMS: {grid_search.best_params_}')

In [None]:
# Best pipeline found by the search. The GridSearchCV above is built without
# a `refit` argument, so the default (refit=True) applies: best_estimator_ has
# already been re-trained on the full training split and a second
# fit(X_train, y_train) would only repeat that work.
best_pipeline = grid_search.best_estimator_
best_pipeline

In [None]:
# Evaluate the best model on the held-out test split.
# Pipeline.score reports the classifier's default metric (accuracy), while the
# search was optimised for F1; GridSearchCV.score applies the `scoring` it was
# configured with, so both numbers are shown to keep CV and test comparable.
test_metrics = {
    'accuracy': best_pipeline.score(X_test, y_test),
    'f1': grid_search.score(X_test, y_test),
}
test_metrics

In [None]:
# TODO: confusion matrix for the test-set predictions