In [109]:
import pandas as pd
import numpy as np
import re
import gc

from spellchecker import SpellChecker

from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#### About Dataset

In [100]:
df = pd.read_csv(r'C:\Users\1NR_Operator_33\Desktop\imdb_dataset.csv')
df.sample(10)

Unnamed: 0,review,sentiment
19569,David Zucker has directed one of the most enjo...,positive
20986,Who would have thought that a movie about a ma...,positive
38382,... because this is yet another dead one. Life...,negative
36209,"Okay, I can sit through almost any movie, and ...",negative
1817,"In many ways, the filmic career of independent...",positive
33218,"Well, this film is a difficult one really. To ...",positive
8084,"Before Cujo,there was Lucky the devil dog. In ...",positive
2610,This is another enjoyable and entertaining Hit...,positive
18775,I remember watching this movie on TV a few yea...,negative
17534,"In a lot of his films (Citizen Kane, Confident...",positive


In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [102]:
# Посмотрим кол-во классов
print(f'Размерность датасета: {df.shape}')
print(f'Кол-во классов: {len(df["sentiment"].unique())}')

Размерность датасета: (50000, 2)
Кол-во классов: 2


In [103]:
df['sentiment'] = df['sentiment'].map({'negative': 0,'positive': 1})  # Label prep
df = df.rename(columns={'review':'text', 'sentiment': 'label'})  
df

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [114]:
df = df.astype({'text': 'string', 'label': 'int8'})
df.dtypes

text     string[python]
label              int8
dtype: object

#### EDA

In [117]:
# Дисбаланс классов

Unnamed: 0,text,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


#### Text preprocessing

In [128]:
def symbols_remove(input_text: str) -> str:
    """Удаление всех символов, кроме слов.
       Функция запускается c apply для всего корпуса"""
    
    link_process = re.sub(r"http://\S+|https://\S+", "", input_text)  # Удаление ссылок

    garbage_process = re.sub(r'([^\s\w])', '', link_process)  # Удаление мусора (оставляем слова и цифры)

    word_process = re.sub(r'\w*\d+\w*', ' ', garbage_process)  # Оставляем только слова

    output_text = re.sub('\s+', ' ', word_process).strip().lower()  # Удаление пробелов, перевод регистра

    return re.sub(r'\n', '', output_text)

 
df['text'] = df['text'].apply(symbols_remove)      # RegEx preprocessing
gc.collect()

6780

In [129]:
spell = SpellChecker()                            # Для исправление орфографии 
stemmer = SnowballStemmer('english')              # Для стеммера
STOP_WORDS_ENGLISH = stopwords.words('english')   # Блок стоп-слов

def text_preprocessing(input_text: str) -> str:
   """Токенизация, удаление стоп слов, исправление орфографии,
      Стемминг, перевод из списка токенов в str (для этапа векторизации)
      Функция запускается с apply для всего корпуса"""
   
   text_tokens = word_tokenize(input_text)                                               # Токенизация
   
   text_without_sw = [word for word in text_tokens if word not in STOP_WORDS_ENGLISH]    # Удаление стоп-слов

   text_with_stemming = [stemmer.stem(word) for word in text_without_sw]                 # Стемминг

   output_text = ' '.join(text_with_stemming)
   
   return output_text



df['text'] = df['text'].apply(text_preprocessing)  # Tokenization, stop words, spelling, stemming, return to str
gc.collect()

0

In [131]:
df.sample(5)  # Successful preprocess

Unnamed: 0,text,label
28376,saw movi lowland festiv august friend mine sai...,1
39807,film open two thug kill anoth thug bodi discov...,1
15147,whatev name writer director lock away hope gar...,0
27871,film centuri italian artist one artist particu...,1
15895,hillari funni brook movi ever seen watch rewat...,1


#### Model