In [44]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

In [45]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DoniZefironi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DoniZefironi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
df = pd.read_csv("IMDB Dataset.csv")

df = df[:20000]

In [47]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     20000 non-null  object
 1   sentiment  20000 non-null  object
dtypes: object(2)
memory usage: 312.6+ KB


In [49]:
df.describe()

Unnamed: 0,review,sentiment
count,20000,20000
unique,19926,2
top,Loved today's show!!! It was a variety and not...,negative
freq,4,10097


In [50]:
df["sentiment"].value_counts(normalize=True)

sentiment
negative    0.50485
positive    0.49515
Name: proportion, dtype: float64

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> str:

    word = [lemmatizer.lemmatize(i) for i in text.lower().split()]

    return ' '.join(word)

In [None]:
stemmer = PorterStemmer()

def stemmer_text(text: str) -> str:

    word = [stemmer.stem(i) for i in text.lower().split()]

    return ' '.join(word)

In [52]:
stop_words = stopwords.words("english")

In [53]:
df["sentiment"].replace("positive", 1).replace("negative", 0)

  df["sentiment"].replace("positive", 1).replace("negative", 0)


0        1
1        1
2        1
3        0
4        1
        ..
19995    0
19996    0
19997    1
19998    1
19999    0
Name: sentiment, Length: 20000, dtype: int64

In [54]:
X, y = df["review"], df["sentiment"]

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

In [99]:
dummy = DummyClassifier()

pipe_dummy = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ("dummy_class", DummyClassifier(strategy = "stratified"))
])

pipe_dummy.fit(X_train, y_train)
dummy_pred = pipe_dummy.predict(X_test)

acc_dummy = accuracy_score(y_test, dummy_pred)
acc_dummy

0.4945

In [87]:
custom_stopwords = set(stopwords.words("english"))

important_words = {"not", "nor", "no", "but", "very", "too"}
custom_stopwords = custom_stopwords - important_words

custom_stopwords.update(["film", "show", "movie"])
custom_stopwords = list(custom_stopwords)

In [None]:
pipe_standart = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ("logistic_regress", LogisticRegression(max_iter=1000, C = 1))
])

pipe_standart.fit(X_train, y_train)
pred_standart = pipe_standart.predict(X_test)

acc_standart = accuracy_score(y_test, pred_standart)

print(acc_standart)
print(confusion_matrix(y_test, pred_standart))

0.89125
[[1799  220]
 [ 215 1766]]


In [None]:
vectorizer = pipe_standart.named_steps['vectorizer']
model = pipe_standart.named_steps['logistic_regress']

feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]

coef_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients
})

print("Топ-10 слов для ПОЛОЖИТЕЛЬНЫХ отзывов:")
print(coef_df.sort_values('coefficient', ascending=False).head(10))

print("\n")
print("Топ-10 слов для ОТРИЦАТЕЛЬНЫХ отзывов:")
print(coef_df.sort_values('coefficient', ascending=True).head(10))

Топ-20 слов для ПОЛОЖИТЕЛЬНЫХ отзывов:
        feature  coefficient
3385      great     6.246937
2658  excellent     4.722599
9801  wonderful     3.535491
6190    perfect     3.497532
1095       best     3.440686
4976      loved     3.263925
328     amazing     3.234522
378         and     3.189124
7936   the best     2.944554
1321  brilliant     2.780257


Топ-20 слов для ОТРИЦАТЕЛЬНЫХ отзывов:
        feature  coefficient
907         bad    -6.863064
9842      worst    -5.837760
890       awful    -5.391934
1196     boring    -5.064837
8371  the worst    -4.710332
9414      waste    -4.574579
7760   terrible    -4.423159
5686    nothing    -4.162815
6339       poor    -3.988298
7579     stupid    -3.596936


In [None]:
# param_grid = {
#     "vectorizer__max_features": [5000, 10000, ],
#     "vectorizer__ngram_range": [(1,1), (1,2)],
#     "logistic_regress__C": [0.1, 1]
# }

# grid_search = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1, scoring="accuracy")

# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)
# print(grid_search.best_score_)


# {'logistic_regress__C': 1, 'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 2)}
# 0.8830627607888785

In [89]:
pipe_stopwords = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000, ngram_range=(1,2), stop_words=custom_stopwords)),
    ("logistic_regress", LogisticRegression(max_iter=1000, C = 1))
])

pipe_stopwords .fit(X_train, y_train)
pred_stopwords = pipe_stopwords .predict(X_test)

acc_stopwords = accuracy_score(y_test, pred_stopwords)

print(acc_stopwords)
print(confusion_matrix(y_test, pred_stopwords))

0.88825
[[1785  234]
 [ 213 1768]]


In [91]:
pipe_lemma = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000, ngram_range=(1,2), stop_words=custom_stopwords, preprocessor=lemmatize_text)),
    ("logistic_regress", LogisticRegression(max_iter=1000, C = 1))
])

pipe_lemma .fit(X_train, y_train)
pred_lemma = pipe_lemma.predict(X_test)

acc_lemma = accuracy_score(y_test, pred_lemma)

print(acc_lemma)
print(confusion_matrix(y_test, pred_lemma))



0.8885
[[1789  230]
 [ 216 1765]]


In [114]:
vectorizer_lemma = pipe_lemma.named_steps['vectorizer']
model_lemma = pipe_lemma.named_steps["logistic_regress"]

feature_names_lemma = vectorizer_lemma.get_feature_names_out()
coefficients_lemma = model_lemma.coef_[0]

coef_df_lemma = pd.DataFrame({
    "features": feature_names_lemma,
    "coefficients": coefficients_lemma
})

print(f"Топ 10 слов для ПОЛОЖИТЕЛЬНЫХ отзывов")
print(coef_df_lemma.sort_values("coefficients", ascending=False).head(10))

print()
print(f"Топ 10 слов для ОТРИЦАТЕЛЬНЫХ отзывов")
print(coef_df_lemma.sort_values("coefficients", ascending=True).head(10))

Топ 10 слов для ПОЛОЖИТЕЛЬНЫХ отзывов
       features  coefficients
3939      great      6.369287
3124  excellent      5.164975
901        best      4.221973
5260      loved      3.971382
9811  wonderful      3.675030
6547    perfect      3.513304
449     amazing      3.468076
5246       love      3.288561
6241   one best      3.213526
1306  brilliant      3.072756

Топ 10 слов для ОТРИЦАТЕЛЬНЫХ отзывов
      features  coefficients
747        bad     -6.897049
9854     worst     -6.703153
720      awful     -5.682396
9608     waste     -5.155380
1054    boring     -5.051520
6130   nothing     -4.498059
8683  terrible     -4.363235
6742      poor     -3.980488
9851     worse     -3.804077
4312  horrible     -3.591422


In [115]:
pipe_stem = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10000, ngram_range=(1,2), stop_words=custom_stopwords, preprocessor=stemmer_text)),
    ("logistic_regress", LogisticRegression(max_iter=1000, C = 1))
])

pipe_stem.fit(X_train, y_train)
pred_stem = pipe_stem.predict(X_test)

acc_stem = accuracy_score(y_test, pred_stem)


print(acc_stem)
print(confusion_matrix(y_test, pred_stem))



0.8835
[[1784  235]
 [ 231 1750]]


In [100]:
print(f"Standart model: {acc_standart}")
print(f"Standart model + stopwords: {acc_stopwords}")
print(f"Lemmatision model + stopwords: {acc_lemma}")
print(f"Stemping model + stopwords: {acc_stem}")

Standart model: 0.89125
Standart model + stopwords: 0.88825
Lemmatision model + stopwords: 0.8885
Stemping model + stopwords: 0.8835


Best result gives a **standart model**