https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Класифікація на позитивні та негативні відгуки
Зробити сентімент аналіз

## INIT BLOCK

### Import and install Requirements

### install

In [None]:
!pip install eng_spacysentiment

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
!python -m spacy download en_core_web_md

In [None]:
!pip install asent

In [None]:
!pip install spacytextblob

### import

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import (TfidfTransformer, TfidfVectorizer, HashingVectorizer)
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.metrics import (r2_score, mean_squared_error, mean_absolute_error, accuracy_score, classification_report,
                             precision_score, recall_score, f1_score)
from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import (SGDClassifier, LogisticRegression)
import spacy
import asent
import concurrent.futures
from spacytextblob.spacytextblob import SpacyTextBlob
import eng_spacysentiment

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk import tokenize, WordNetLemmatizer, word_tokenize

### Init

In [None]:
RANDOM_STATE: int = 1729


def mount_google_drive() -> bool | None:
    """
    Function to mount Google Drive.
    :return: True if mounting is successful, None otherwise.
    """
    try:
        from google.colab.drive import mount

        mount(mountpoint="/content/drive")
        return True

    except Exception as error:
        print(f"Error while mounting Google Drive: {error}")
        raise


def get_data_frame(dataset_path: str, *args, sep: str = ",", **kwargs) -> pd.DataFrame | None:
    """
    Function to convert the dataset into a pd.DataFrame.
    :param sep: Separator for csv data, default - ",".
    :param dataset_path: Path or URL of the dataset.
    :return: pd.DataFrame containing the dataset, or None if errors occur.
    """
    try:
        return pd.read_csv(filepath_or_buffer=dataset_path, sep=sep, **kwargs)

    except Exception as error:
        print(f"Error while converting dataset to NumPy array: {error}")
        raise


def get_dataset_url():
    return "/content/drive/MyDrive/Hillel/Machine_Learning_Course/HW13/IMDB Dataset.csv"


def main(dataset_path: str, *args, **kwargs) -> pd.DataFrame | None:
    """
    Main function to start the app.
    :return: pd.DataFrame containing the dataset, or None if errors occur.
    """
    if mount_google_drive():
        import os

        if os.path.exists(path=dataset_path):
            return get_data_frame(dataset_path=dataset_path, *args, **kwargs)

        else:
            print("Dataset path doesn't exists.")
            raise FileNotFoundError


if __name__ == "__main__":
    source_dataframe: pd.DataFrame | None = main(dataset_path=get_dataset_url(), sep=",")

    if isinstance(source_dataframe, pd.DataFrame):
        working_dataframe: pd.DataFrame = source_dataframe.copy()

        feature: str = "review"
        target: str = "sentiment"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# DATA ANALYSIS, CLINING AND OPTOMIZATION

### DataFrame info

In [None]:
working_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


### DataFrame head

In [None]:
working_dataframe.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Обробка відсутніх значеннь

In [None]:
print(f"{'Є пропущені значення, потрібна обробка датасету.' if working_dataframe.isna().any().any() else 'Пропущених значень немає.'}", end="\n\n")

Пропущених значень немає.



### Видаляємо зайвий текст

In [None]:
working_dataframe[feature]: pd.Series = working_dataframe[feature].str.replace("<br />", "")

### Замінюємо значення таргету на: negative -> 0, та positive -> 1

In [None]:
working_dataframe[target] = working_dataframe[target].replace({"negative": 0, "positive": 1})

### Перевіряємо баланс класів

In [None]:
print(f"{working_dataframe[target].value_counts()}")

1    25000
0    25000
Name: sentiment, dtype: int64


Балансування не потрібно, баланс ідеальний: 50/50.

### Генеруємо статистику

In [None]:
working_dataframe.describe(include="all", percentiles=[.25, .5, .75])

Unnamed: 0,review,sentiment
count,50000,50000.0
unique,49581,
top,Loved today's show!!! It was a variety and not...,
freq,5,
mean,,0.5
std,,0.500005
min,,0.0
25%,,0.0
50%,,0.5
75%,,1.0


# FEATURE ENGINEERING

In [None]:
vectorizer_params: dict = {
    "input": "content",
    "encoding": "utf-8",
    "decode_error": "strict",
    "strip_accents": None,
    "lowercase": True,
    "preprocessor": None,
    "tokenizer": None,
    "stop_words": "english",
    "token_pattern": r"(?u)\b\w\w+\b",
    "ngram_range": (1, 1),
    "analyzer": "word",
    "max_df": 1.0,
    "min_df": 1,
    "max_features": None,
    "vocabulary": None,
    "binary": False,
    "dtype": np.int64
}
vectorizer: CountVectorizer = CountVectorizer(**vectorizer_params)
word_matrix: csr_matrix = vectorizer.fit_transform(working_dataframe.copy()[feature])

# PREPARING DATA FOR TRAINING

### Розбиваємо дані на тренувальну, валідаційну та тестову частину

In [None]:
x_train, x_test, y_train, y_test = train_test_split(word_matrix, working_dataframe[target], test_size=0.2, random_state=RANDOM_STATE, stratify=working_dataframe[target])
x_validate, x_test, y_validate, y_test = train_test_split(x_test, y_test, test_size=0.2, random_state=RANDOM_STATE, stratify=y_test)

print(x_train.shape)
print(x_validate.shape)
print(x_test.shape)
print(y_train.shape)
print(y_validate.shape)
print(y_test.shape)

(40000, 103777)
(8000, 103777)
(2000, 103777)
(40000,)
(8000,)
(2000,)


# FIT PREDICT BAG OF WORDS (SKLEARN)

### Functions for fit models, make predict and generate metrics

In [None]:
def print_model_name(name: str) -> None:
    print("'" * 60)
    print(f"MODEL: {name}")

In [None]:
def value_results_main(predict, y_test):
    accuracy = accuracy_score(y_true=y_test, y_pred=predict)
    precision = precision_score(y_true=y_test, y_pred=predict, average="micro")
    recall = recall_score(y_true=y_test, y_pred=predict, average="micro")
    f1 = f1_score(y_true=y_test, y_pred=predict, average="micro")

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

In [None]:
def value_regression(predict, y_test):
    model_r2: np.float64 = r2_score(y_true=y_test, y_pred=predict)
    model_mean_squared_error: np.float64 = mean_squared_error(y_true=y_test, y_pred=predict)
    model_mean_absolute_error: np.float64 = mean_absolute_error(y_true=y_test, y_pred=predict)

    print(f"R2:                         {model_r2}")
    print(f"Mean_squared_error (MSE):   {model_mean_squared_error}")
    print(f"Mean_absolute_error (MAE):  {model_mean_absolute_error}")

In [None]:
def value_results(name:str, predict, y_test=y_test):
    print_model_name(name=name)
    try:
      value_results_main(predict=predict, y_test=y_test)
    except ValueError:
      value_regression(predict=predict, y_test=y_test)

    print("'" * 60)

In [None]:
def train_model(model_class, name: str, y_train=y_train, y_validate=y_validate, grid_params=None):
    print_model_name(name=name)

    if grid_params:
        model: GridSearchCV = GridSearchCV(estimator=model_class, param_grid=grid_params).fit(X=x_train, y=y_train)

        print("Best params: ", model.best_params_)
        print("Best score: ", model.best_score_)

    else:
        model = model_class.fit(X=x_train, y=y_train)

    y_pred = model.predict(X=x_validate)

    try:
        print(f"Classification report:\n{classification_report(y_true=y_validate, y_pred=y_pred)}")
        value_results_main(predict=y_pred, y_test=y_validate)

    except ValueError:
        value_regression(predict=y_pred, y_test=y_validate)

    return model


## Train

In [None]:
knn_classifier: KNeighborsClassifier = train_model(model_class=KNeighborsClassifier(), name="KNeighborsClassifier with default hyperparameters")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: KNeighborsClassifier with default hyperparameters
Classification report:
              precision    recall  f1-score   support

           0       0.60      0.66      0.63      4000
           1       0.62      0.56      0.59      4000

    accuracy                           0.61      8000
   macro avg       0.61      0.61      0.61      8000
weighted avg       0.61      0.61      0.61      8000

Accuracy: 0.60975
Precision: 0.60975
Recall: 0.60975
F1 Score: 0.60975


In [None]:
random_forest_classifier: RandomForestClassifier = train_model(model_class=RandomForestClassifier(), name="RandomForestClassifier with default hyperparameters")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: RandomForestClassifier with default hyperparameters
Classification report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      4000
           1       0.86      0.86      0.86      4000

    accuracy                           0.86      8000
   macro avg       0.86      0.86      0.86      8000
weighted avg       0.86      0.86      0.86      8000

Accuracy: 0.861125
Precision: 0.861125
Recall: 0.861125
F1 Score: 0.861125


In [None]:
sgd_classifier: SGDClassifier = train_model(model_class=SGDClassifier(), name="SGDClassifier with default hyperparameters")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: SGDClassifier with default hyperparameters
Classification report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4000
           1       0.88      0.88      0.88      4000

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000

Accuracy: 0.881125
Precision: 0.881125
Recall: 0.881125
F1 Score: 0.881125


In [None]:
logistic_classifier: LogisticRegression = train_model(model_class=LogisticRegression(), name="LogisticRegression with default hyperparameters")

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: LogisticRegression with default hyperparameters
Classification report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4000
           1       0.88      0.90      0.89      4000

    accuracy                           0.89      8000
   macro avg       0.89      0.89      0.89      8000
weighted avg       0.89      0.89      0.89      8000

Accuracy: 0.889375
Precision: 0.889375
Recall: 0.889375
F1 Score: 0.889375


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Порівняння результатів

In [None]:
value_results(name="KNeighborsClassifier with default hyperparameters", predict=knn_classifier.predict(X=x_test))
value_results(name="RandomForestClassifier with default hyperparameters", predict=random_forest_classifier.predict(X=x_test))
value_results(name="SGDClassifier with default hyperparameters", predict=sgd_classifier.predict(X=x_test))
value_results(name="LogisticRegression with default hyperparameters", predict=logistic_classifier.predict(X=x_test))

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: KNeighborsClassifier with default hyperparameters
Accuracy: 0.6245
Precision: 0.6245
Recall: 0.6245
F1 Score: 0.6245
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: RandomForestClassifier with default hyperparameters
Accuracy: 0.86
Precision: 0.86
Recall: 0.86
F1 Score: 0.8599999999999999
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: SGDClassifier with default hyperparameters
Accuracy: 0.8765
Precision: 0.8765
Recall: 0.8765
F1 Score: 0.8765
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
MODEL: LogisticRegression with default hyperparameters
Accuracy: 0.8855
Precision: 0.8855
Recall: 0.8855
F1 Score: 0.8855
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''


# Helpers

In [None]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def compare_sentimets(dataframe, new_column):
    matching_count = (dataframe["sentiment"] == new_column).sum()
    total_count = len(dataframe)
    percent_matching = (matching_count / total_count) * 100

    print(f"Процент співпадіння: {percent_matching:.2f}%")

# SpaCy

### ASENT

In [None]:
asent_dataframe: pd.DataFrame = working_dataframe.copy()

def analyze_sentiment(text):
    doc = nlp(text)

    if doc._.polarity.compound > 0:
        return 1
    elif doc._.polarity.compound < 0:
        return 0
    else:
        return "neutral"


nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.add_pipe("asent_en_v1")


with concurrent.futures.ThreadPoolExecutor() as executor:
    asent_result = list(executor.map(analyze_sentiment, asent_dataframe["review"]))


asent_dataframe["ASENT"] = asent_result

In [None]:
compare_sentimets(dataframe=asent_dataframe, new_column=asent_dataframe["ASENT"])

Процент співпадіння: 69.46%


### SpacyTextBlob


In [None]:
spacytextblob_dataframe: pd.DataFrame = working_dataframe.copy()

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("spacytextblob")


def analyze_sentiment(text):
    doc = nlp(text)

    if doc._.blob.polarity > 0:
        return 1
    elif doc._.blob.polarity < 0:
        return 0
    else:
        return "neutral"


with concurrent.futures.ThreadPoolExecutor() as executor:
    spacy_text_blob_result = list(executor.map(analyze_sentiment, spacytextblob_dataframe["review"]))


spacytextblob_dataframe["SPACY_TEXT_BLOB"] = spacy_text_blob_result

In [None]:
compare_sentimets(dataframe=spacytextblob_dataframe, new_column=spacytextblob_dataframe["SPACY_TEXT_BLOB"])

Процент співпадіння: 68.83%


### eng spacysentiment

In [None]:
eng_spacysentiment_dataframe: pd.DataFrame = working_dataframe.copy()

nlp = eng_spacysentiment.load()

def get_max_sentiment(text):
    doc = nlp(text)

    max_sentiment = max(doc.cats, key=doc.cats.get)
    if max_sentiment == "positive":
        return 1
    elif max_sentiment == "negative":
        return 0
    return max_sentiment


with concurrent.futures.ThreadPoolExecutor() as executor:
    eng_spacysentiment_result = list(executor.map(get_max_sentiment, eng_spacysentiment_dataframe["review"]))


eng_spacysentiment_dataframe["ENG_SPACY_SENTIMENT"] = eng_spacysentiment_result

In [None]:
compare_sentimets(dataframe=eng_spacysentiment_dataframe, new_column=eng_spacysentiment_dataframe["ENG_SPACY_SENTIMENT"])

Процент співпадіння: 58.45%


# NLTK

In [None]:
nltk.download('vader_lexicon')
nltk.download('punkt')

nltk_df: pd.DataFrame = working_dataframe.copy()

sid = SentimentIntensityAnalyzer()

def make_sentiment(text):
    text = preprocess_text(text)
    sum_sentiment = 0
    for sentence in tokenize.sent_tokenize(text):
        sum_sentiment += sid.polarity_scores(sentence)["compound"]

    if sum_sentiment > 0:
        return 1
    elif sum_sentiment < 0:
        return 0

    return "neutral"


with concurrent.futures.ThreadPoolExecutor() as executor:
    nltk_sentiment_result = list(executor.map(make_sentiment, nltk_df["review"]))

nltk_df["NLTK_SENTIMENT"] = nltk_sentiment_result

In [None]:
compare_sentimets(dataframe=nltk_df, new_column=nltk_df["NLTK_SENTIMENT"])

Процент співпадіння: 70.36%


# Висновок

1. Побудував 4 моделі-класифікатора
(KNeighborsClassifier, RandomForestClassifier, SGDClassifier, LogisticRegression) в звʼязку з CountVectorizer. Найкращій скор дає модель LogisticRegression: 0.88

2. Зробив сентімент аналіз за допомогою бібліотеки SpaCy та додатковик розширень:
* spacy + asent. Процент співпадіння з таргетом: 69.46%
* spacy + SpacyTextBlob. Процент співпадіння з таргетом: 68.83%
* spacy + eng_spacysentiment. Процент співпадіння з таргетом: 58.45%

3. Зробив сентімент аналіз за допомогою бібліотеки NLTK. Процент співпадіння з таргетом: 70.36%