# Import and Read data

In [34]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os

import re

from pathlib import Path

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import RandomizedSearchCV

[nltk_data] Downloading package stopwords to /Users/Anton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Anton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Anton/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Read data

In [2]:
main_dir = Path(os.path.abspath('')).parent
print(main_dir)

/Users/Anton/Documents/ds/new/final_project_epam_ds


In [3]:
main_dir = Path(os.path.abspath('')).parent
data_path = main_dir / 'data' / 'raw'
train_file_name = "train.csv"
test_file_name = "test.csv"

In [4]:
train_data = pd.read_csv(os.path.join(data_path, train_file_name))

In [5]:
test_data = pd.read_csv(os.path.join(data_path, test_file_name))

In [6]:
train_data.shape

(40000, 2)

# Data preprocessing

In [7]:
def preprocess_data(data, common_words):
    data['review'] = data['review'].apply(remove_url)
    data['review'] = data['review'].apply(remove_html_tags)
    data['review'] = data['review'].apply(remove_non_alphanumeric)
    data['review'] = data['review'].apply(convert_to_lowercase)
    data['tokens'] = data['review'].apply(tokenization)
    data['tokens'] = data['tokens'].apply(remove_short_words)
    data['tokens'] = data['tokens'].apply(remove_stopwords)
    data['tokens'] = data['tokens'].apply(lambda tokens: remove_common_words_from_tokens(tokens, common_words))
    data['tokens'] = data['tokens'].apply(lemmatize_tokens)
    return data

In [8]:
def remove_url(review_text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url_pattern, '', review_text)

In [9]:
def remove_html_tags(review_text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', review_text)

In [10]:
def remove_non_alphanumeric(review_text):
    return re.sub(r'[^a-zA-Z]', ' ', review_text)

In [11]:
def convert_to_lowercase(review_text):
    review_text = review_text.lower()
    return review_text

In [12]:
def tokenization(review_text):
    return word_tokenize(review_text)

In [13]:
def remove_short_words(tokens, min_length=3):
    result = [word for word in tokens if len(word) > min_length]
    return result

In [14]:
def remove_stopwords(tokens):
    STOPWORDS = set(stopwords.words('english'))
    result = [i for i in tokens if not i in STOPWORDS]
    return result

In [15]:
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

In [16]:
def remove_common_words_from_tokens(tokens, common_words):
    return [token for token in tokens if token.lower() not in common_words]

## Vectorization

In [17]:
common_words_path = main_dir / 'data' / 'processed'
common_words_file_name = "common_words.txt"

In [18]:
common_words_file_path = os.path.join(common_words_path, common_words_file_name)

# Read the content of the file
with open(common_words_file_path, 'r') as file:
    common_words_content = file.read()

# Split the content into a set of words
common_words = set(word for word in common_words_content.split('\n') if word)

In [19]:
train_data_tokens = preprocess_data(train_data, common_words)

In [20]:
test_data_tokens = preprocess_data(test_data, common_words)

In [21]:
train_data_tokens.head()

Unnamed: 0,review,sentiment,tokens
0,i caught this little gem totally by accident b...,positive,"[caught, little, totally, accident, back, revi..."
1,i can t believe that i let myself into this mo...,negative,"[believe, accomplish, favor, friend, early, ap..."
2,spoiler alert it just gets to me the nerve ...,negative,"[spoiler, alert, get, nerve, people, remake, t..."
3,if there s one thing i ve learnt from watching...,negative,"[thing, learnt, watching, george, romero, cree..."
4,i remember when this was in theaters reviews ...,negative,"[remember, theater, review, said, horrible, we..."


In [22]:
X_train = train_data_tokens['tokens']
X_test = test_data_tokens['tokens']
y_train = train_data_tokens['sentiment']
y_test = test_data_tokens['sentiment']

## Vectors initialization

In [23]:
def identity_tokenizer(text):
    return text

In [28]:
tfidf_vec = TfidfVectorizer (max_features=2000, min_df=7, max_df=0.8)
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

In [29]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(40000, 2000)
(10000, 2000)


# Models tuning

## SVM

In [None]:


parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=10, n_jobs=-1) ## `-1` run in parallel
clf.fit(X_train_bow, y_train)

In [None]:
#print(sorted(clf.cv_results_.keys()))
print(clf.best_params_)

In [None]:
print(clf.score(X_test_bow, y_test))

## Logistic regression

In [25]:
grid_lr={"max_iter" : [1000, 10000, 100000], "C":np.logspace(-3, 3, 7), "penalty":["l1","l2"]}

In [30]:
logreg=LogisticRegression(max_iter = 100000, solver='liblinear', random_state = 42)
logreg_cv=GridSearchCV(logreg,grid_lr,cv=4)
logreg_cv.fit(X_train_tfidf,y_train)

In [31]:
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 1.0, 'max_iter': 1000, 'penalty': 'l1'}
accuracy : 0.8768


In [38]:
best_logreg = LogisticRegression(solver='liblinear', random_state = 42, C = 1.0, max_iter = 1000, penalty = 'l1')

In [39]:
best_logreg.fit(X_train_tfidf,y_train)

In [40]:
logreg_predictions = best_logreg.predict(X_test_tfidf)

In [None]:
## Decision tree

In [51]:
dt = DecisionTreeClassifier(random_state=42)

In [42]:
grid_dt = {
    'max_depth': [5, 10, 20, 25, 30, 35],
    'min_samples_leaf': [10, 20, 50, 100, 120, 140, 160, 180, 200],
    'criterion': ["gini", "entropy"]
}

In [44]:
grid_search_dt = GridSearchCV(estimator=dt, 
                           param_grid=grid_dt, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [None]:
grid_search_dt.fit(X_train_tfidf,y_train)

In [49]:
dt.fit(X_train_tfidf,y_train)

In [55]:
random_search = RandomizedSearchCV(
    dt, 
    param_distributions=grid_dt, 
    n_iter=10,  # Number of random combinations to try
    cv=3,       # Number of cross-validation folds
    scoring='accuracy',  # Use an appropriate scoring metric
    random_state=42,     # Set a random seed for reproducibility
    n_jobs=-1            # Use all available CPU cores for parallel computation
)

In [56]:
random_search.fit(X_train_tfidf,y_train)

In [57]:
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'min_samples_leaf': 100, 'max_depth': 25, 'criterion': 'gini'}


In [58]:
best_dt = random_search.best_estimator_

In [59]:
dt_predictions = best_dt.predict(X_test_tfidf)

In [30]:
nb = GaussianNB()

In [31]:
cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=42)

In [32]:
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

gs_NB = GridSearchCV(estimator=nb, 
                     param_grid=params_NB, 
                     cv=cv_method,
                     verbose=1, 
                     scoring='accuracy')

In [35]:
random_search_nb = RandomizedSearchCV(
    nb, 
    param_distributions=params_NB, 
    n_iter=10,  # Number of random combinations to try
    cv=cv_method,       # Number of cross-validation folds
    scoring='accuracy',  # Use an appropriate scoring metric
    random_state=42,     # Set a random seed for reproducibility
    n_jobs=-1            # Use all available CPU cores for parallel computation
)

In [36]:
random_search_nb.fit(X_train_tfidf.toarray(),y_train)



In [37]:
print("Best Hyperparameters:", random_search_nb.best_params_)

Best Hyperparameters: {'var_smoothing': 0.12328467394420659}


In [77]:
best_nb = random_search_nb.best_estimator_

In [81]:
best_nb.predict(X_test_tfidf.toarray())

array(['negative', 'positive', 'positive', ..., 'negative', 'positive',
       'positive'], dtype='<U8')

In [84]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [89]:
def calc_metrics(model):
    prediction = model.predict(X_test_tfidf.toarray())
    accuracy = accuracy_score(y_test, prediction)
    print(f"Accuracy: {round(accuracy*100, 1)}%")
    pos_label = 'negative'

    recall = recall_score(y_test, prediction, pos_label=pos_label)
    print(f"Recall: {round(recall*100,1)}%")
    
    f1 = f1_score(y_test, prediction, pos_label=pos_label)
    print(f"F1-score: {round(f1*100, 1)}%")
    

In [90]:
calc_metrics(best_nb)

Accuracy: 82.5%
Recall: 83.6%
F1-score: 82.7%


In [91]:
calc_metrics(best_dt)

Accuracy: 75.5%
Recall: 70.8%
F1-score: 74.3%


In [92]:
calc_metrics(best_logreg)

Accuracy: 88.0%
Recall: 87.6%
F1-score: 88.0%
