### Amazon Reviews Sentiment Analysis Using Logistic Regression

In [1]:
import zipfile
import pandas as pd 
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


### Extracting and Preparing Amazon Reviews Data for Analysis

In [3]:
prefix_1 = '__label__1'
prefix_2 = '__label__2'

file_path = 'datasets/train.ft.txt'

label_1_lines = []
label_2_lines = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        if line.startswith(prefix_1):
            label_1_lines.append(line[len(prefix_1):])
        elif line.startswith(prefix_2):
            label_2_lines.append(line[len(prefix_2):])

df = pd.DataFrame({'label1':label_1_lines, 'label2':label_2_lines})
df.shape
df = df.iloc[:90000]

(1800000, 2)

### Cleaning Amazon Reviews Data by Removing Stopwords and Punctuation

In [5]:
stopwords = nltk.corpus.stopwords.words('english')

def clear_data(str):
   s = re.sub(r"\s+", ' ', str).strip().lower()
   s = re.sub(r'[^a-z\s]+', '', s)
   words = s.split()
   s = ' '.join([word for word in words if word not in stopwords])
   return s

for column in df.columns:
   df[column] = df[column].apply(clear_data)


### Splitting Data into Training and Test Sets

In [7]:
labels = [0] * df['label1'].notnull().sum() + [1] * df['label2'].notnull().sum()
len(labels)
all_text = pd.concat([df['label1'].dropna(), df['label2'].dropna()])
X_train, X_test, y_train, y_test = train_test_split(all_text, labels, test_size=.2, random_state=42)

180000

### Vectorizing  Data with TF-IDF

In [10]:
tfidf_vect = TfidfVectorizer()
X_train = tfidf_vect.fit_transform(X_train.values)
X_test = tfidf_vect.transform(X_test.values)

### Training and Predicting with Logistic Regression Model

In [11]:
model = LogisticRegression()
model.fit(X_train, y_train)
prediction = model.predict(X_test)

### Evaluating Logistic Regression Model Performance on Data

In [20]:
accuracy = accuracy_score(y_test, prediction)
precision = precision_score(y_test, prediction, average='binary')
recall = recall_score(y_test, prediction, average='binary')
f1 = f1_score(y_test,prediction, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.8866388888888889
Precision: 0.8818047256181131
Recall: 0.8931149361465852
F1-score: 0.8874237952056495


### Optimizing Logistic Regression Model with Grid Search CV

In [None]:
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best hyperparametr:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

### Retrieving Best Logistic Regression Model from Grid Search

In [19]:
best_model = grid_search.best_estimator_
best_model