## Import Required Libraries


In [11]:
# Import core data manipulation libraries
import pandas as pd
import numpy as np

# Import visualization libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Import machine learning tools from scikit-learn
from sklearn.model_selection import train_test_split  # For splitting data into train/test sets
from sklearn.feature_extraction.text import CountVectorizer  # For converting text to numerical features
from sklearn.linear_model import LogisticRegression  # For logistic regression classification model
from sklearn.metrics import accuracy_score  # For evaluating model performance


## Load the Dataset

Load the women's clothing e-commerce reviews dataset and display the first few rows to understand the data structure.


In [12]:
# Load the women's clothing e-commerce reviews dataset from CSV file
# This dataset contains customer review text and sentiment labels (1 for positive, -1 for negative)
df = pd.read_csv("../womens_clothing_ecommerce_reviews.csv")

# Display the first 5 rows to understand the data structure
print("The First 5 Rows Of The Dataset")
print(df.head())


The First 5 Rows Of The Dataset
                                         Review Text  sentiment
0  Absolutely wonderful - silky and sexy and comf...          1
1  Love this dress!  it's sooo pretty.  i happene...          1
2  I love, love, love this jumpsuit. it's fun, fl...          1
3  This shirt is very flattering to all due to th...          1
4  I love tracy reese dresses, but this one is no...         -1


## Dataset Information

Display summary information about the dataset including data types, non-null counts, and memory usage.


In [13]:
# Display summary information about the dataset
# This shows the number of rows, columns, data types, and memory usage
print("\nDataset Information:")
print(df.info())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19818 entries, 0 to 19817
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review Text  19818 non-null  object
 1   sentiment    19818 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 309.8+ KB
None


## Define Features and Target Variable

Select the independent variable (Review Text) and dependent variable (sentiment) for our classification model.


In [14]:
# Define our features (X) and target variable (Y)
# X: Independent variable - Customer review text (raw text data)
# Y: Dependent variable - Sentiment labels (1 = positive, -1 = negative)
# We'll use text classification to predict sentiment based on review content

X = df['Review Text']  # Feature variable containing review text
Y = df['sentiment']  # Target variable indicating sentiment (positive/negative)

# Verify that both are pandas Series objects
print(type(X))
print(type(Y))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


## Split Data into Training and Testing Sets

Divide the dataset into training (80%) and testing (20%) sets with stratification to maintain class balance.


In [15]:
# Split the data into training (80%) and testing (20%) sets
# random_state=42 ensures reproducibility - we get the same split every time
# stratify=Y maintains the proportion of positive/negative reviews in both sets
# Training set: Used to train the model
# Testing set: Used to evaluate model performance on unseen data

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

# stratify=Y explanation:
# This ensures that the proportion of classes in Y is maintained in both training and testing sets
# Example: If original data has 80% positive and 20% negative, both train/test will have same ratio

# Display the shapes of the resulting datasets
print("\nShapes of Train/Test Datasets:")
print("X_train shape:", x_train.shape)
print("Y_train shape:", y_train.shape)
print("X_test shape:", x_test.shape)
print("Y_test shape:", y_test.shape)


Shapes of Train/Test Datasets:
X_train shape: (15854,)
Y_train shape: (15854,)
X_test shape: (3964,)
Y_test shape: (3964,)


## Text Vectorization (Bag of Words)

Convert text data into numerical feature vectors using CountVectorizer with stop words removal.


In [16]:
# Initialize CountVectorizer to convert text data into numerical feature vectors
# CountVectorizer creates a "Bag of Words" representation:
# Example: ["this is a sample", "another example here"] -> [[1,1,1,1,0,0,0],[1,0,0,0,1,1,1]]
# Vocabulary: { "this":0, "is":1, "a":2, "sample":3, "another":4, "example":5, "here":6 }

vectorizer = CountVectorizer(stop_words='english')  # Remove common English stop words
# English stop words are common words like "the", "is", "in", "and", etc.
# These words may not add significant meaning to sentiment analysis

# Fit the vectorizer on the training data and transform both train and test data
# fit_transform: Learn vocabulary from training data and convert to vectors
# transform: Convert test data using the same vocabulary (no new words added)
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow = vectorizer.transform(x_test)

# Display the sparse matrix representation and shapes
# Sparse matrix saves memory by only storing non-zero values
print(x_train_bow)
print(x_train_bow.shape)  # (15854 reviews, 11897 unique words in vocabulary)
print(x_test_bow.shape)   # (3964 reviews, same 11897 unique words)


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 373991 stored elements and shape (15854, 11897)>
  Coords	Values
  (0, 10214)	1
  (0, 2396)	1
  (0, 4805)	1
  (0, 4210)	1
  (0, 3835)	1
  (0, 9289)	1
  (0, 3680)	1
  (0, 4669)	1
  (0, 6980)	1
  (0, 10773)	1
  (0, 7248)	1
  (0, 7409)	1
  (1, 9289)	1
  (1, 2866)	1
  (1, 3454)	1
  (1, 4206)	1
  (1, 1781)	1
  (1, 6221)	1
  (1, 6711)	1
  (1, 8861)	1
  (1, 9671)	1
  (1, 9424)	1
  (1, 11)	1
  (2, 7409)	1
  (2, 3454)	3
  :	:
  (15853, 4805)	1
  (15853, 7596)	1
  (15853, 6257)	1
  (15853, 2390)	1
  (15853, 11495)	1
  (15853, 1602)	1
  (15853, 8053)	1
  (15853, 4742)	1
  (15853, 8467)	1
  (15853, 7829)	1
  (15853, 9466)	1
  (15853, 10594)	1
  (15853, 11408)	1
  (15853, 11414)	1
  (15853, 8195)	1
  (15853, 584)	1
  (15853, 11438)	1
  (15853, 5070)	1
  (15853, 11519)	1
  (15853, 9280)	1
  (15853, 9273)	1
  (15853, 5103)	1
  (15853, 3990)	1
  (15853, 521)	1
  (15853, 4857)	1
(15854, 11897)
(3964, 11897)


## Train the Logistic Regression Model

Create and train a Logistic Regression classifier for sentiment analysis.


In [17]:
# Initialize the Logistic Regression model
# Logistic Regression is a linear classification algorithm that predicts probabilities
# solver='saga': Suitable for large datasets and supports L1/L2 regularization
# max_iter=5000: Maximum number of iterations to ensure convergence
# random_state=42: Ensures reproducibility of results

model = LogisticRegression(max_iter=5000, random_state=42, solver='saga')

# Train the model on the training data
# The model learns the relationship between word frequencies and sentiment
# It finds optimal weights for each word to classify reviews as positive or negative
model.fit(x_train_bow, y_train)

print("\nModel Training Completed.")



Model Training Completed.


## Evaluate Model Performance

Make predictions on the test set and calculate the accuracy score.


In [18]:
# Make predictions on the test data
# The model predicts sentiment (1 or -1) for each review in the test set
y_pred = model.predict(x_test_bow)

# Evaluate the model's accuracy
# Accuracy: Percentage of correctly classified reviews
# Formula: (Number of correct predictions) / (Total number of predictions)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy on Test Set: {:.2f}%".format(accuracy * 100))


Model Accuracy on Test Set: 92.99%


## Prepare New Review Data for Prediction

Create a set of new, unseen customer reviews to test the trained model.


In [19]:
# New unseen dataset of customer reviews
# These reviews were not part of the training or testing data
# We'll use them to demonstrate the model's ability to classify real-world reviews

new_reviews = [
    "I absolutely love this dress! The fit is perfect and the material is so soft.",
    "The shirt shrunk after one wash. Very disappointed with the quality.",
    "These pants are okay, but I've seen better for the price.",
    "Fantastic quality and great value for money. Highly recommend!",
    "The color faded quickly and the fabric feels cheap."
]

## Predict Sentiment for New Reviews

Use the trained model to predict sentiment for new customer reviews.


In [20]:
# Transform new reviews using the same vectorizer fitted on training data
# This converts the text reviews into the same numerical format (Bag of Words)
new_reviews_bow = vectorizer.transform(new_reviews)

# Make predictions on the new reviews
# The model will classify each review as positive (1) or negative (-1)
new_predictions = model.predict(new_reviews_bow)

# Display the predictions for each review
print("\nPredictions for New Reviews:")
for review, sentiment in zip(new_reviews, new_predictions):
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")


Predictions for New Reviews:
Review: I absolutely love this dress! The fit is perfect and the material is so soft.
Predicted Sentiment: 1

Review: The shirt shrunk after one wash. Very disappointed with the quality.
Predicted Sentiment: -1

Review: These pants are okay, but I've seen better for the price.
Predicted Sentiment: 1

Review: Fantastic quality and great value for money. Highly recommend!
Predicted Sentiment: 1

Review: The color faded quickly and the fabric feels cheap.
Predicted Sentiment: -1

