# Text Classification Assessment (Key)

## Objective
Classify customer reviews as positive or negative sentiment, treating a rating of 4 or higher as positive and anything lower as negative.

## 1. Data Loading and Preprocessing

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('customer_reviews.csv')

# Drop rows that cannot be labelled or vectorized: a missing rating compares
# False against the threshold below and would be silently labelled negative,
# and scikit-learn's text vectorizers raise on NaN documents.
df = df.dropna(subset=['review_text', 'rating'])

# Create binary target variable: ratings of 4 and above become 1 (positive),
# everything else becomes 0 (negative)
df['sentiment'] = (df['rating'] >= 4).astype(int)

# Split the data into training and testing sets (20% held out for testing);
# the fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    df['review_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

## 2. Exploratory Data Analysis

In [None]:
# Peek at the leading rows of the dataset
preview = df.head()
print(preview)

# Share of each sentiment class, expressed as proportions
class_share = df['sentiment'].value_counts(normalize=True)
print(class_share)

# Character count of every review, stored on the frame, then averaged per class
df['review_length'] = df['review_text'].str.len()
mean_length_by_class = df.groupby('sentiment')['review_length'].mean()
print(mean_length_by_class)

## 3. Feature Engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features with English stop words removed and the vocabulary
# capped at 5000 terms
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Fit on the training reviews only, then apply the fitted vectorizer to the
# test reviews so no test-set information leaks into the features
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Explanation
'''
I chose TfidfVectorizer over CountVectorizer because it not only considers the frequency of words 
but also their importance in the corpus. This can help in giving less weight to common words 
that appear in many documents but may not be as informative for classification.
'''

## 4. Model Selection and Training

In [None]:
from sklearn.linear_model import LogisticRegression

# Instantiate the classifier and fit it on the vectorized training data in a
# single chained call (fit returns the estimator itself)
model = LogisticRegression(random_state=42).fit(X_train_vectorized, y_train)

# Explanation
'''
I chose Logistic Regression because:
1. It's suitable for binary classification problems like sentiment analysis.
2. It's relatively simple and interpretable.
3. It often performs well on text classification tasks, especially with high-dimensional data.
4. It's computationally efficient for both training and prediction.
'''

## 5. Making Predictions

In [None]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set
y_pred = model.predict(X_test_vectorized)

# Calculate and display the accuracy score (fraction of test reviews whose
# predicted label matches the actual label)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print a few example predictions along with their actual labels.
# Slicing to the first n_examples items (rather than indexing over range(5))
# avoids an IndexError when the test set holds fewer than five reviews.
n_examples = 5
for review, actual, predicted in zip(
    X_test.iloc[:n_examples], y_test.iloc[:n_examples], y_pred[:n_examples]
):
    print(f'Review: {review}')
    print(f'Actual sentiment: {actual}')
    print(f'Predicted sentiment: {predicted}\n')