# Sentiment Analysis Project

This notebook demonstrates a complete workflow for binary sentiment classification on movie reviews. It includes data exploration, text preprocessing, feature engineering, model training, and evaluation steps.

### Steps:
- Exploratory Data Analysis (EDA)
- Text Preprocessing
- Feature Engineering (averaged GloVe word embeddings)
- Model Training and Evaluation
- Saving the Best Model
- Business Insights


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data this notebook relies on: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# make word_tokenize look for that resource instead; 'omw-1.4' is included
# because some NLTK releases also look for it when WordNet is loaded.
# Requesting a resource that is already installed is a cheap no-op.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)


## Load and Explore Dataset

In [None]:
# Read the training CSV into `df`. Later cells read its 'review' (text) and
# 'sentiment' (label) columns.
df = pd.read_csv('data/train.csv')
# Preview the first rows.
df.head()

In [None]:
# Print a structural summary of `df` (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show summary statistics for the columns of `df`.
df.describe()

## Exploratory Data Analysis (EDA)

### 1. Class Distribution

In [None]:
# Bar chart of how many reviews carry each sentiment label.
fig, ax = plt.subplots()
sns.countplot(data=df, x='sentiment', ax=ax)
ax.set_title('Class Distribution')
plt.show()

### 2. Text Length Distribution

In [None]:
# Character count of every review. The vectorised `.str.len()` accessor is
# used instead of `apply(len)`: it avoids a Python-level call per row and
# yields NaN for a missing review rather than raising a TypeError.
df['text_length'] = df['review'].str.len()

# Histogram of review lengths, with an explicit axis label so the figure can
# be read on its own.
fig, ax = plt.subplots()
sns.histplot(df['text_length'], bins=50, ax=ax)
ax.set(title='Text Length Distribution', xlabel='Review length (characters)')
plt.show()

### 3. Common Words Analysis

In [None]:
# Raw whitespace-delimited tokens of every review, in order. No lower-casing,
# punctuation stripping or stop-word removal happens at this stage.
all_words = [token for review in df['review'] for token in review.split()]
word_freq = Counter(all_words)
common_words = word_freq.most_common(20)

# Horizontal bar chart of the 20 most frequent raw tokens.
common_words_df = pd.DataFrame(common_words, columns=['Word', 'Frequency'])
ax = sns.barplot(data=common_words_df, x='Frequency', y='Word')
ax.set_title('Top 20 Most Common Words')
plt.show()

### 4. Word Clouds for Positive and Negative Reviews

In [None]:
# Concatenate all review text per sentiment label into one string each.
positive_reviews = ' '.join(df.loc[df['sentiment'] == 'positive', 'review'].tolist())
negative_reviews = ' '.join(df.loc[df['sentiment'] == 'negative', 'review'].tolist())

# Draw both clouds side by side in one figure: positive on the left,
# negative on the right.
panels = [
    ('Positive Reviews WordCloud', positive_reviews),
    ('Negative Reviews WordCloud', negative_reviews),
]

plt.figure(figsize=(12, 6))
for position, (heading, corpus) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(heading)
    cloud = WordCloud(width=300, height=200, background_color='white').generate(corpus)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')

plt.show()

## Text Preprocessing

In [None]:
# English stop-word list held as a set, so the `word not in stop_words` test
# in preprocess_text is a hash lookup.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance reused for every token in preprocess_text.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn one raw review string into a list of lemmatised content tokens.

    Steps, in order: lower-case the text, drop HTML-style tags, replace every
    character that is not a-z or whitespace with a space, tokenise with
    ``word_tokenize``, discard tokens found in ``stop_words`` and lemmatise
    what is left with ``lemmatizer``.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        The remaining lemmatised tokens, in their original order.
    """
    text = text.lower()
    # Remove anything that looks like markup (e.g. "<br />") before the
    # character filter; otherwise the tag name survives as a token ("br").
    text = re.sub(r'<[^>]+>', ' ', text)
    # Substitute a space rather than the empty string, so that words separated
    # only by punctuation or digits ("great.but", "well-made") stay separate
    # tokens instead of being glued together.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    # Stop words are filtered on the surface form, before lemmatisation.
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Run preprocess_text over every review and keep the resulting token lists in
# a new 'tokens' column.
df['tokens'] = df['review'].apply(preprocess_text)
# Show raw text next to its tokens for the first rows as a quick sanity check.
df[['review', 'tokens']].head()

### Feature Engineering: Embeddings

In [None]:
# Parse the GloVe text file into a word -> float32 vector lookup table.
# Each line is split on whitespace: the first field is the word and the
# remaining fields are its vector components.
embeddings_index = {}
with open('data/glove.6B.50d.txt', 'r', encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
print('Loaded word vectors:', len(embeddings_index))

def get_average_embedding(tokens, embeddings=None):
    """Average the word vectors of ``tokens`` into a single vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document. Tokens missing from the table are ignored.
    embeddings : mapping of str -> numpy.ndarray, optional
        Word-vector table to look tokens up in. Defaults to the module-level
        ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors that were found. When none of
        the tokens is in the table, a zero vector with the same shape and
        dtype as the table's vectors (or 50 zeros if the table is empty).
    """
    if embeddings is None:
        embeddings = embeddings_index

    valid_embeddings = [embeddings[word] for word in tokens if word in embeddings]
    if valid_embeddings:
        return np.mean(valid_embeddings, axis=0)

    # No known token: fall back to zeros shaped like the loaded vectors rather
    # than a hard-coded 50-d float64 array, so the result still stacks cleanly
    # with the other rows if a different embedding file is loaded.
    reference = next(iter(embeddings.values()), None)
    if reference is None:
        return np.zeros(50)
    return np.zeros_like(reference)

# One averaged vector per review, stored on the frame next to its tokens.
df['embedding'] = df['tokens'].apply(get_average_embedding)

# Stack the per-review vectors into a 2-D feature matrix and encode the label
# as 1 for 'positive' and 0 for anything else.
train_x = np.vstack(df['embedding'].to_numpy())
train_y = (df['sentiment'] == 'positive').astype('int64').to_numpy()


## Model Training and Evaluation

In [None]:
# Hold out 20% of the reviews for evaluation. The full feature matrix and
# label vector are rebuilt from `df` here instead of being read from
# `train_x` / `train_y`: this cell rebinds those two names to the 80% training
# portion, so feeding them back into the split would silently shrink the data
# every time the cell is re-run.
all_x = np.vstack(df['embedding'].values)
all_y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values
train_x, test_x, train_y, test_y = train_test_split(all_x, all_y, test_size=0.2, random_state=42)

# Candidate classifiers, keyed by display name.
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Metrics reported for every model, keyed by the name used in the results.
scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score
}

# Fit each model on the training split and score it on the held-out split.
# The fitted estimators stay in `models` so a later cell can persist one.
model_performance = {}

for model_name, model in models.items():
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)
    model_performance[model_name] = {
        metric_name: scorer(test_y, predictions)
        for metric_name, scorer in scorers.items()
    }

model_performance

## Save Best Model

In [None]:
import os

import joblib

# Select the fitted model with the highest held-out accuracy.
best_model_name = max(model_performance, key=lambda name: model_performance[name]['accuracy'])
best_model = models[best_model_name]
best_accuracy = model_performance[best_model_name]['accuracy']

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('outputs', exist_ok=True)
joblib.dump(best_model, 'outputs/best_model.pkl')

# The accuracy is looked up above rather than inside the f-string: reusing the
# enclosing quote character inside a replacement field is a SyntaxError on
# Python versions before 3.12.
print(f'Saved the best model ({best_model_name}) with accuracy: {best_accuracy}')

## Conclusion and Business Insights

- The best-performing model for binary sentiment classification was saved for deployment.
- This model can be valuable in applications like review aggregation, customer feedback analysis, and understanding user sentiment in real time.
- By automating sentiment classification, businesses can better gauge public opinion and respond to trends more effectively.