# Data Analysis for IMDB movie reviews

In [42]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

from app.model import SentimentAnalysisModel
from app.preprocessing import TextPreprocessor, clean_text

In [43]:
# Load the raw IMDB reviews; later cells read the "review" and "sentiment" columns.
DATASET_PATH = "../dataset/IMDB Dataset.csv"
df = pd.read_csv(DATASET_PATH)

## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows and the overall dimensions of the raw data.
# head must be *called*: a bare `df.head` prints the bound-method repr
# instead of the five-row preview.
print("Head: ", df.head(), sep="\n")
print("Shape: ", df.shape)

In [None]:
# checking for missing data
# Columns of the same DataFrame always have identical lengths, so comparing
# len(df["review"]) with len(df["sentiment"]) can never reveal gaps.
# Count the null entries per column instead.
missing_per_column = df[["review", "sentiment"]].isna().sum()
print(missing_per_column)

# True only when neither column contains a missing value.
missing_per_column.sum() == 0

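Both columns have the same length. Note that this by itself does not prove there is **no missing data**, because the columns of a DataFrame always share one length; a per-column null count such as `df.isna().sum()` is the reliable check. Provided no values are missing, we can move on to cleaning the data and training the model.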

### Cleaning the reviews column

In [None]:
# The preprocessor is created here and reused by the vectorization cell below.
preprocessor = TextPreprocessor()

# Apply the project's clean_text helper to every review and keep the result
# in a new column, leaving the original "review" text untouched.
df["cleaned_review"] = df["review"].apply(clean_text)

# Call head() so the cell shows a five-row preview; a bare `df.head` only
# displays the bound-method repr.
df.head()

### Splitting training and testing data

In [47]:
# Fixed seed so that Restart & Run All reproduces the same split (and thus
# the same metrics further down) on every run.
RANDOM_STATE = 42

# Encode the textual sentiment labels as integers: positive -> 1, negative -> 0.
sentiment_labels = df["sentiment"].map({"positive": 1, "negative": 0})

X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_review"],
    sentiment_labels,
    random_state=RANDOM_STATE,
)

In [None]:
# Pair each split's review text with its label so both splits can be
# inspected as tables.
train_df = pd.DataFrame({"review": X_train, "sentiment": y_train})
test_df = pd.DataFrame({"review": X_test, "sentiment": y_test})

# Training split: first few rows, then its dimensions.
print("Training Data:", train_df.head(), sep="\n")
print("\nTraining Data Shape:", train_df.shape)

# Test split: first few rows, then its dimensions.
print("\nTest Data:", test_df.head(), sep="\n")
print("\nTest Data Shape:", test_df.shape)

### Vectorization

In [49]:
# fit_transform is called on the training reviews only; the test reviews then
# go through transform on the same preprocessor instance.
# NOTE(review): presumably this is TF-IDF vectorization fitted on the training
# split (per the variable names) — confirm in app.preprocessing.TextPreprocessor.
X_train_tfidf = preprocessor.fit_transform(X_train)
X_test_tfidf = preprocessor.transform(X_test)

### Training and evaluation of the model

In [50]:
# Create the project's sentiment model wrapper (imported from app.model).
model = SentimentAnalysisModel()

# Train on the vectorized training split, then evaluate on the vectorized
# test split. evaluate() is unpacked into two values, which the next cell
# prints as an accuracy figure and a classification report.
model.train(X_train_tfidf, y_train)
accuracy, classification_report = model.evaluate(X_test_tfidf, y_test)

In [None]:
# Emit the evaluation summary: one item per line, accuracy as a percentage
# with two decimals, followed by the classification report.
print(
    "Model Performance Metrics",
    "=========================",
    f"Accuracy: {accuracy * 100:.2f}%",
    "Classification Report:\n",
    classification_report,
    sep="\n",
)

## For Deployment

In [53]:
# Persist the fitted vectorizer and the trained model so they can be loaded
# outside this notebook.
MODELS_DIR = Path("models")

# open(..., "wb") raises FileNotFoundError when the target directory is
# missing, so create it first (no-op if it already exists).
MODELS_DIR.mkdir(parents=True, exist_ok=True)

with open(MODELS_DIR / "tfidf_vectorizer.pkl", "wb") as vec_file, open(
    MODELS_DIR / "sentiment_analysis_model.pkl", "wb"
) as model_file:
    pickle.dump(preprocessor.vectorizer, vec_file)
    pickle.dump(model.model, model_file)