In [7]:
!pip install pandas scikit-learn -q

# Import libraries
import pandas as pd
import numpy as np
import re
from google.colab import files
import io

# Machine Learning imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# ======================================
# 1. DATA UPLOAD & PREPROCESSING
# ======================================

def upload_and_clean_data():
    """Handles file upload and basic cleaning"""
    print("STEP 1: UPLOAD YOUR DATASET")
    print("➡ Please upload a CSV/Excel file with your 6-column dataset")

    # File upload
    uploaded = files.upload()
    file_name = list(uploaded.keys())[0]

    # Read file
    if file_name.endswith('.csv'):
        df = pd.read_csv(io.BytesIO(uploaded[file_name]))
    else:  # Excel
        df = pd.read_excel(io.BytesIO(uploaded[file_name]))

    # Check if we have the expected columns
    expected_columns = ['review','tokens', 'lemmatized', 'review_length', 'token_count', 'lemma_count']
    for col in expected_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in dataset")

    # Basic cleaning (if needed)
    print("\nSample of your data:")
    display(df.head(3))

    # Let user select which text column to use
    print("\nAvailable text columns:")
    text_options = ['review', 'lemmatized']
    for i, option in enumerate(text_options, 1):
        print(f"{i}. {option}")

    choice = int(input("Select which text column to use (1-2): ")) - 1
    text_column = text_options[choice]

    # Let user select target column (assuming last column is sentiment)
    target_column = df.columns[-1]  # Assuming last column is the label
    print(f"\nUsing '{text_column}' as text features and '{target_column}' as target variable")

    return df, text_column, target_column

# ======================================
# 2. MODEL TRAINING & EVALUATION
# ======================================

def train_models(df, text_column, target_column):
    """Trains and compares three models"""
    print("\nSTEP 2: MODEL TRAINING")

    # Prepare data
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_column],
        df[target_column],
        test_size=0.2,
        random_state=42
    )

    # Vectorize text
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Models to compare
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naïve Bayes": MultinomialNB(),
        "Support Vector Machine": LinearSVC(max_iter=2000)
    }

    # Train and evaluate
    results = []
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)

        # Store results
        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'F1-Score': f1_score(y_test, y_pred, average='weighted')
        })

        # Show confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(f"\n{name} Confusion Matrix:")
        print(cm)

    # Display comparison
    results_df = pd.DataFrame(results)
    print("\nMODEL COMPARISON:")
    display(results_df)

    return models, vectorizer

# ======================================
# 3. DEMO PREDICTIONS
# ======================================

def make_predictions(models, vectorizer, text_column):
    """Shows live predictions"""
    print("\nSTEP 3: TRY IT YOURSELF")

    samples = [
        "This movie was fantastic! The acting was brilliant.",
        "Terrible film with awful plot and bad acting.",
        "It was okay, nothing special."
    ]

    for name, model in models.items():
        print(f"\n{name} Predictions:")
        sample_vec = vectorizer.transform(samples)
        preds = model.predict(sample_vec)
        for text, pred in zip(samples, preds):
            print(f"\"{text[:30]}...\" → {'👍 Positive' if pred else '👎 Negative'}")

# ======================================
# MAIN EXECUTION
# ======================================

def main():
    print("🎬 IMDB SENTIMENT ANALYSIS TOOL")
    print("="*40)

    # 1. Upload data
    df, text_column, target_column = upload_and_clean_data()

    # 2. Model training
    models, vectorizer = train_models(df, text_column, target_column)

    # 3. Demo
    make_predictions(models, vectorizer, text_column)

    print("\n✅ Analysis complete!")

if __name__ == "__main__":
    main()

🎬 IMDB SENTIMENT ANALYSIS TOOL
STEP 1: UPLOAD YOUR DATASET
➡ Please upload a CSV/Excel file with your 6-column dataset


Saving analyzed_data.csv to analyzed_data (2).csv

Sample of your data:


Unnamed: 0,review,tokens,lemmatized,review_length,token_count,lemma_count
0,One of the other reviewers has mentioned that ...,"['one', 'reviewers', 'mentioned', 'watching', ...","['one', 'reviewer', 'mention', 'watch', 'oz', ...",1761,162,162
1,A wonderful little production. <br /><br />The...,"['wonderful', 'little', 'production', 'filming...","['wonderful', 'little', 'production', 'film', ...",998,86,86
2,I thought this was a wonderful way to spend ti...,"['thought', 'wonderful', 'way', 'spend', 'time...","['think', 'wonderful', 'way', 'spend', 'time',...",926,84,84



Available text columns:
1. review
2. lemmatized
Select which text column to use (1-2): 1

Using 'review' as text features and 'lemma_count' as target variable

STEP 2: MODEL TRAINING

Training Logistic Regression...

Logistic Regression Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Training Naïve Bayes...

Naïve Bayes Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Training Support Vector Machine...

Support Vector Machine Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

MODEL COMPARISON:


Unnamed: 0,Model,Accuracy,F1-Score
0,Logistic Regression,0.0167,0.008758
1,Naïve Bayes,0.0165,0.005654
2,Support Vector Machine,0.026,0.025165



STEP 3: TRY IT YOURSELF

Logistic Regression Predictions:
"This movie was fantastic! The ..." → 👍 Positive
"Terrible film with awful plot ..." → 👍 Positive
"It was okay, nothing special...." → 👍 Positive

Naïve Bayes Predictions:
"This movie was fantastic! The ..." → 👍 Positive
"Terrible film with awful plot ..." → 👍 Positive
"It was okay, nothing special...." → 👍 Positive

Support Vector Machine Predictions:
"This movie was fantastic! The ..." → 👍 Positive
"Terrible film with awful plot ..." → 👍 Positive
"It was okay, nothing special...." → 👍 Positive

✅ Analysis complete!
