In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# --- Configuration ---
# Location of the IMDB Movie Reviews CSV; the script reads its 'review' and
# 'sentiment' columns.
# NOTE: the output recorded for this cell is "HTTP Error 404: Not Found", i.e.
# this URL was not reachable when the cell was last run. Point it at a live
# copy of the dataset before re-running.
DATA_URL = "https://raw.githubusercontent.com/Ankit1598/IMDB-Dataset/master/IMDB%20Dataset.csv"

# Text label -> numeric class used for training.
LABEL_MAP = {'positive': 1, 'negative': 0}

# Display names for classes 0 and 1, in that order.
CLASS_NAMES = ['Negative', 'Positive']

# File the confusion-matrix plot is written to.
CONFUSION_MATRIX_PATH = 'sentiment_confusion_matrix.png'


def encode_sentiment(labels):
    """Convert 'positive'/'negative' labels to 1/0.

    Args:
        labels: pandas Series of sentiment strings.

    Returns:
        A new int64 Series with the same index as ``labels``.

    Raises:
        ValueError: if any entry is missing or is not a key of ``LABEL_MAP``.
            ``Series.map`` would otherwise turn such entries into NaN, which
            only surfaces later as a hard-to-read failure during training.
    """
    encoded = labels.map(LABEL_MAP)
    unmapped = encoded.isna()
    if unmapped.any():
        examples = labels[unmapped].unique()[:5].tolist()
        raise ValueError(
            f"{int(unmapped.sum())} sentiment label(s) are not 'positive' or "
            f"'negative', e.g. {examples}"
        )
    return encoded.astype('int64')


def load_reviews(url):
    """Read the review CSV at ``url`` and return it with numeric labels.

    Args:
        url: path or URL accepted by ``pandas.read_csv``.

    Returns:
        DataFrame whose 'sentiment' column holds 1 (positive) / 0 (negative).

    Raises:
        OSError: the file could not be fetched or opened (urllib's HTTPError
            and URLError are OSError subclasses).
        ValueError: the CSV could not be parsed, lacks a required column, or
            contains unexpected sentiment labels.
    """
    frame = pd.read_csv(url)
    missing = [col for col in ('review', 'sentiment') if col not in frame.columns]
    if missing:
        raise ValueError(f"Dataset is missing required column(s): {missing}")
    frame['sentiment'] = encode_sentiment(frame['sentiment'])
    return frame


def build_classifier():
    """Return an unfitted TF-IDF + linear SVM text-classification pipeline."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        # max_iter=1000 matches LinearSVC's documented default; it is spelled
        # out so it is easy to raise if a ConvergenceWarning shows up.
        ('clf', LinearSVC(max_iter=1000)),
    ])


def save_confusion_matrix(cm, path=CONFUSION_MATRIX_PATH):
    """Draw ``cm`` as an annotated heatmap and save it to ``path``.

    The figure is left open so a notebook front end can still display it.
    """
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='viridis',
                xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.savefig(path)


# The pipeline runs when this file is executed as a notebook cell or script
# (where __name__ is "__main__"); merely importing it does not hit the network.
# Variables are kept at this level so later notebook cells can reuse them.
if __name__ == "__main__":
    # --- Step 1: Load the Data ---
    url = DATA_URL
    try:
        df = load_reviews(url)
    except (OSError, ValueError) as e:
        # Only the expected data-loading failures are reported as a short
        # message. Errors in the later steps are left to propagate with a
        # full traceback instead of being hidden behind a one-line print.
        print(f"An error occurred: {e}")
        print(f"Could not load the dataset from {url}")
    else:
        print("--- Step 1: Data Loaded Successfully ---")
        print("Dataset shape:", df.shape)
        print("Label distribution:")
        print(df['sentiment'].value_counts())
        print("\n")

        # --- Step 2 & 3: Split Data and Prepare for Feature Extraction ---
        X = df['review']      # The text reviews
        y = df['sentiment']   # The sentiment labels

        # Fixed random_state keeps the 75/25 split reproducible between runs.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42
        )
        print("--- Step 2 & 3: Data Split Complete ---")
        print(f"Training set size: {len(X_train)}")
        print(f"Testing set size: {len(X_test)}\n")

        # --- Step 4: Train the Model using a Pipeline ---
        # Fitting the pipeline learns the TF-IDF vocabulary from the training
        # reviews only, then trains the SVM on the resulting features.
        text_clf = build_classifier()
        text_clf.fit(X_train, y_train)
        print("--- Step 4: Model Training Complete ---\n")

        # --- Step 5: Evaluate the Model ---
        predictions = text_clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)

        print("--- Step 5: Model Evaluation ---")
        print(f"Model Accuracy: {accuracy:.4f}")

        print("\nClassification Report:")
        print(classification_report(y_test, predictions, target_names=CLASS_NAMES))

        # Generate and save the confusion matrix
        cm = confusion_matrix(y_test, predictions)
        save_confusion_matrix(cm)
        print(f"\nConfusion matrix saved as '{CONFUSION_MATRIX_PATH}'")
        print("\n--- Analysis Complete ---")

An error occurred: HTTP Error 404: Not Found
