In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

# Text Processing Libraries
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    NLTK_AVAILABLE = True
except ImportError:
    print("NLTK not available. Using basic preprocessing.")
    NLTK_AVAILABLE = False

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.pipeline import Pipeline

class TextPreprocessorBasic:
    """Basic text preprocessing without external dependencies.

    Provides lower-casing, HTML-tag stripping, removal of non-letter
    characters, whitespace normalisation, stop-word filtering and removal of
    very short tokens, using only the standard library ``re`` module.
    """

    # Patterns are compiled once at class level because the cleaning methods
    # are typically applied to every row of a dataset.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        # Small built-in English stop-word list (all lower-case, matching the
        # lower-cased text produced by clean_text).
        self.stop_words = {
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
            'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
            'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
            'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
            'while', 'of', 'at', 'by', 'for', 'with', 'through', 'during', 'before', 'after',
            'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once'
        }

    def clean_text(self, text):
        """Clean text by removing HTML, special characters, etc.

        Args:
            text: Raw text. ``None`` and float NaN (a missing cell in a
                DataFrame) are treated as empty text; any other non-string
                value is converted with ``str()``.

        Returns:
            Lower-cased text containing only ASCII letters separated by
            single spaces, with no leading or trailing whitespace.
        """
        # `text != text` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = str(text).lower()
        # Replace tags with a space (not the empty string) so that words
        # separated only by markup, e.g. "good<br />movie", are not glued
        # together into a single token.
        text = self._HTML_TAG_RE.sub(' ', text)
        text = self._NON_LETTER_RE.sub(' ', text)  # Keep only letters and spaces
        text = self._WHITESPACE_RE.sub(' ', text)  # Collapse extra whitespace
        return text.strip()

    def tokenize_and_process(self, text, remove_stopwords=True):
        """Clean, tokenize and filter a piece of text.

        Args:
            text: Raw text (see ``clean_text`` for accepted values).
            remove_stopwords: When true, drop tokens found in
                ``self.stop_words``.

        Returns:
            The surviving tokens joined by single spaces. Tokens of one or
            two characters are always dropped.
        """
        tokens = self.clean_text(text).split()

        if remove_stopwords:
            tokens = [token for token in tokens if token not in self.stop_words]

        tokens = [token for token in tokens if len(token) > 2]
        return ' '.join(tokens)

class SentimentAnalyzer:
    """Complete sentiment analysis pipeline.

    Wraps text preprocessing, a scikit-learn vectorizer and a classifier
    behind a small API: ``create_pipeline`` -> ``train`` -> ``predict`` /
    ``predict_single`` / ``evaluate_model`` / ``get_feature_importance``.

    The evaluation helpers assume binary labels encoded as 0 (negative) and
    1 (positive).
    """

    # Accepted option names for create_pipeline; validated up front so that a
    # typo produces a clear ValueError instead of an unbound-variable error.
    _VECTORIZER_TYPES = ('tfidf', 'count')
    _MODEL_TYPES = ('logistic', 'svm', 'random_forest', 'naive_bayes')

    def __init__(self):
        self.preprocessor = TextPreprocessorBasic()
        self.vectorizer = None
        self.model = None
        self.pipeline = None
        self.model_name = None
        self.feature_names = None

    def _require_pipeline(self):
        """Raise a clear error when no pipeline has been created yet."""
        if self.pipeline is None:
            raise RuntimeError(
                "No pipeline available; call create_pipeline() first."
            )

    def _preprocess(self, texts):
        """Apply the text preprocessor to every item of an iterable of texts."""
        if isinstance(texts, str):
            # Iterating a bare string would silently process it per character.
            raise TypeError(
                "Expected an iterable of texts, got a single string; "
                "use predict_single() for one text."
            )
        return [self.preprocessor.tokenize_and_process(text) for text in texts]

    def create_pipeline(self, vectorizer_type='tfidf', model_type='logistic'):
        """Create a scikit-learn pipeline.

        Args:
            vectorizer_type: ``'tfidf'`` or ``'count'``.
            model_type: ``'logistic'``, ``'svm'``, ``'random_forest'`` or
                ``'naive_bayes'``.

        Returns:
            The newly created (unfitted) ``Pipeline``; it is also stored on
            ``self.pipeline``.

        Raises:
            ValueError: If either option name is not recognised.
        """
        if vectorizer_type not in self._VECTORIZER_TYPES:
            raise ValueError(
                f"Unknown vectorizer_type {vectorizer_type!r}; "
                f"expected one of {self._VECTORIZER_TYPES}"
            )
        if model_type not in self._MODEL_TYPES:
            raise ValueError(
                f"Unknown model_type {model_type!r}; "
                f"expected one of {self._MODEL_TYPES}"
            )

        # Both vectorizers share the same settings.
        vectorizer_kwargs = dict(
            max_features=5000,
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.95,
            stop_words='english',
        )
        if vectorizer_type == 'tfidf':
            vectorizer = TfidfVectorizer(**vectorizer_kwargs)
        else:
            vectorizer = CountVectorizer(**vectorizer_kwargs)

        # Choose model
        if model_type == 'logistic':
            model = LogisticRegression(random_state=42, max_iter=1000)
        elif model_type == 'svm':
            # probability=True is needed so predict_proba is available.
            model = SVC(kernel='linear', random_state=42, probability=True)
        elif model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=100, random_state=42)
        else:
            model = MultinomialNB()

        self.pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', model)
        ])

        self.model_name = f"{model_type.title()} with {vectorizer_type.upper()}"
        return self.pipeline

    def train(self, X_train, y_train):
        """Preprocess the training texts and fit the pipeline.

        Args:
            X_train: Iterable of raw texts (e.g. a pandas Series or a list).
            y_train: Labels aligned with ``X_train``.

        Raises:
            RuntimeError: If ``create_pipeline`` has not been called.
        """
        self._require_pipeline()
        X_train_processed = self._preprocess(X_train)

        self.pipeline.fit(X_train_processed, y_train)

        # Keep direct references to the fitted steps for later inspection.
        self.vectorizer = self.pipeline.named_steps['vectorizer']
        self.model = self.pipeline.named_steps['classifier']
        self.feature_names = self.vectorizer.get_feature_names_out()

        print(f"Training completed for {self.model_name}")

    def predict(self, X_test):
        """Predict labels and class probabilities for a collection of texts.

        Args:
            X_test: Iterable of raw texts (e.g. a pandas Series or a list).

        Returns:
            A ``(predictions, probabilities)`` tuple as returned by the
            pipeline's ``predict`` and ``predict_proba``.

        Raises:
            RuntimeError: If ``create_pipeline`` has not been called.
        """
        self._require_pipeline()
        X_test_processed = self._preprocess(X_test)

        predictions = self.pipeline.predict(X_test_processed)
        probabilities = self.pipeline.predict_proba(X_test_processed)

        return predictions, probabilities

    def predict_single(self, text):
        """Predict sentiment for a single text.

        Args:
            text: Raw review text.

        Returns:
            A dict with the original and processed text, the sentiment name,
            the integer prediction, the highest class probability
            (``confidence``) and the per-class ``probabilities``.

        Raises:
            RuntimeError: If ``create_pipeline`` has not been called.
        """
        self._require_pipeline()
        processed_text = self.preprocessor.tokenize_and_process(text)
        prediction = self.pipeline.predict([processed_text])[0]
        probability = self.pipeline.predict_proba([processed_text])[0]

        sentiment = "Positive" if prediction == 1 else "Negative"
        confidence = max(probability)

        return {
            'text': text,
            'processed_text': processed_text,
            'sentiment': sentiment,
            'prediction': int(prediction),
            'confidence': confidence,
            # NOTE: assumes the fitted classes are ordered [0, 1], i.e.
            # column 0 is the negative class and column 1 the positive one.
            'probabilities': {
                'negative': probability[0],
                'positive': probability[1]
            }
        }

    def get_feature_importance(self, top_n=10):
        """Get feature importance for interpretation.

        Args:
            top_n: Maximum number of features to return per list.

        Returns:
            * For models exposing ``coef_``: a dict with ``'most_positive'``
              and ``'most_negative'`` lists of ``(feature, coefficient)``
              pairs, each ordered by decreasing absolute coefficient.
            * For models exposing ``feature_importances_``: a dict with a
              ``'most_important'`` list of ``(feature, importance)`` pairs.
            * ``None`` when the model has not been trained or exposes
              neither attribute.
        """
        if self.model is None or self.feature_names is None:
            return None

        if hasattr(self.model, 'coef_'):
            # For linear models
            coef_row = self.model.coef_[0]
            # A linear SVC fitted on sparse features exposes coef_ as a SciPy
            # sparse matrix; densify it so it can be zipped element-wise with
            # the feature names and compared against zero.
            if hasattr(coef_row, 'toarray'):
                coef_row = coef_row.toarray()
            coefficients = np.asarray(coef_row).ravel()

            feature_importance = sorted(
                zip(self.feature_names, coefficients),
                key=lambda pair: abs(pair[1]),
                reverse=True,
            )

            return {
                'most_positive': [(feat, coef) for feat, coef in feature_importance if coef > 0][:top_n],
                'most_negative': [(feat, coef) for feat, coef in feature_importance if coef < 0][:top_n]
            }

        if hasattr(self.model, 'feature_importances_'):
            # For tree-based models
            feature_importance = sorted(
                zip(self.feature_names, self.model.feature_importances_),
                key=lambda pair: pair[1],
                reverse=True,
            )
            return {'most_important': feature_importance[:top_n]}

        return None

    def evaluate_model(self, X_test, y_test):
        """Comprehensive model evaluation.

        Args:
            X_test: Iterable of raw texts.
            y_test: True labels (0 = negative, 1 = positive).

        Returns:
            A dict with ``accuracy``, ``precision``, ``recall``, ``f1_score``,
            ``specificity``, the 2x2 ``confusion_matrix`` and the textual
            ``classification_report``.
        """
        predictions, _ = self.predict(X_test)

        accuracy = accuracy_score(y_test, predictions)
        # Pin the label order so the matrix is always 2x2, even when only one
        # class is present in y_test/predictions; otherwise the four-way
        # unpacking below would fail.
        labels = [0, 1]
        cm = confusion_matrix(y_test, predictions, labels=labels)

        tn, fp, fn, tp = cm.ravel()
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'confusion_matrix': cm,
            'classification_report': classification_report(
                y_test,
                predictions,
                labels=labels,
                target_names=['Negative', 'Positive'],
                zero_division=0,
            )
        }

def load_data(file_path=None, n_per_class=1000, random_state=None):
    """Load a review dataset from CSV, or build a synthetic one.

    Args:
        file_path: Path of a CSV file to read with ``pd.read_csv``. When
            falsy (the default), a synthetic demonstration dataset is
            generated instead.
        n_per_class: Number of synthetic reviews to generate for each of the
            two sentiment classes. Ignored when ``file_path`` is given.
        random_state: Optional seed for the synthetic data. When ``None``
            the global NumPy random state is used, so a prior
            ``np.random.seed(...)`` call still controls the output.

    Returns:
        A DataFrame. The synthetic dataset has the columns ``'review'`` and
        ``'sentiment'`` (values ``'positive'`` / ``'negative'``), positive
        rows first. A loaded CSV is returned as read, without validation.

    Raises:
        ValueError: If ``n_per_class`` is negative.
    """
    if file_path:
        # Load from file
        data = pd.read_csv(file_path)
        print(f"Loaded {len(data)} reviews from {file_path}")
        return data

    if n_per_class < 0:
        raise ValueError(f"n_per_class must be non-negative, got {n_per_class}")

    # Create sample dataset for demonstration
    print("Creating sample dataset for demonstration...")

    # Use a private generator only when a seed is requested, so the default
    # behaviour (drawing from the global NumPy RNG) is unchanged.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    positive_words = ["excellent", "amazing", "brilliant", "outstanding", "fantastic", "superb", "wonderful", "incredible", "perfect", "magnificent"]
    negative_words = ["terrible", "awful", "horrible", "dreadful", "disappointing", "boring", "worst", "bad", "poor", "waste"]

    # Each review fills three slots with independently drawn words; the
    # f-string evaluates the draws left to right.
    reviews = [
        (
            f"This movie is {rng.choice(positive_words)} with {rng.choice(positive_words)} acting and {rng.choice(positive_words)} storyline.",
            "positive",
        )
        for _ in range(n_per_class)
    ]
    reviews.extend(
        (
            f"This movie is {rng.choice(negative_words)} with {rng.choice(negative_words)} acting and {rng.choice(negative_words)} plot.",
            "negative",
        )
        for _ in range(n_per_class)
    )

    data = pd.DataFrame(reviews, columns=['review', 'sentiment'])
    print(f"Created sample dataset with {len(data)} reviews")

    return data

def compare_models(X_train, X_test, y_train, y_test):
    """Train several vectorizer/classifier combinations and pick the best.

    Each candidate is trained on ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test``; the candidate with the highest accuracy wins
    (the earliest candidate wins a tie).

    Returns:
        A ``(best_analyzer, results)`` tuple, where ``results`` maps each
        model name to a dict holding its ``'analyzer'`` and ``'evaluation'``.
    """
    # (vectorizer_type, model_type) pairs, in the order they are tried.
    candidates = (
        ('tfidf', 'logistic'),
        ('tfidf', 'svm'),
        ('tfidf', 'random_forest'),
        ('count', 'naive_bayes'),
    )

    print("Comparing different models...")
    print("-" * 50)

    results = {}
    for vec_kind, clf_kind in candidates:
        candidate = SentimentAnalyzer()
        candidate.create_pipeline(vec_kind, clf_kind)
        candidate.train(X_train, y_train)

        scores = candidate.evaluate_model(X_test, y_test)
        results[candidate.model_name] = {'analyzer': candidate, 'evaluation': scores}

        print(f"{candidate.model_name:<25} Accuracy: {scores['accuracy']:.4f}")

    # max() returns the first maximal entry, so ties go to the earliest model.
    best_model_name, best_entry = max(
        results.items(),
        key=lambda item: item[1]['evaluation']['accuracy'],
    )

    print(f"\nBest model: {best_model_name}")
    return best_entry['analyzer'], results

def main():
    """Run the complete sentiment analysis pipeline.

    Builds the sample dataset, splits it into train and test sets, compares
    the candidate models, prints a detailed evaluation and the most
    informative features of the best one, and finally classifies a few
    hand-written reviews.

    Returns:
        The trained analyzer that achieved the best accuracy.
    """

    print("=" * 60)
    print("MOVIE SENTIMENT ANALYSIS PROJECT")
    print("=" * 60)

    # Load data
    data = load_data()  # Creates sample data

    # Prepare data: encode the sentiment as 1 (positive) / 0 (negative).
    X = data['review']
    y = (data['sentiment'] == 'positive').astype(int)

    # Stratified split keeps the class balance identical in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"\nTrain set: {len(X_train)} reviews")
    print(f"Test set: {len(X_test)} reviews")

    # Compare models and get the best one
    best_analyzer, all_results = compare_models(X_train, X_test, y_train, y_test)

    # Detailed evaluation of best model
    print("\n" + "=" * 60)
    print("DETAILED EVALUATION OF BEST MODEL")
    print("=" * 60)

    best_evaluation = best_analyzer.evaluate_model(X_test, y_test)

    print(f"Accuracy: {best_evaluation['accuracy']:.4f}")
    print(f"Precision: {best_evaluation['precision']:.4f}")
    print(f"Recall: {best_evaluation['recall']:.4f}")
    print(f"F1-Score: {best_evaluation['f1_score']:.4f}")

    print("\nClassification Report:")
    print(best_evaluation['classification_report'])

    # Feature importance
    importance = best_analyzer.get_feature_importance(top_n=10)
    if importance:
        if 'most_positive' in importance:
            print("\nMost Positive Features:")
            for i, (feature, coef) in enumerate(importance['most_positive'], 1):
                print(f"{i:2d}. {feature:20s} ({coef:6.3f})")

            print("\nMost Negative Features:")
            for i, (feature, coef) in enumerate(importance['most_negative'], 1):
                print(f"{i:2d}. {feature:20s} ({coef:6.3f})")
        elif 'most_important' in importance:
            # Tree-based models report unsigned importances under a different
            # key; without this branch they would be silently skipped.
            print("\nMost Important Features:")
            for i, (feature, score) in enumerate(importance['most_important'], 1):
                print(f"{i:2d}. {feature:20s} ({score:6.3f})")

    # Test on new reviews
    print("\n" + "=" * 60)
    print("TESTING ON NEW REVIEWS")
    print("=" * 60)

    test_reviews = [
        "This movie was absolutely amazing! Great story and fantastic acting.",
        "Terrible film, waste of time. Very boring and poorly made.",
        "Outstanding cinematography and brilliant performances. Highly recommended!",
        "Worst movie ever! Complete garbage and total disappointment."
    ]

    for i, review in enumerate(test_reviews, 1):
        result = best_analyzer.predict_single(review)
        print(f"\nReview {i}: {review}")
        print(f"Prediction: {result['sentiment']} (Confidence: {result['confidence']:.3f})")

    print("\n" + "=" * 60)
    print("ANALYSIS COMPLETE")
    print("=" * 60)

    return best_analyzer

# Script entry point: run the full analysis when executed directly.
if __name__ == "__main__":
    # main() returns the best trained analyzer; keep it in a module-level
    # name so it stays available for further predictions.
    best_model = main()

    # Tell the user how to use the trained model.
    print(
        "\nModel ready for predictions!",
        "Use: best_model.predict_single('your review text')",
        sep="\n",
    )


MOVIE SENTIMENT ANALYSIS PROJECT
Creating sample dataset for demonstration...
Created sample dataset with 2000 reviews

Train set: 1600 reviews
Test set: 400 reviews
Comparing different models...
--------------------------------------------------
Training completed for Logistic with TFIDF
Logistic with TFIDF       Accuracy: 1.0000
Training completed for Svm with TFIDF
Svm with TFIDF            Accuracy: 1.0000
Training completed for Random_Forest with TFIDF
Random_Forest with TFIDF  Accuracy: 1.0000
Training completed for Naive_Bayes with COUNT
Naive_Bayes with COUNT    Accuracy: 1.0000

Best model: Logistic with TFIDF

DETAILED EVALUATION OF BEST MODEL
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000

Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00       200
    Positive       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      