# 📚 Setup Instructions

## Running This Notebook

### Option 1: Google Colab (Recommended for Grading)

1. Upload this notebook to Google Colab
2. Run all cells - the dataset will be automatically cloned from the repository
3. No additional setup required!

### Option 2: Local Environment

1. Clone the repository:
   ```bash
   git clone https://github.com/AlfanArzaqi/SentimenAnalisiKaggle.git
   cd SentimenAnalisiKaggle
   ```
2. Install dependencies:
   ```bash
   pip install -r requirements.txt
   ```
3. Run the notebook - dataset will be loaded from the `dataset/` folder

## Dataset Information

- **Source**: Twitter Entity Sentiment Analysis (Kaggle)
- **Location**: `dataset/` folder in this repository
- **Files**:
  - `twitter_training.csv` - Main training dataset
  - `twitter_validation.csv` - Test/validation dataset (if applicable)

## Important Notes

- ✅ No Kaggle API credentials required
- ✅ Dataset included in repository
- ✅ Works on Google Colab and local environments
- ✅ Automatic environment detection

---

In [None]:
"""
============================================================================
SENTIMENT ANALYSIS - TWITTER DATASET
============================================================================
Target: Training & Testing Accuracy > 92%

Models:
1. Optimized Logistic Regression + TF-IDF
2. BiLSTM + Attention Mechanism  
3. Multi-Filter CNN
============================================================================
"""

# Data Processing
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm

# NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Deep Learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Bidirectional, Dense, 
    Dropout, Conv1D, GlobalMaxPooling1D, Concatenate, Layer
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Download the NLTK resources used by the preprocessing helpers (tokenizer
# models and the English stopword list).
# A failed download (for example, no network) must not abort the notebook,
# because the resources may already be cached locally -- but the failure is
# reported instead of being silently swallowed.
# NOTE(review): 'punkt_tab' is the tokenizer resource that recent NLTK
# releases look up for word_tokenize; older releases use 'punkt'. Both are
# requested so either version works -- confirm against the pinned NLTK version.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    try:
        if not nltk.download(_nltk_resource, quiet=True):
            print(f"⚠️  NLTK resource '{_nltk_resource}' could not be downloaded")
    except Exception as exc:
        print(f"⚠️  NLTK download of '{_nltk_resource}' failed: {exc}")

# Set random seeds so NumPy and TensorFlow random operations are repeatable
np.random.seed(42)
tf.random.set_seed(42)

# Visualization settings
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

In [None]:
# ============================================================================# SETUP ENVIRONMENT - AUTO-DETECT COLAB vs LOCAL# ============================================================================import osimport sys# Detect if running in Google Colabtry:    import google.colab    IN_COLAB = True    print("🔍 Environment: Google Colab")except ImportError:    IN_COLAB = False    print("🔍 Environment: Local")# Setup dataset pathif IN_COLAB:    print("\n📥 Setting up Google Colab environment...")        # Check if repository is already cloned    if not os.path.exists('/content/SentimenAnalisiKaggle'):        print("⏳ Cloning repository from GitHub...")        !git clone https://github.com/AlfanArzaqi/SentimenAnalisiKaggle.git        print("✅ Repository cloned successfully!")    else:        print("✅ Repository already exists")        # Set base path to cloned repository    BASE_PATH = '/content/SentimenAnalisiKaggle'    os.chdir(BASE_PATH)    print(f"📁 Working directory: {os.getcwd()}")    else:    # Local environment - assume dataset folder is in same directory    BASE_PATH = os.path.dirname(os.path.abspath('__file__')) if '__file__' in globals() else os.getcwd()    print(f"📁 Working directory: {BASE_PATH}")# Set dataset pathDATASET_PATH = os.path.join(BASE_PATH, 'dataset')# Verify dataset folder existsif os.path.exists(DATASET_PATH):    print(f"✅ Dataset folder found: {DATASET_PATH}")        # List available files    dataset_files = os.listdir(DATASET_PATH)    print(f"\n📄 Available dataset files:")    for file in dataset_files:        file_path = os.path.join(DATASET_PATH, file)        file_size = os.path.getsize(file_path) / (1024 * 1024)  # Convert to MB        print(f"   - {file} ({file_size:.2f} MB)")else:    print(f"⚠️  Dataset folder not found at: {DATASET_PATH}")    print("   Please ensure the 'dataset' folder exists with CSV files.")    # Set to current directory as fallback    DATASET_PATH = "."print("\n" + "="*70)print("✅ Setup complete! 
Ready to load data.")print("="*70)

In [None]:
# ============================================================================
# LOAD DATA
# ============================================================================
print("📂 Loading dataset from repository...")

# Construct the full path to the CSV file
train_csv = os.path.join(DATASET_PATH, 'twitter_training.csv')

# Check if file exists; otherwise fall back to another CSV in the folder
if not os.path.exists(train_csv):
    print(f"⚠️  File not found at: {train_csv}")
    print("    Searching for CSV files...")

    # Try to find any CSV file with 'train' in the name
    try:
        # Sorted so the chosen fallback does not depend on directory order
        csv_files = sorted(f for f in os.listdir(DATASET_PATH) if f.endswith('.csv'))
        train_files = [f for f in csv_files if 'train' in f.lower()]

        if train_files:
            train_csv = os.path.join(DATASET_PATH, train_files[0])
            print(f"    ✅ Found: {train_files[0]}")
        elif csv_files:
            train_csv = os.path.join(DATASET_PATH, csv_files[0])
            print(f"    ✅ Using: {csv_files[0]}")
        else:
            raise FileNotFoundError("No CSV files found in dataset folder")
    except Exception as e:
        # Explain what is expected, then re-raise so the notebook stops here
        print(f"    ❌ Error: {e}")
        print("\n⚠️  Please ensure dataset files are in the 'dataset' folder:")
        print("    - twitter_training.csv (required)")
        print("    - twitter_validation.csv (optional)")
        raise

# Load dataset; the CSV is read without a header row, so column names are
# supplied explicitly.
print(f"\n⏳ Loading: {os.path.basename(train_csv)}")
data = pd.read_csv(train_csv, header=None,
                   names=['Tweet ID', 'entity', 'sentiment', 'Tweet content'])

print(f"✅ Dataset loaded successfully!")
print(f"   Source: {train_csv}")
print(f"\nDataset shape: {data.shape}")
print(f"\nFirst 5 rows:")
display(data.head())
print(f"\nDataset info:")
data.info()

In [None]:
# ============================================================================
# DATA EXPLORATION
# ============================================================================

print("\n" + 80 * "=")
print("📊 DATA EXPLORATION")
print(80 * "=")


def _plot_sentiment_bars(axis, title, bar_color):
    """Draw the current per-label counts of data['sentiment'] on ``axis``."""
    data['sentiment'].value_counts().plot(kind='bar', ax=axis, color=bar_color)
    axis.set_title(title, fontsize=14, weight='bold')
    axis.set_xlabel('Sentiment')
    axis.set_ylabel('Count')
    axis.tick_params(axis='x', rotation=0)


# Missing values per column
print("\n=== MISSING VALUES ===")
print(data.isnull().sum())

# Label counts before any cleaning
print("\n=== ORIGINAL SENTIMENT DISTRIBUTION ===")
print(data['sentiment'].value_counts())

# One figure, two panels: raw labels on the left, merged labels on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
left_panel, right_panel = axes

_plot_sentiment_bars(left_panel, 'Original Sentiment Distribution', 'skyblue')

# Only the label and the tweet text are kept from here on
data = data.drop(['Tweet ID', 'entity'], axis=1)

# Rows containing any NaN are discarded
data = data.dropna()
print(f"\n✅ Data after removing NaN: {data.shape}")

# Fold 'Irrelevant' into 'Neutral' (4 → 3 classes)
data['sentiment'] = data['sentiment'].replace('Irrelevant', 'Neutral')

print("\n=== MERGED SENTIMENT DISTRIBUTION (3 CLASSES) ===")
print(data['sentiment'].value_counts())

_plot_sentiment_bars(right_panel, 'Sentiment Distribution (3 Classes)', 'coral')

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# DATA PREPROCESSING
# ============================================================================

# Section banner
print("\n" + 80 * "=")
print("🔧 DATA PREPROCESSING")
print(80 * "=")

# Text-cleaning helper functions
def lowercase(text):
    """Return ``text`` with its ``lower()`` method applied."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip tweet noise (links, mentions, retweet markers, digits, ...).

    The substitutions run in a fixed order and later steps depend on earlier
    ones, so the order must be kept.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The cleaned text with whitespace runs collapsed to single spaces and
        leading/trailing whitespace removed.
    """
    # Twitter picture links look like "pic.twitter.com/<id>". The previous
    # pattern required a literal "." after "com" and so never matched them;
    # "[./]" accepts the real "/" form and still accepts the old "." form.
    text = re.sub(r'pic\.twitter\.com[./][^\s]+', '', text)
    # Web links: anything starting with "www." or an http(s) scheme
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    # Newlines become spaces
    text = re.sub(r'\n', ' ', text)
    # Stand-alone retweet marker "rt" (any letter case)
    text = re.sub(r'\brt\b', ' ', text, flags=re.IGNORECASE)
    # @mentions
    text = re.sub(r'@[^\s]+', ' ', text)
    # A character repeated three or more times is cut down to two ("sooo" -> "soo")
    text = re.sub(r'(.)\1\1+', r'\1\1', text)
    # Every character above U+00E2 is replaced by a space
    text = re.sub(r'[^\x00-\xe2]+', ' ', text)
    text = re.sub(r':', '', text)
    # NOTE(review): the first character of this sequence lies above U+00E2 and
    # has already been replaced by the previous step, so this substitution
    # looks unreachable -- confirm before removing or reordering it.
    text = re.sub(r'‚Ä¶', '', text)
    # Digits
    text = re.sub(r'\d+', '', text)
    # Single ASCII letters standing alone as a word
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    # Collapse whitespace runs left behind by the steps above
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def remove_nonalphanumeric(text):
    """Replace non-alphanumeric runs, and literal ``00`` pairs, with spaces.

    Each run of characters outside ``0-9a-zA-Z`` becomes one space, then every
    ``00`` pair becomes a space too, and the result is stripped of
    leading/trailing whitespace.
    """
    alnum_only = re.sub(r'[^0-9a-zA-Z]+', ' ', text)
    return re.sub(r'00', ' ', alnum_only).strip()

def tokenize(text):
    """Run NLTK's ``word_tokenize`` on ``text`` and return its result."""
    tokens = word_tokenize(text)
    return tokens

# Cache for the English stopword set. It is filled on the first call to
# remove_stopwords() so the corpus is loaded once, not once per tweet.
_ENGLISH_STOPWORDS = None


def remove_stopwords(tokens):
    """Drop English stopwords from a sequence of tokens.

    Args:
        tokens: Iterable of token strings. The comparison is exact, so the
            tokens are presumably expected to be lowercase already -- verify
            against the caller.

    Returns:
        A new list with the tokens that are not in NLTK's English stopword
        list, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return [t for t in tokens if t not in _ENGLISH_STOPWORDS]

def stemming(text):
    """Return ``text`` after NLTK's English Snowball stemmer has processed it.

    A new stemmer is constructed on every call, and ``text`` is handed to its
    ``stem`` method in one piece.
    """
    return SnowballStemmer(language='english').stem(text)

def preprocess(text):
    """Run the complete cleaning pipeline on one tweet.

    Steps: lowercase -> remove tweet noise -> keep alphanumerics -> tokenize
    -> remove stopwords -> stem each remaining token.

    Args:
        text: Raw tweet content. Missing values and empty strings are
            accepted; other non-string values are converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces, or ``None`` when the
        input is missing/empty or nothing is left after cleaning.
    """
    if pd.isna(text) or text == '':
        return None

    # Guard against non-string cells (e.g. a tweet consisting only of a number)
    if not isinstance(text, str):
        text = str(text)

    text = lowercase(text)
    text = remove_unnecessary_char(text)
    text = remove_nonalphanumeric(text)

    tokens = tokenize(text)
    # Stopwords are removed before stemming so they are still in the
    # dictionary form that the stopword list contains.
    tokens = remove_stopwords(tokens)
    # A stemmer works on single words; passing the whole sentence would treat
    # it as one long word. Stem token by token and drop any empty result.
    tokens = [stem for stem in (stemming(token) for token in tokens) if stem]

    return ' '.join(tokens) if tokens else None

# Run the cleaning pipeline over every tweet, with a tqdm progress bar
print("\n⏳ Preprocessing texts...")
tqdm.pandas(desc="Processing")
data['cleaned_text'] = data['Tweet content'].progress_apply(preprocess)

# Discard rows whose cleaned text is missing, then rows where it is empty
data = data[data['cleaned_text'].notna()]
data = data[data['cleaned_text'].ne('')]

print(f"\n✅ Preprocessing complete! Final shape: {data.shape}")

# Print the first three rows before and after cleaning (100 chars each)
print("\n=== PREPROCESSING EXAMPLES ===")
for example_no in (1, 2, 3):
    example_row = data.iloc[example_no - 1]
    print(f"\n{example_no}. Original: {example_row['Tweet content'][:100]}...")
    print(f"   Cleaned:  {example_row['cleaned_text'][:100]}...")

In [None]:
# ============================================================================
# PREPARE DATA FOR MODELING
# ============================================================================

print("\n" + 80 * "=")
print("🎯 PREPARE DATA FOR MODELING")
print(80 * "=")

# Features are the cleaned texts; labels are the sentiment strings
X = data['cleaned_text'].values
y = data['sentiment'].values

# Map the label strings to integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

class_to_code = dict(
    zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
)

print(f"\n=== LABEL ENCODING ===")
print(f"Classes: {label_encoder.classes_}")
print(f"Encoded: {class_to_code}")
print(f"\nTotal samples: {len(X)}")
print(f"Class distribution:")
for class_name in label_encoder.classes_:
    class_count = np.count_nonzero(y == class_name)
    class_share = class_count / len(y) * 100
    print(f"  {class_name}: {class_count} ({class_share:.1f}%)")

In [None]:
# ============================================================================
# MODEL 1: LOGISTIC REGRESSION + TF-IDF
# ============================================================================

print("\n" + 80 * "=")
print("🤖 MODEL 1: LOGISTIC REGRESSION + TF-IDF")
print(80 * "=")

# Stratified 80/20 split with a fixed seed
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)
print(f"\nData split: {len(X_train_lr)} train, {len(X_test_lr)} test")

# TF-IDF features: unigrams and bigrams, capped at 10,000 terms
print("\n⏳ Creating TF-IDF features...")
tfidf_settings = dict(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    strip_accents='unicode',
    lowercase=True,
    token_pattern=r'\w{2,}',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary is learned from the training texts only
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lr)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lr)
print(f"✅ TF-IDF shape: {X_train_tfidf.shape}")

# Fit the classifier
print("\n⏳ Training Logistic Regression...")
lr_settings = dict(
    C=1.0,
    penalty='l2',
    solver='saga',
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
lr_model = LogisticRegression(**lr_settings)
lr_model.fit(X_train_tfidf, y_train_lr)

# Predict on both splits and score them
y_train_pred_lr = lr_model.predict(X_train_tfidf)
y_test_pred_lr = lr_model.predict(X_test_tfidf)

train_acc_lr = accuracy_score(y_train_lr, y_train_pred_lr)
test_acc_lr = accuracy_score(y_test_lr, y_test_pred_lr)

print(f"\n{'='*50}")
print(f"📊 RESULTS")
print(f"{'='*50}")
print(f"Training Accuracy:   {train_acc_lr*100:.2f}%")
print(f"Testing Accuracy:    {test_acc_lr*100:.2f}%")
print(f"{'='*50}")

print("\n=== CLASSIFICATION REPORT ===")
lr_report = classification_report(
    y_test_lr,
    y_test_pred_lr,
    target_names=label_encoder.classes_,
)
print(lr_report)

# Confusion matrix on the test split, drawn as a heatmap
cm_lr = confusion_matrix(y_test_lr, y_test_pred_lr)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.title('Confusion Matrix - Logistic Regression', fontsize=14, weight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

print("\n✅ Model 1 training complete!")

In [None]:
# ============================================================================
# MODEL 2: BiLSTM + ATTENTION
# ============================================================================

print("\n" + 80 * "=")
print("🤖 MODEL 2: BiLSTM + ATTENTION MECHANISM")
print(80 * "=")

# Stratified 80/20 split with a fixed seed
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Turn texts into fixed-length integer sequences
print("\n⏳ Tokenizing texts...")
max_words = 10000   # vocabulary size limit given to the tokenizer
max_len = 100       # every sequence is padded/truncated to this length

# The word index is built from the training texts only
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_dl)

X_train_seq = tokenizer.texts_to_sequences(X_train_dl)
X_test_seq = tokenizer.texts_to_sequences(X_test_dl)

# Padding is added at the end of each sequence
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

print(f"✅ Sequences shape: {X_train_pad.shape}")

# One-hot targets for the categorical cross-entropy loss
num_classes = len(label_encoder.classes_)
y_train_cat = tf.keras.utils.to_categorical(y_train_dl, num_classes=num_classes)
y_test_cat = tf.keras.utils.to_categorical(y_test_dl, num_classes=num_classes)
# Define Attention Layer
class AttentionLayer(Layer):
    """Attention-style pooling over axis 1 of the input.

    The original author labels this "Bahdanau Attention". What the code does:
    it projects the input with a square weight matrix plus bias, applies
    tanh, normalises the scores with a softmax along axis 1, and returns the
    score-weighted sum of the input along that axis.

    # assumes inputs are (batch, timesteps, features), as produced by a
    # recurrent layer with return_sequences=True -- TODO confirm
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the square projection matrix and the bias vector."""
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='attention_weight',
            shape=(feature_dim, feature_dim),
            initializer='glorot_uniform',
            trainable=True,
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(feature_dim,),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` along axis 1."""
        projected = tf.tensordot(inputs, self.W, axes=1) + self.b
        scores = tf.nn.tanh(projected)
        # Softmax over axis 1, computed separately for each feature
        weights = tf.nn.softmax(scores, axis=1)
        return tf.reduce_sum(weights * inputs, axis=1)

# Build model
print("\n⏳ Building BiLSTM + Attention model...")

input_layer = Input(shape=(max_len,))
# The sequence length is already fixed by the Input layer, so the deprecated
# `input_length` argument is not passed to Embedding.
embedding = Embedding(max_words, 128)(input_layer)
# return_sequences=True keeps one output per timestep for the attention layer
bilstm = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3))(embedding)
attention = AttentionLayer()(bilstm)
dense1 = Dense(128, activation='relu')(attention)
dropout1 = Dropout(0.5)(dense1)
dense2 = Dense(64, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense2)
output = Dense(num_classes, activation='softmax')(dropout2)

bilstm_model = Model(inputs=input_layer, outputs=output)
bilstm_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
bilstm_model.summary()

# Callbacks: stop when val_loss has not improved for 5 epochs (restoring the
# best weights) and halve the learning rate after 3 epochs without improvement.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7)

# Train; 10% of the training data is held out for validation
print("\n⏳ Training BiLSTM + Attention...")
history_bilstm = bilstm_model.fit(
    X_train_pad, y_train_cat,
    validation_split=0.1,
    epochs=30,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Evaluate. The "training" score is computed on all of X_train_pad, which
# includes the rows that fit() held out for validation.
train_loss_bilstm, train_acc_bilstm = bilstm_model.evaluate(X_train_pad, y_train_cat, verbose=0)
test_loss_bilstm, test_acc_bilstm = bilstm_model.evaluate(X_test_pad, y_test_cat, verbose=0)

print(f"\n{'='*50}")
print(f"📊 RESULTS")
print(f"{'='*50}")
print(f"Training Accuracy:   {train_acc_bilstm*100:.2f}%")
print(f"Testing Accuracy:    {test_acc_bilstm*100:.2f}%")
print(f"{'='*50}")

# Plot history: accuracy on the left panel, loss on the right panel
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for axis, metric, metric_label in zip(axes, ('accuracy', 'loss'), ('Accuracy', 'Loss')):
    axis.plot(history_bilstm.history[metric], label='Train', linewidth=2)
    axis.plot(history_bilstm.history[f'val_{metric}'], label='Validation', linewidth=2)
    axis.set_title(f'BiLSTM + Attention: {metric_label}', fontsize=14, weight='bold')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(metric_label)
    axis.legend()
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Confusion Matrix: argmax turns the softmax outputs into class indices
y_test_pred_bilstm = bilstm_model.predict(X_test_pad, verbose=0)
y_test_pred_bilstm = np.argmax(y_test_pred_bilstm, axis=1)

cm_bilstm = confusion_matrix(y_test_dl, y_test_pred_bilstm)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_bilstm, annot=True, fmt='d', cmap='Greens',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix - BiLSTM + Attention', fontsize=14, weight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

print("\n✅ Model 2 training complete!")

In [None]:
# ============================================================================
# MODEL 3: MULTI-FILTER CNN
# ============================================================================

print("\n" + "="*80)
print("🤖 MODEL 3: MULTI-FILTER CNN")
print("="*80)

# Split data (70/30)
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)
print(f"\nData split: {len(X_train_cnn)} train, {len(X_test_cnn)} test")

# Tokenization (reuse tokenizer)
# NOTE(review): `tokenizer` was fitted on the 80/20 training split of the
# BiLSTM cell. This cell uses a different (70/30) split, so some CNN test
# texts may have contributed to the vocabulary -- confirm this is acceptable.
X_train_cnn_seq = tokenizer.texts_to_sequences(X_train_cnn)
X_test_cnn_seq = tokenizer.texts_to_sequences(X_test_cnn)

X_train_cnn_pad = pad_sequences(X_train_cnn_seq, maxlen=max_len, padding='post')
X_test_cnn_pad = pad_sequences(X_test_cnn_seq, maxlen=max_len, padding='post')

y_train_cnn_cat = tf.keras.utils.to_categorical(y_train_cnn, num_classes)
y_test_cnn_cat = tf.keras.utils.to_categorical(y_test_cnn, num_classes)

# Build model
print("\n⏳ Building Multi-Filter CNN...")

input_layer = Input(shape=(max_len,))
# The sequence length is already fixed by the Input layer, so the deprecated
# `input_length` argument is not passed to Embedding.
embedding = Embedding(max_words, 128)(input_layer)

# Multiple filter sizes: one convolution + global max pooling branch per
# kernel width, all reading the same embedding output.
filter_sizes = [2, 3, 4, 5]
conv_layers = []

for filter_size in filter_sizes:
    conv = Conv1D(128, kernel_size=filter_size, activation='relu')(embedding)
    pool = GlobalMaxPooling1D()(conv)
    conv_layers.append(pool)

# The pooled branches are joined into a single feature vector
concat = Concatenate()(conv_layers)
dense1 = Dense(256, activation='relu')(concat)
dropout1 = Dropout(0.5)(dense1)
dense2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense2)
output = Dense(num_classes, activation='softmax')(dropout2)

cnn_model = Model(inputs=input_layer, outputs=output)
cnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
cnn_model.summary()

# Train; the early_stop and reduce_lr callbacks created in the BiLSTM cell
# are reused here, and 10% of the training data is held out for validation.
print("\n⏳ Training Multi-Filter CNN...")
history_cnn = cnn_model.fit(
    X_train_cnn_pad, y_train_cnn_cat,
    validation_split=0.1,
    epochs=30,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Evaluate. The "training" score is computed on all of X_train_cnn_pad,
# which includes the rows that fit() held out for validation.
train_loss_cnn, train_acc_cnn = cnn_model.evaluate(X_train_cnn_pad, y_train_cnn_cat, verbose=0)
test_loss_cnn, test_acc_cnn = cnn_model.evaluate(X_test_cnn_pad, y_test_cnn_cat, verbose=0)

print(f"\n{'='*50}")
print(f"📊 RESULTS")
print(f"{'='*50}")
print(f"Training Accuracy:   {train_acc_cnn*100:.2f}%")
print(f"Testing Accuracy:    {test_acc_cnn*100:.2f}%")
print(f"{'='*50}")

# Plot history: accuracy on the left panel, loss on the right panel
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for axis, metric, metric_label in zip(axes, ('accuracy', 'loss'), ('Accuracy', 'Loss')):
    axis.plot(history_cnn.history[metric], label='Train', linewidth=2)
    axis.plot(history_cnn.history[f'val_{metric}'], label='Validation', linewidth=2)
    axis.set_title(f'Multi-Filter CNN: {metric_label}', fontsize=14, weight='bold')
    axis.set_xlabel('Epoch')
    axis.set_ylabel(metric_label)
    axis.legend()
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Confusion Matrix: argmax turns the softmax outputs into class indices
y_test_pred_cnn = cnn_model.predict(X_test_cnn_pad, verbose=0)
y_test_pred_cnn = np.argmax(y_test_pred_cnn, axis=1)

cm_cnn = confusion_matrix(y_test_cnn, y_test_pred_cnn)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_cnn, annot=True, fmt='d', cmap='Oranges',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix - Multi-Filter CNN', fontsize=14, weight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

print("\n✅ Model 3 training complete!")

In [None]:
# ============================================================================
# MODEL COMPARISON
# ============================================================================

print("\n" + 80 * "=")
print("📊 FINAL MODEL COMPARISON")
print(80 * "=")

# One row per model; accuracies are stored as percentages
comparison = pd.DataFrame({
    'Model': [
        'Logistic Regression + TF-IDF',
        'BiLSTM + Attention',
        'Multi-Filter CNN',
    ],
    'Train Accuracy (%)': [
        train_acc_lr * 100,
        train_acc_bilstm * 100,
        train_acc_cnn * 100,
    ],
    'Test Accuracy (%)': [
        test_acc_lr * 100,
        test_acc_bilstm * 100,
        test_acc_cnn * 100,
    ],
    'Data Split': ['80/20', '80/20', '70/30'],
    'Feature Extraction': ['TF-IDF (1-2gram)', 'Trainable Embedding', 'Trainable Embedding'],
})

print("\n=== MODEL COMPARISON TABLE ===")
display(comparison)

# Grouped bar chart: train and test accuracy side by side for each model
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(comparison))
width = 0.35

bars1 = ax.bar(
    x - width/2,
    comparison['Train Accuracy (%)'],
    width,
    label='Train Accuracy',
    color='skyblue',
)
bars2 = ax.bar(
    x + width/2,
    comparison['Test Accuracy (%)'],
    width,
    label='Test Accuracy',
    color='coral',
)

ax.set_xlabel('Model', fontsize=12, weight='bold')
ax.set_ylabel('Accuracy (%)', fontsize=12, weight='bold')
ax.set_title('Model Comparison: Train vs Test Accuracy', fontsize=14, weight='bold')
ax.set_xticks(x)
ax.set_xticklabels(comparison['Model'], rotation=15, ha='right')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Write each bar's value just above it
for bar in [*bars1, *bars2]:
    bar_height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2.,
        bar_height,
        f'{bar_height:.1f}%',
        ha='center',
        va='bottom',
        fontsize=10,
    )

plt.tight_layout()
plt.show()

# Which models reach the target on BOTH splits?
print("\n" + 80 * "=")
print("🎯 TARGET ACHIEVEMENT CHECK (>92%)")
print(80 * "=")

target_threshold = 92.0
models_above_92 = []

for _, row in comparison.iterrows():
    model_name = row['Model']
    train_acc = row['Train Accuracy (%)']
    test_acc = row['Test Accuracy (%)']

    # The comparison is inclusive (>=) although the banner reads ">92%"
    status = "❌ NOT ACHIEVED"
    if train_acc >= target_threshold and test_acc >= target_threshold:
        models_above_92.append(model_name)
        status = "✅ ACHIEVED"

    print(f"\n{model_name}:")
    print(f"  Train: {train_acc:.2f}% | Test: {test_acc:.2f}% | {status}")

print("\n" + 80 * "=")
if not models_above_92:
    print("⚠️  None of the models achieved >92% target yet.")
    print("   Consider: more epochs, hyperparameter tuning, or data augmentation")
else:
    print(f"✅ {len(models_above_92)} model(s) achieved >92% target!")
    for model in models_above_92:
        print(f"   - {model}")

In [None]:
# ============================================================================
# INFERENCE / TESTING
# ============================================================================

print("\n" + 80 * "=")
print("🔮 INFERENCE / TESTING EXAMPLES")
print(80 * "=")

# Hand-written sentences to run through all three models
test_samples = [
    "I absolutely love this product! Best purchase ever!",
    "This is terrible. Worst experience of my life.",
    "It's okay, nothing special really.",
    "Amazing quality and fast delivery! Highly recommend!",
    "Very disappointed. Would not buy again.",
]

print("\n=== TESTING NEW SAMPLES ===\n")

for i, sample in enumerate(test_samples, start=1):
    print(f"{'='*70}")
    print(f"Sample {i}: {sample}")

    # Same cleaning pipeline as the training data; it yields None when
    # nothing is left after cleaning
    cleaned = preprocess(sample)
    if not cleaned:
        print("  ⚠️  Preprocessing failed")
        continue

    print(f"Cleaned: {cleaned}")
    print(f"\n📊 Predictions:")

    # Model 1: Logistic Regression on TF-IDF features
    sample_tfidf = tfidf_vectorizer.transform([cleaned])
    pred_lr = lr_model.predict(sample_tfidf)[0]
    pred_lr_proba = lr_model.predict_proba(sample_tfidf)[0]

    print(f"  LR:     {label_encoder.classes_[pred_lr]:12} (confidence: {pred_lr_proba[pred_lr]:.3f})")

    # Both neural models read the same padded integer sequence
    sample_seq = tokenizer.texts_to_sequences([cleaned])
    sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding='post')

    # Model 2: BiLSTM
    pred_bilstm = bilstm_model.predict(sample_pad, verbose=0)
    pred_bilstm_class = np.argmax(pred_bilstm[0])

    print(f"  BiLSTM: {label_encoder.classes_[pred_bilstm_class]:12} (confidence: {pred_bilstm[0][pred_bilstm_class]:.3f})")

    # Model 3: CNN
    pred_cnn = cnn_model.predict(sample_pad, verbose=0)
    pred_cnn_class = np.argmax(pred_cnn[0])

    print(f"  CNN:    {label_encoder.classes_[pred_cnn_class]:12} (confidence: {pred_cnn[0][pred_cnn_class]:.3f})")
    print()

print(70 * "=")
print("\n✅ Inference complete!")

In [None]:
# ============================================================================
# SAVE MODELS
# ============================================================================

print("\n" + 80 * "=")
print("💾 SAVING MODELS")
print(80 * "=")

import pickle
import json

# Pickle the Logistic Regression model, then its TF-IDF vectorizer
for artifact, target_file in (
    (lr_model, 'logistic_regression_model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
):
    with open(target_file, 'wb') as f:
        pickle.dump(artifact, f)
    print(f"✅ Saved: {target_file}")

# Keras models are written as .h5 files
for keras_model, target_file in (
    (bilstm_model, 'bilstm_attention_model.h5'),
    (cnn_model, 'multi_filter_cnn_model.h5'),
):
    keras_model.save(target_file)
    print(f"✅ Saved: {target_file}")

# The tokenizer's own JSON string is written through json.dump
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as f:
    json.dump(tokenizer_json, f)
print("✅ Saved: tokenizer.json")

# Pickle the label encoder
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)
print("✅ Saved: label_encoder.pkl")

print("\n" + 80 * "=")
print("✅ ALL MODELS SAVED SUCCESSFULLY!")
print(80 * "=")

# Closing summary of the whole run
class_names_joined = ', '.join(label_encoder.classes_)
print("\n📊 Final Summary:")
print(f"   Dataset: {len(data)} samples")
print(f"   Classes: {len(label_encoder.classes_)} ({class_names_joined})")
print(f"   Models trained: 3")
print(f"   Models achieving >92%: {len(models_above_92)}")

print("\n🎉 SENTIMENT ANALYSIS PROJECT COMPLETE!")