In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin



In [2]:
# Load the IMDB reviews and binary-encode the label column (positive -> 1, negative -> 0).
df = pd.read_csv("IMDB Dataset.csv").assign(
    sentiment=lambda frame: frame['sentiment'].map({'positive': 1, 'negative': 0})
)

# Hold out 20% of the reviews for testing; the fixed random_state keeps the split reproducible.
reviews, labels = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Sequence-model preprocessing settings:
#   max_words -> passed to Tokenizer(num_words=...)
#   max_len   -> passed to pad_sequences(maxlen=...)
max_words, max_len = 20000, 300

# The tokenizer is fitted on the training reviews only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Encode both splits with the same fitted tokenizer and pad them to max_len.
X_train_seq, X_test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)
    for texts in (X_train, X_test)
)

In [6]:
# TF-IDF features: the vectorizer is fitted on the training reviews, then the
# same fitted vectorizer transforms the test reviews.
tfidf_vectorizer = TfidfVectorizer(max_features=20000)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),  # evaluated first: fits and transforms train
    tfidf_vectorizer.transform(X_test),       # evaluated second: reuses the fitted vectorizer
)

In [None]:
from sklearn.svm import SVC

# SVM baseline on the TF-IDF features.
# probability=True is requested because the stacking step needs probability outputs.
svm_model = SVC(probability=True).fit(X_train_tfidf, y_train)
svm_model  # fit() returns the estimator itself; keep it as the cell's displayed value

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest baseline on the TF-IDF features (100 trees, fixed seed).
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_tfidf, y_train)
rf_model  # fit() returns the estimator itself; keep it as the cell's displayed value

In [None]:
## Training LSTMs

# Width of the learned word vectors fed into the LSTM.
embedding_dim = 50

# Build LSTM Model: token ids -> embeddings -> single LSTM layer -> sigmoid output.
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),  # one sigmoid unit for the binary 0/1 sentiment label
])

# The original compile(...) call was missing its closing parenthesis, which made
# the whole cell a SyntaxError.
lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
lstm_model.fit(X_train_seq, y_train, epochs=3, batch_size=64, verbose=1)


In [None]:
## Wrapper for LSTM Prediction

class LSTMWrapper(BaseEstimator,TransformerMixin):
    """Scikit-learn style transformer around the already-trained ``lstm_model``.

    The wrapper does no training of its own: ``fit`` is a no-op and
    ``transform`` tokenizes and pads raw texts with the notebook-level
    ``tokenizer`` / ``max_len`` before calling ``lstm_model.predict``.

    NOTE: a later cell defines a class with the same name, which replaces this
    one as soon as that cell is executed.
    """

    def fit(self, x, y=None):
        """No-op fit; returns ``self`` so calls can be chained.

        ``y`` defaults to ``None`` so ``fit_transform(x)`` also works without labels.
        """
        return self

    def transform(self, x):
        """Return ``lstm_model.predict`` output for the raw texts in ``x``."""
        # Use the `x` argument here: the previous code read an undefined name `X`
        # and raised NameError on the first call.
        x_seq = pad_sequences(tokenizer.texts_to_sequences(x), maxlen=max_len)
        return lstm_model.predict(x_seq)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Wrapper for LSTM predictions
class LSTMWrapper(ClassifierMixin, TransformerMixin, BaseEstimator):
    """Expose the already-trained ``lstm_model`` as a scikit-learn classifier.

    StackingClassifier only accepts classifiers as base estimators, so the
    wrapper provides ``predict_proba`` / ``predict`` in addition to the
    original ``transform``. Mixins are listed before ``BaseEstimator`` so the
    estimator is recognised as a classifier.

    NOTE(review): ``fit`` does not retrain the network. ``lstm_model`` was
    trained on the full training split, so the cross-validated predictions that
    stacking collects from this estimator are not truly out-of-fold and the
    meta-learner may over-trust the LSTM. Retrain per fold if that matters.
    """

    def fit(self, X, y):
        """Record the class labels; the wrapped network is left untouched."""
        self.classes_ = np.unique(y)
        return self

    def transform(self, X):
        """Return the raw ``lstm_model.predict`` output for the texts in ``X``."""
        X_seq = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=max_len)
        return lstm_model.predict(X_seq)

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of ``[P(class 0), P(class 1)]``."""
        # The network ends in a single sigmoid unit, so its output is P(class 1).
        positive = np.asarray(self.transform(X)).reshape(-1)
        return np.column_stack([1.0 - positive, positive])

    def predict(self, X):
        """Return hard labels using the same 0.5 threshold as the evaluation cell."""
        return self.classes_[(self.predict_proba(X)[:, 1] > 0.5).astype(int)]


def _text_pipeline(classifier):
    """Build an unfitted TF-IDF -> classifier pipeline that accepts raw text.

    Both parts are cloned, so the hyper-parameters of the notebook-level
    objects are reused without touching their fitted state.
    """
    return make_pipeline(clone(tfidf_vectorizer), clone(classifier))


# Combine models in StackingClassifier.
# The stack is fitted on raw review text, so every TF-IDF based model is wrapped
# in its own vectorizer pipeline. Estimators are passed unfitted: stacking
# clones and refits them itself (including on its internal CV folds), so this
# cell is expensive to run.
stacked_model = StackingClassifier(
    estimators=[
        ('log_reg', _text_pipeline(LogisticRegression())),
        ('svm', _text_pipeline(svm_model)),
        ('rf', _text_pipeline(rf_model)),
        ('lstm', LSTMWrapper()),
    ],
    final_estimator=LogisticRegression(),
)

# Train StackingClassifier
stacked_model.fit(X_train, y_train)

# Evaluate
y_pred = stacked_model.predict(X_test)
print("Ensemble Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
from sklearn.metrics import classification_report

# Evaluate individual models
# The notebook has no standalone fitted logistic regression and no bag-of-words
# matrices, so the baseline is fitted here on the same TF-IDF features used by
# the SVM and Random Forest models.
log_reg = LogisticRegression().fit(X_train_tfidf, y_train)

log_reg_pred = log_reg.predict(X_test_tfidf)
svm_pred = svm_model.predict(X_test_tfidf)
rf_pred = rf_model.predict(X_test_tfidf)
# Threshold the sigmoid output at 0.5 and flatten it to a 1-D label vector.
lstm_pred = (lstm_model.predict(X_test_seq) > 0.5).astype(int).ravel()

# One report per model, printed in the original order (the ensemble comes last).
predictions_by_model = {
    "Logistic Regression": log_reg_pred,
    "SVM": svm_pred,
    "Random Forest": rf_pred,
    "LSTM": lstm_pred,
    "Stacking Ensemble": y_pred,
}
for model_name, model_pred in predictions_by_model.items():
    print(f"{model_name} Report:")
    print(classification_report(y_test, model_pred))
