# **IMDB**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Vocabulary cap passed to the Keras loader. `maxlen` is defined here but is
# not referenced by the code in this cell.
max_features = 5000
maxlen = 500
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)


# Build an id -> word lookup. Word ranks from `get_word_index()` are shifted
# by 3 and ids 0-2 are mapped to placeholder tokens.
word_index = imdb.get_word_index()
index_word = {rank + 3: word for word, rank in word_index.items()}
index_word.update({0: '<PAD>', 1: '<START>', 2: '<UNK>'})


def decode_review(seq):
    """Turn one integer-encoded review into a space-separated string.

    Ids missing from `index_word` are rendered as '?'.
    """
    return ' '.join(index_word.get(token_id, '?') for token_id in seq)


# Plain-text versions of both splits, in the original order.
X_train_text = [decode_review(seq) for seq in X_train]
X_test_text = [decode_review(seq) for seq in X_test]



# Learn the TF-IDF vocabulary and IDF weights from the training text only,
# then encode the test text with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec = vectorizer.transform(X_test_text)

# Show the first encoded review of each split (sparse row: coords + values).
print(X_train_vec[0], X_test_vec[0], sep='\n')



# Logistic-regression baseline on the TF-IDF features.
# The estimator is constructed exactly once: the previous code built
# LogisticRegression(max_iter=200) and then immediately replaced it with a
# default-configured instance, so the explicit iteration budget was silently
# discarded. max_iter is only an upper bound on solver iterations; a fit that
# already converges earlier is unaffected by the larger budget.
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_vec, y_train)

# Evaluate on the held-out split; label 0 is reported as "Negative", 1 as "Positive".
y_pred_log_reg = log_reg.predict(X_test_vec)
print(classification_report(y_test, y_pred_log_reg, target_names=["Negative", "Positive"]))



# Multinomial Naive Bayes baseline on the same TF-IDF features
# (`fit` returns the estimator itself, so construction and fitting are chained).
naive_bayes = MultinomialNB().fit(X_train_vec, y_train)

# Held-out predictions and per-class report (0 -> "Negative", 1 -> "Positive").
y_pred_nb = naive_bayes.predict(X_test_vec)
nb_report = classification_report(
    y_test,
    y_pred_nb,
    target_names=["Negative", "Positive"],
)
print(nb_report)



# Linear SVM baseline on the TF-IDF features.
# liblinear uses a random number generator while fitting, so an explicit
# random_state is passed to make repeated runs of this cell reproducible
# (it has no effect when the solver variant in use is deterministic).
svm = LinearSVC(random_state=42)
svm.fit(X_train_vec, y_train)

# Evaluate on the held-out split; label 0 is reported as "Negative", 1 as "Positive".
y_pred_svm = svm.predict(X_test_vec)
print(classification_report(y_test, y_pred_svm, target_names=["Negative", "Positive"]))

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 112 stored elements and shape (1, 4898)>
  Coords	Values
  (0, 4081)	0.018161574791954474
  (0, 4372)	0.05989429002682193
  (0, 1688)	0.17260869183197405
  (0, 4714)	0.2868937029805583
  (0, 2393)	0.13532479702216973
  (0, 578)	0.2283618610749408
  (0, 703)	0.08695901899640085
  (0, 2586)	0.09754452672825938
  (0, 3746)	0.09391048358927476
  (0, 4131)	0.07964733966197848
  (0, 1247)	0.0721956312972466
  (0, 1520)	0.12936111313919155
  (0, 3496)	0.07943724066926205
  (0, 4208)	0.11654538443742532
  (0, 4346)	0.2746885888265036
  (0, 3135)	0.05562279686636129
  (0, 4361)	0.13333473515845054
  (0, 3238)	0.12596764917385855
  (0, 223)	0.16901333435097304
  (0, 4888)	0.11680196561648677
  (0, 999)	0.04418418996113909
  (0, 2196)	0.08328246387682232
  (0, 444)	0.0945671688358273
  (0, 4358)	0.06679152646353043
  (0, 3649)	0.08132183922770295
  :	:
  (0, 3084)	0.03372306878332425
  (0, 2567)	0.08853344945729978
  (0, 4367)	0.091707

'\nlog_reg = LogisticRegression(max_iter=200)\nlog_reg = LogisticRegression()\nlog_reg.fit(X_train_vec, y_train)\ny_pred_log_reg = log_reg.predict(X_test_vec)\nprint(classification_report(y_test, y_pred_log_reg, target_names=["Negative", "Positive"]))\n\n\n\nnaive_bayes = MultinomialNB()\nnaive_bayes.fit(X_train_vec, y_train)\ny_pred_nb = naive_bayes.predict(X_test_vec)\nprint(classification_report(y_test, y_pred_nb, target_names=["Negative", "Positive"]))\n\n\n\nsvm = LinearSVC()\nsvm.fit(X_train_vec, y_train)\ny_pred_svm = svm.predict(X_test_vec)\nprint(classification_report(y_test, y_pred_svm, target_names=["Negative", "Positive"]))\n'

# **Multi-branch CNN-LSTM**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hyperparameters from the paper
vocab_size = 5000
embedding_dim = 32
max_sequence_length = 500
kernel_sizes = [3, 5, 7, 9]  # One convolutional branch per kernel size
num_filters = 128
pool_size = 2
lstm_units = 128
dropout_rate = 0.5


class MultiBranchCNN_LSTM(nn.Module):
    """Parallel Conv1d -> LSTM branches over a shared embedding, for binary output.

    Every branch applies, in order: Conv1d ('same' padding) -> ReLU ->
    MaxPool1d -> Dropout -> BatchNorm1d -> LSTM, and keeps the LSTM output of
    the last timestep. The branch outputs are concatenated and passed through
    a single linear unit followed by a sigmoid.

    All constructor arguments default to the module-level hyperparameters
    defined above, so ``MultiBranchCNN_LSTM()`` builds the same network as
    before; passing arguments allows other configurations.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        kernel_sizes: Iterable of Conv1d kernel sizes; one branch is created
            per entry.
        num_filters: Output channels of each convolution (and LSTM input size).
        pool_size: Kernel size of the max-pooling layer in each branch.
        lstm_units: Hidden size of each branch's LSTM.
        dropout_rate: Dropout probability used in each branch.
    """

    def __init__(self,
                 vocab_size=vocab_size,
                 embedding_dim=embedding_dim,
                 kernel_sizes=tuple(kernel_sizes),
                 num_filters=num_filters,
                 pool_size=pool_size,
                 lstm_units=lstm_units,
                 dropout_rate=dropout_rate):
        super().__init__()

        # Freeze the branch configuration so a caller-owned list cannot
        # change under us after construction.
        kernel_sizes = tuple(kernel_sizes)

        # Embedding layer shared by all branches
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # One branch (conv / pool / dropout / batch-norm / LSTM) per kernel size
        self.branches = nn.ModuleList()
        for k in kernel_sizes:
            branch = nn.ModuleDict({
                'conv': nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=k, padding='same'),
                'pool': nn.MaxPool1d(kernel_size=pool_size),
                'dropout': nn.Dropout(dropout_rate),
                'batch_norm': nn.BatchNorm1d(num_filters),
                'lstm': nn.LSTM(input_size=num_filters, hidden_size=lstm_units, batch_first=True)
            })
            self.branches.append(branch)

        # Final dense layer over the concatenated branch outputs
        self.fc = nn.Linear(lstm_units * len(kernel_sizes), 1)

    def forward(self, x):
        """Run the network on a batch of token-id sequences.

        Args:
            x: Integer tensor of shape [batch_size, seq_len] holding token ids
                valid for the embedding table.

        Returns:
            Tensor of shape [batch_size, 1] with sigmoid outputs.
        """
        x = self.embedding(x)                   # [batch_size, seq_len, embedding_dim]
        x = x.permute(0, 2, 1)                  # [batch_size, embedding_dim, seq_len] for Conv1d

        branch_outputs = []
        for branch in self.branches:
            out = F.relu(branch['conv'](x))      # Conv + ReLU, length preserved by 'same' padding
            out = branch['pool'](out)            # MaxPooling shortens the sequence
            out = branch['dropout'](out)         # Dropout
            out = branch['batch_norm'](out)      # BatchNorm over the filter channels

            out = out.permute(0, 2, 1)           # [batch_size, pooled_len, num_filters] for LSTM
            out, (h_n, c_n) = branch['lstm'](out)
            out = out[:, -1, :]                  # Keep only the last timestep
            branch_outputs.append(out)

        # Concatenate all branch outputs
        out = torch.cat(branch_outputs, dim=1)   # [batch_size, lstm_units * num_branches]
        out = torch.sigmoid(self.fc(out))        # Final output
        return out

# Example usage
model = MultiBranchCNN_LSTM()
print(model)

# Shape smoke test on random token ids.
batch_size = 32
dummy_input = torch.randint(0, vocab_size, (batch_size, max_sequence_length))

# Run the check in eval mode without autograd: in training mode dropout makes
# the output random and BatchNorm would update its running statistics from
# this random dummy batch. The previous mode is restored afterwards so any
# later training code sees the model exactly as a freshly built one.
was_training = model.training
model.eval()
with torch.no_grad():
    output = model(dummy_input)
model.train(was_training)

print(output.shape)  # Should be [32, 1]


# **Additional: inspecting features and single-review predictions**

In [None]:
# Number of reviews in each split: test first, then train.
print(len(X_test), len(X_train), sep='\n')


# For the first training review, list every vocabulary term that has a
# non-zero TF-IDF entry together with that entry.
feature_names = vectorizer.get_feature_names_out()
first_doc = X_train_vec[0]
term_columns = first_doc.nonzero()[1]
for col in term_columns:
    print(feature_names[col], first_doc[0, col])


25000
25000


In [None]:
# Rebuild index-word dictionary if you haven't already
from tensorflow.keras.datasets import imdb

# Invert the Keras word -> rank table into an id -> word lookup. Ranks are
# shifted by 3 and ids 0-3 are mapped to placeholder tokens.
word_index = imdb.get_word_index()
index_word = {rank + 3: word for word, rank in word_index.items()}
index_word.update({
    0: "<PAD>",
    1: "<START>",
    2: "<UNK>",
    3: "<UNUSED>",
})

# Example: decode the first X_test review (ids without an entry become '?').
decoded_review = ' '.join(index_word.get(token_id, '?') for token_id in X_test[0])
print(decoded_review)


<START> please give this one a miss br br <UNK> <UNK> and the rest of the cast <UNK> terrible performances the show is flat flat flat br br i don't know how michael <UNK> could have allowed this one on his <UNK> he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you <UNK> fans give this a miss


In [None]:
# Decode the first integer-encoded test review back to text.
decoded_review = ' '.join([index_word.get(i, '?') for i in X_test[0]])
print("Review:\n", decoded_review)

# Encode the text with the already-fitted TF-IDF vectorizer.
review_tfidf = vectorizer.transform([decoded_review])


def _sentiment_label(pred):
    """Map a predicted class (1 = positive, anything else = negative) to text."""
    return "Positive" if pred == 1 else "Negative"


# Predict the sentiment of this single review with each fitted model.
log_reg_pred = log_reg.predict(review_tfidf)[0]
naive_bayes_pred = naive_bayes.predict(review_tfidf)[0]
svm_pred = svm.predict(review_tfidf)[0]

log_reg_label = _sentiment_label(log_reg_pred)
naive_bayes_label = _sentiment_label(naive_bayes_pred)
svm_label = _sentiment_label(svm_pred)

# Name the model on every line; previously the three lines were identical in
# form and could not be told apart in the output.
print("\nPredicted Sentiment (Logistic Regression):", log_reg_label)
print("\nPredicted Sentiment (Naive Bayes):", naive_bayes_label)
print("\nPredicted Sentiment (Linear SVM):", svm_label)


Review:
 <START> please give this one a miss br br <UNK> <UNK> and the rest of the cast <UNK> terrible performances the show is flat flat flat br br i don't know how michael <UNK> could have allowed this one on his <UNK> he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you <UNK> fans give this a miss

Predicted Sentiment: Negative

Predicted Sentiment: Negative

Predicted Sentiment: Negative


In [None]:
import numpy as np

# Per-feature weights of the fitted logistic regression (first row of coef_).
coefs = log_reg.coef_[0]  # 1D array of weights

# Vocabulary terms and the dense TF-IDF row for the review being explained.
feature_names = vectorizer.get_feature_names_out()
tfidf_vector = review_tfidf.toarray()[0]

# Contribution of each term to the decision score: TF-IDF value times weight.
contributions = tfidf_vector * coefs

# Rank by the magnitude of the contribution and keep only terms that occur in
# the review BEFORE cutting to ten. Ranking by the signed value (as before)
# lists only words that push towards "Positive", which explains a "Negative"
# prediction with exactly the words that argue against it; the sign of the
# printed weight now shows the direction each word pushes.
ranked = np.argsort(np.abs(contributions))[::-1]
top_indices = ranked[tfidf_vector[ranked] > 0][:10]

print("\nTop contributing words to sentiment:")
for idx in top_indices:
    print(f"{feature_names[idx]} (weight: {coefs[idx]:.3f})")



Top contributing words to sentiment:
miss (weight: 1.055)
performances (weight: 1.771)
and (weight: 2.873)
his (weight: 1.400)
know (weight: 0.891)
performance (weight: 1.351)
fans (weight: 0.983)
you (weight: 2.463)
quite (weight: 1.267)
the (weight: 0.793)


# **Naive Bayes**

In [None]:
import numpy as np

# Per-feature log-odds: log P(term | class 1) - log P(term | class 0).
# Positive values favour class 1 ("Positive"), negative values class 0.
coefs = naive_bayes.feature_log_prob_[1] - naive_bayes.feature_log_prob_[0]

# Encode the review with the fitted TF-IDF vectorizer and densify its single row.
review_tfidf = vectorizer.transform([decoded_review])
tfidf_vector = review_tfidf.toarray()[0]

# Vocabulary terms, aligned with the columns of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()

# Contribution of each term: TF-IDF value times log-odds.
contributions = tfidf_vector * coefs

# Rank by the magnitude of the contribution and keep only terms that occur in
# the review BEFORE cutting to ten. Ranking by the signed value (as before)
# lists only words that favour "Positive", even for a review predicted
# "Negative"; the sign of the printed log-odds now shows the direction.
ranked = np.argsort(np.abs(contributions))[::-1]
top_indices = ranked[tfidf_vector[ranked] > 0][:10]

print("Top contributing words to sentiment (Naive Bayes): \n")
for idx in top_indices:
    print(f"{feature_names[idx]} (log-odds: {coefs[idx]:.3f})")


Top contributing words to sentiment (Naive Bayes): 

miss (log-odds: 0.452)
performances (log-odds: 0.750)
performance (log-odds: 0.639)
michael (log-odds: 0.347)
his (log-odds: 0.313)
fans (log-odds: 0.258)
quite (log-odds: 0.286)
cast (log-odds: 0.239)
show (log-odds: 0.209)
unk (log-odds: 0.080)


# **SVM**

In [None]:
import numpy as np

# Per-feature weights of the fitted linear SVM (first row of coef_).
coefs = svm.coef_[0]  # 1D array of weights

# Vocabulary terms and the dense TF-IDF row for the review being explained
# (`review_tfidf` is the encoded review produced earlier in the notebook).
feature_names = vectorizer.get_feature_names_out()
tfidf_vector = review_tfidf.toarray()[0]

# Contribution of each term to the decision score: TF-IDF value times weight.
contributions = tfidf_vector * coefs

# Rank by the magnitude of the contribution and keep only terms that occur in
# the review BEFORE cutting to ten. Ranking by the signed value (as before)
# lists only words that push towards "Positive", even for a review predicted
# "Negative"; the sign of the printed weight now shows the direction.
ranked = np.argsort(np.abs(contributions))[::-1]
top_indices = ranked[tfidf_vector[ranked] > 0][:10]

print("Top contributing words to sentiment:\n")
for idx in top_indices:
    print(f"{feature_names[idx]} (weight: {coefs[idx]:.3f})")


Top contributing words to sentiment:

miss (weight: 0.668)
performances (weight: 1.015)
fans (weight: 0.665)
know (weight: 0.392)
and (weight: 0.985)
his (weight: 0.548)
quite (weight: 0.633)
you (weight: 1.098)
performance (weight: 0.468)
the (weight: 0.502)
