<a href="https://www.kaggle.com/code/abaliyan/sentiment-analysis-using-ml-dl?scriptVersionId=142755006" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# DataCard

The IMDB dataset contains 50K movie reviews for natural language processing or text analytics.

This is a dataset for binary sentiment classification. It contains around 50,000 highly polar movie reviews and can be used to analyse or predict positive and negative reviews using either classification or deep learning algorithms.

# About Dataset
- Two columns: "review" and "sentiment".
- The "review" column contains the text of the user's comment, and the "sentiment" column indicates whether that comment is positive or negative.

# Code with python

In [None]:
# importing libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report


In [None]:
# Read the IMDB reviews CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',
)

In [None]:
# Preview the first rows of the dataset (left as the cell's last expression
# so the notebook renders it).
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:
# Count the reviews per sentiment label, print the table, and draw each
# label's share as a pie chart.
sentiment_counts = data['sentiment'].value_counts()
print(pd.DataFrame(sentiment_counts))

plt.pie(
    x=sentiment_counts,
    labels=sentiment_counts.keys(),
    autopct='%1.1f%%',
)
plt.title("Share of positive and negative reviews")
plt.show()

- The dataset is balanced, with an equal number of positive and negative observations.

In [None]:
# Download the NLTK resources used in this notebook:
#   - "stopwords": English stop-word list used when filtering tokens
#   - "wordnet":   data for WordNetLemmatizer
#   - "punkt" / "punkt_tab": tokenizer data that word_tokenize needs; without
#     it word_tokenize raises LookupError unless the environment already
#     ships it. Newer NLTK releases look for "punkt_tab" and older ones for
#     "punkt", so both are requested.
# nltk.download reports a failure (e.g. no network, unknown id) by printing a
# message and returning False; it does not raise.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
#Tokenization, stopword removal, text lowercase, and stemming
# Compiled once; matches an HTML tag such as "<br />".
_HTML_TAG_PATTERN = re.compile(r'<[^>]+>')

# Filled on first use so that merely defining the function does not require
# the NLTK corpora to be present yet.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stop-word set, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Clean one review and return it as a single lowercase string.

    Steps, in order:
    1. replace every HTML tag (anything matching ``<...>``) with a space
    2. split the text into tokens with NLTK's ``word_tokenize``
    3. drop tokens whose lowercase form is in NLTK's English stop-word list
    4. stem each remaining token with ``PorterStemmer``
    5. join the stems with single spaces and lowercase the result

    The stop-word set and the stemmer are created once and reused across
    calls instead of being rebuilt for every review / every token.

    Args:
        text: the raw review text (a string).

    Returns:
        The processed text as one space-separated, lowercase string.
    """
    stop_words, stemmer = _get_preprocess_resources()

    without_tags = _HTML_TAG_PATTERN.sub(' ', text)
    tokens = word_tokenize(without_tags)
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]
    stemmed_tokens = [stemmer.stem(token) for token in kept_tokens]
    return ' '.join(stemmed_tokens).lower()

# Overwrite the raw reviews with their preprocessed form, in place.
# NOTE: re-running this cell applies preprocess_text to text that has already
# been processed, because the original reviews are not kept.
data['review'] = data['review'].apply(preprocess_text)

In [None]:
# Encode the sentiment labels as integers: negative -> 0, positive -> 1.
label_to_int = {'negative': 0, 'positive': 1}
data['sentiment'] = data['sentiment'].map(label_to_int)
data.head()

In [None]:
# Hold out half of the reviews for testing; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.5,
    random_state=42,
)

# ML models employed
- LogisticRegression
- RandomForestClassifier
- KNeighborsClassifier
- MultinomialNB

In [None]:
# Empty table that collects one row of metrics per evaluated model.
summary_columns = ['Model', 'Precision', 'Recall']
result_summary = pd.DataFrame(columns=summary_columns)
result_summary

In [None]:

def score_model(model, test_predictions, result_summary, y_true=None):
    """Print evaluation metrics for a set of predictions and record them.

    Prints the accuracy, precision, recall, confusion matrix and
    classification report, then returns the summary table with one extra row
    holding the precision and recall.

    Args:
        model: value stored in the 'Model' column of the new row (the callers
            in this notebook pass the model's name as a string).
        test_predictions: predicted labels for the test samples.
        result_summary: DataFrame holding the rows recorded so far. It is not
            modified; a new DataFrame is returned.
        y_true: ground-truth labels to score against. When omitted, the
            module-level ``y_test`` is used, as before, so existing calls keep
            working. Passing it explicitly avoids depending on whichever
            ``y_test`` happens to be bound when the function is called.

    Returns:
        A DataFrame with the rows of ``result_summary`` followed by a new row
        with the keys 'Model', 'Precision' and 'Recall'.
    """
    if y_true is None:
        y_true = y_test

    print("\nSummary Report\n")
    accuracy = accuracy_score(y_true, test_predictions)
    print("Accuracy:", accuracy)

    # Calculate precision and recall
    precision = precision_score(y_true, test_predictions)
    recall = recall_score(y_true, test_predictions)
    print("Precision:", precision)
    print("Recall:", recall)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_true, test_predictions)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Classification report; the names follow the 0 -> negative,
    # 1 -> positive encoding applied to the sentiment column.
    class_names = ['negative', 'positive']
    classification_rep = classification_report(y_true, test_predictions, target_names=class_names)
    print("\nClassification Report:")
    print(classification_rep)
    print()

    # Add the result to the summary report.
    new_row = pd.DataFrame([{'Model': model, 'Precision': precision, 'Recall': recall}])
    if len(result_summary) == 0 and list(result_summary.columns) == list(new_row.columns):
        # Nothing recorded yet: return the row directly. Concatenating onto an
        # empty frame emits a FutureWarning on recent pandas releases and
        # leaves the numeric columns with object dtype.
        return new_row
    return pd.concat([result_summary, new_row], ignore_index=True)

In [None]:
# Logistic regression on bag-of-words counts, wrapped in a single pipeline.
lr_steps = [
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression()),
]
lr = Pipeline(steps=lr_steps)
lr.fit(X_train, y_train)

# Predict the held-out reviews, then print and record the metrics.
y_pred = lr.predict(X_test)
result_summary = score_model('LogisticRegression', y_pred, result_summary)
print(result_summary)

In [None]:
# Random forest on bag-of-words counts.
# random_state is fixed (same seed as the train/test split) so that the forest,
# and therefore the metrics recorded below, are reproducible between runs.
rfc = Pipeline([
    ('vectorizer', CountVectorizer()),            # initializing the vectorizer
    ('random_forest', RandomForestClassifier(     # using the RandomForest classifier
        n_estimators=50,
        criterion='entropy',
        random_state=42,
    )),
])

rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

# Print the metrics and append them to the running summary.
result_summary = score_model('RandomForestClassifier', y_pred, result_summary)
print(result_summary)

# KNeighborsClassifier

In [None]:
# k-nearest-neighbours (10 neighbours, Euclidean distance) on bag-of-words counts.
knn_steps = [
    ('vectorizer', CountVectorizer()),
    ('KNN', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
]
knc = Pipeline(steps=knn_steps)
knc.fit(X_train, y_train)

# Predict the held-out reviews, then print and record the metrics.
y_pred = knc.predict(X_test)
result_summary = score_model('KNeighborsClassifier', y_pred, result_summary)
result_summary

# MultinomialNB

In [None]:
# Multinomial naive Bayes on bag-of-words counts.
mnb_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
mnb = Pipeline(steps=mnb_steps)
mnb.fit(X_train, y_train)

# Predict the held-out reviews, then print and record the metrics.
y_pred = mnb.predict(X_test)
result_summary = score_model('MultinomialNB', y_pred, result_summary)
result_summary

# Convolutional Neural Network

In [None]:
# importing function and model for CNN

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten,Conv1D, GlobalMaxPooling1D, Dense, LSTM
from tensorflow.keras.activations import relu, sigmoid


In [None]:
# preprocessing text

lm = WordNetLemmatizer()


def text_processing(df_col):
    """Clean every item of ``df_col`` and return the results as a list of strings.

    Each item is converted with ``str()``, every character outside ``a-z`` /
    ``A-Z`` is replaced by a space, the text is lowercased and split on
    whitespace, English stop words are removed, and the remaining words are
    lemmatized with the module-level ``lm`` and joined with single spaces.

    The stop-word set is built once per call; previously it was rebuilt for
    every single word.

    Args:
        df_col: an iterable of texts (for example a DataFrame column).

    Returns:
        A list with one cleaned string per input item, in input order.
    """
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile(r'[^a-zA-Z]')

    corpus = []
    for item in df_col:
        words = non_letters.sub(' ', str(item)).lower().split()
        lemmas = [lm.lemmatize(word) for word in words if word not in stop_words]
        corpus.append(' '.join(lemmas))
    return corpus

# Disabled alternative load from a Google Drive path; kept for reference only.
# data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/project work/IMDB movies review sentiment analysis/IMDB Dataset.csv")
# Copy of `data`; the following cells split and tokenize `df`.
df=data.copy()
# The text_processing call is disabled, so df['review'] keeps whatever
# data['review'] currently holds.
# df['review'] = text_processing(df['review'])

In [None]:
# Split the full frame in half into train and test frames (seeded for repeatability).
train_df, test_df = train_test_split(
    df,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Limits for the tokenizer vocabulary and for the padded sequence length.
max_words = 1000  # Maximum number of words in your vocabulary
max_seq_length = 100  # Maximum length of sequences (words in a review)

# Fit the tokenizer on the training reviews only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['review'])

# Convert reviews to integer sequences, then pad at the end to a fixed length.
train_sequences = tokenizer.texts_to_sequences(train_df['review'])
test_sequences = tokenizer.texts_to_sequences(test_df['review'])

X_train = pad_sequences(train_sequences, maxlen=max_seq_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_seq_length, padding='post')

# Labels as NumPy arrays (this rebinds the module-level y_train / y_test).
y_train, y_test = (np.array(part['sentiment']) for part in (train_df, test_df))

In [None]:
# Assemble the CNN: embedding -> 1-D convolution -> global max pooling ->
# dense hidden layer -> single sigmoid output.
embedding_dim = 100

cnn_layers = [
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_seq_length),
    Conv1D(filters=128, kernel_size=5, activation=relu),
    GlobalMaxPooling1D(),
    Dense(64, activation=relu),
    Dense(1, activation=sigmoid),
]
model = Sequential()
for cnn_layer in cnn_layers:
    model.add(cnn_layer)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Fit on the training data; 20% of it is held back for validation.
epochs = 10
batch_size = 32

history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# Measure loss and accuracy on the test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

In [None]:
# Per-epoch curves recorded during training.
recorded = history.history
train_loss, val_loss = recorded['loss'], recorded['val_loss']
train_accuracy, val_accuracy = recorded['accuracy'], recorded['val_accuracy']

print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# One panel per metric, side by side: loss on the left, accuracy on the right.
plt.figure(figsize=(10, 5))
curve_panels = [
    ('Loss', train_loss, val_loss),
    ('Accuracy', train_accuracy, val_accuracy),
]
for panel_index, (metric_name, train_curve, val_curve) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(train_curve, label=f'Train {metric_name}', marker='o')
    plt.plot(val_curve, label=f'Validation {metric_name}', marker='o')
    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.grid()
    plt.legend()

plt.tight_layout()
plt.show()

# Threshold the predicted probabilities at 0.5 to obtain binary labels,
# then print the metrics and append them to the running summary.
test_predictions_probs = model.predict(X_test)
test_predictions = (test_predictions_probs > 0.5).astype(int)

result_summary = score_model('CNN', test_predictions, result_summary)

In [None]:
# Display the precision/recall rows collected for all evaluated models.
result_summary

From the above result summary, we can conclude that logistic regression performs best at classifying the sentiments.