#***1. Using NLP***

In [1]:
# importing relevant libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

## **Installing imdb dataset**
--> About Dataset:
     

1.   The load_dataset("imdb") command will fetch the IMDb dataset this
     contains three splits:

        - Train       : consisting of 25000 rows
        - Test        : consisting of 50000 rows
        - Unsupervised: consisting of 50000 rows
2.   Featuresfeatures: ['text', 'label']



In [2]:
# Install the datasets library
!pip install datasets

# Import necessary libraries
from datasets import load_dataset

# Load the IMDb dataset
dataset = load_dataset("imdb")

print(dataset)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


## **Data preprocessing**

In [3]:
train_data = dataset['train']
test_data = dataset['test']
unsupervised_data = dataset['unsupervised']

# Convert to pandas DataFrame
train_df = pd.DataFrame({
    'review': train_data['text'],
    'sentiment': train_data['label']
})

test_df = pd.DataFrame({
    'review': test_data['text'],
    'sentiment': test_data['label']
})

In [4]:
# Merging all the three datasetsas one

df= pd.concat([test_df, train_df], axis =0)
df.shape

(50000, 2)

In [5]:
# Checking for null values

df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [6]:
# Checking for duplicate values

df.duplicated().sum()

418

In [7]:
# Deleting all duplicate values

df.drop_duplicates(inplace = True)

In [8]:
# Shape of data frame

df.shape

(49582, 2)

In [9]:
# Columns

df.columns

Index(['review', 'sentiment'], dtype='object')

In [10]:
# Checking for imbalance dataset

df['sentiment'].value_counts().reset_index()          # Balanced data

Unnamed: 0,sentiment,count
0,1,24884
1,0,24698


In [11]:
# checking for datatypes

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49582 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


## **Featured Engineering**

In [12]:
# Tokenization and Lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Lemmatize and remove stopwords
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])

df['processed'] = df['review'].apply(preprocess)
df['processed'].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,processed
0,I love sci-fi willing put lot . Sci-fi movies/...
1,"Worth entertainment value rental , especially ..."
2,totally average film semi-alright action seque...
3,STAR RATING : * * * * * Saturday Night * * * *...
4,"First let say , If n't enjoyed Van Damme movie..."


In [13]:
# Vectorization

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed'])
y = df['sentiment']

In [14]:
# Test, train Split of dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## **Model prearation & training**

In [15]:
# Building model

from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(16, 8), activation='relu', solver='adam', max_iter=50, random_state=42)

# Train the model
model.fit(X_train, y_train)

In [16]:
# Predicting the model

y_pred = model.predict(X_test)

In [18]:
# Evaluating the classification model

from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, accuracy_score
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

auc = roc_auc_score(y_test, y_pred)
print("\nAUC-ROC Score:", auc)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

Confusion Matrix:
 [[6517  977]
 [ 867 6514]]

Accuracy: 0.8760336134453781

AUC-ROC Score: 0.8760826391321249

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88      7494
           1       0.87      0.88      0.88      7381

    accuracy                           0.88     14875
   macro avg       0.88      0.88      0.88     14875
weighted avg       0.88      0.88      0.88     14875



##**Building interactive Web Interface with Gradio for ML Model**

In [19]:
#installing gradion

! pip install gradio

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.114.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [32]:
import gradio as gr

# Define the prediction function
def predict_sentiment_function(new_sentence):
    # Preprocess the new sentence (tokenize, lemmatize, etc.)
    processed_new_sentence = preprocess(new_sentence)

    # Vectorize the processed sentence
    vectorized_new_sentence = vectorizer.transform([processed_new_sentence])

    # Predict the sentiment using the model
    predicted_sentiment = model.predict(vectorized_new_sentence)

    # Convert the prediction to a label ('positive' or 'negative')
    predicted_sentiment_label = 'positive' if predicted_sentiment[0] == 1 else 'negative'

    return predicted_sentiment_label

# Gradio Interface
iface = gr.Interface(
    fn=predict_sentiment_function,  # The function that performs sentiment prediction
    inputs='text',                  # Input type: a text box
    outputs='text'                  # Output type: a text label
)

# Launch the Gradio interface
iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://803e37e654051c7ce3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




#***2. Using LSTM***

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")  # oov_token handles out-of-vocabulary words

# Fit the tokenizer on the training data
tokenizer.fit_on_texts(X_train)

# Calculate vocab_size after fitting the tokenizer
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding

# Tokenization: Convert words to integers
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Padding sequences to ensure equal length
max_length = 200  # Set a max length for sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')

# Now X_train_padded and X_test_padded contain tokenized and padded sequences

##**Model preparation & training**


In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define vocab_size (make sure it's defined based on your tokenizer)
vocab_size = len(tokenizer.word_index) + 1  # Example vocab size

# Build the LSTM model
model = Sequential()

# Embedding layer to convert words to dense vectors of fixed size
embedding_dim = 64
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))

# LSTM layer
model.add(LSTM(units=64))  # LSTM layer with 64 units

# Add a dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Output layer (binary classification)
model.add(Dense(1, activation='sigmoid'))  # 'sigmoid' for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build the model to avoid unbuilt status
model.build(input_shape=(None, max_length))

# Print model summary
model.summary()




In [26]:
print("X_train_padded shape:", X_train_padded.shape)
print("X_test_padded shape:", X_test_padded.shape)

X_train_padded shape: (34707, 200)
X_test_padded shape: (14875, 200)


In [27]:
# Training the model
history = model.fit(X_train_padded, y_train, validation_split=0.2, epochs=5, batch_size=64)

Epoch 1/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 287ms/step - accuracy: 0.5362 - loss: 0.6840 - val_accuracy: 0.7358 - val_loss: 0.5668
Epoch 2/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 271ms/step - accuracy: 0.7291 - loss: 0.5593 - val_accuracy: 0.6210 - val_loss: 0.6303
Epoch 3/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 273ms/step - accuracy: 0.5810 - loss: 0.6605 - val_accuracy: 0.6448 - val_loss: 0.5966
Epoch 4/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 276ms/step - accuracy: 0.6710 - loss: 0.5568 - val_accuracy: 0.8436 - val_loss: 0.3904
Epoch 5/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 260ms/step - accuracy: 0.8996 - loss: 0.2824 - val_accuracy: 0.8725 - val_loss: 0.3250


In [28]:
# Predicting the model

y_pred = model.predict(X_test_padded)

[1m465/465[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 37ms/step


In [30]:
y_pred

array([[0.03330117],
       [0.5463036 ],
       [0.06927688],
       ...,
       [0.9622271 ],
       [0.9482817 ],
       [0.04312459]], dtype=float32)

In [31]:
y_npred = []

for i in y_test:
  if i > 0.5:
    y_npred.append(1)
  else:
    y_npred.append(0)

In [32]:
y_npred = np.array(y_npred)
y_npred

array([0, 1, 1, ..., 1, 1, 0])

In [34]:
y_test = np.array(y_test)
y_test

array([0, 1, 1, ..., 1, 1, 0])

In [36]:
# Evaluating the classification model

from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, accuracy_score

cm = confusion_matrix(y_test, y_npred)
print("Confusion Matrix:\n", cm)

accuracy = accuracy_score(y_test, y_npred)
print("\nAccuracy:", accuracy)

report = classification_report(y_test, y_npred)
print("\nClassification Report:\n", report)

auc = roc_auc_score(y_test, y_npred)
print("\nAUC-ROC Score:", auc)


Confusion Matrix:
 [[7494    0]
 [   0 7381]]

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7494
           1       1.00      1.00      1.00      7381

    accuracy                           1.00     14875
   macro avg       1.00      1.00      1.00     14875
weighted avg       1.00      1.00      1.00     14875


AUC-ROC Score: 1.0


##**Building interactive Web Interface with Gradio for ML Model**

In [37]:
import gradio as gr

# Define the prediction function
def predict_sentiment_lstm(new_sentence):
     # Preprocess the input sentence (tokenization + padding)
    sequence = tokenizer.texts_to_sequences([new_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')

    # Predict the sentiment using the model
    predicted_sentiment = model.predict(padded_sequence)

    # Predict sentiment
    predicted_sentiment = model.predict(padded_sequence)[0][0]
    predicted_sentiment_label = 'positive' if predicted_sentiment > 0.5 else 'negative'

    return predicted_sentiment_label

# Gradio Interface
iface = gr.Interface(
    fn=predict_sentiment_lstm,  # The function that performs sentiment prediction
    inputs='text',                  # Input type: a text box
    outputs='text'                  # Output type: a text label
)

# Launch the Gradio interface
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://b560611fcd8e0537d3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


