Recurrent Neural Network (RNN)

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [None]:
# Read the raw review table from the working directory and preview the first rows.
data = pd.read_csv('swiggy.csv')
data.head()

Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,"Good, but nothing extraordinary."
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,"Good, but nothing extraordinary."
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,Late delivery ruined it.
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,Best meal I've had in a while!
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,Mediocre experience.


In [None]:
# Text cleaning and sentiment labeling
# Lower-case every review and strip everything except letters, digits and whitespace.
# Missing reviews become empty strings instead of crashing the string operations.
# The pattern is a raw string so that `\s` is a regex class, not an invalid str escape.
data['Review'] = (
    data['Review']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)

# Binary label: 1 when 'Avg Rating' is at least 3.5, otherwise 0.
# NOTE(review): the label is derived from the 'Avg Rating' column, not from the review
# text itself, so the text may carry little signal about the label -- confirm this is intended.
data['Sentiment'] = (data['Avg Rating'] >= 3.5).astype(int)

In [None]:
# Display the cleaned 'Review' column to confirm the lower-casing and punctuation removal.
data['Review']

Unnamed: 0,Review
0,good but nothing extraordinary
1,good but nothing extraordinary
2,late delivery ruined it
3,best meal ive had in a while
4,mediocre experience
...,...
7995,my new favorite dish
7996,amazing taste and quick delivery
7997,nothing special but edible
7998,it was okay


In [None]:
# Preview the frame again; it now includes the derived 'Sentiment' column.
data.head()

Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review,Sentiment
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,good but nothing extraordinary,1
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,good but nothing extraordinary,1
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,late delivery ruined it,1
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,best meal ive had in a while,1
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,mediocre experience,1


In [None]:
# Check the class imbalance: count how many rows carry each Sentiment label.
data['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
1,6857
0,1143


In [None]:
# prompt: down sampling

import pandas as pd
from sklearn.utils import resample

# Partition the rows by label: Sentiment == 1 is the majority class, 0 the minority.
df_majority = data[data['Sentiment'] == 1]
df_minority = data[data['Sentiment'] == 0]

# Draw, without replacement, as many majority rows as there are minority rows.
# The fixed random_state makes the draw repeatable.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=123,
)

# Stack the reduced majority class on top of the untouched minority class.
data_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Show the class counts of the balanced frame.
data_downsampled['Sentiment'].value_counts()


Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
1,1143
0,1143


In [None]:
# Tokenization and padding (fixed sequence length)
# NOTE(review): the tokenizer and `x` are built from the full `data` frame, not from
# `data_downsampled`; the labels assigned to `y` further down also come from `data`,
# so the balanced frame created by the down-sampling cell is not used by this model.
max_features = 3000  # vocabulary cap handed to the Tokenizer (num_words)
max_len = 100        # sequence length handed to pad_sequences (maxlen)

tokenizer = Tokenizer(num_words=max_features, oov_token='<oov>')
tokenizer.fit_on_texts(data['Review'])

review_sequences = tokenizer.texts_to_sequences(data['Review'])
x = pad_sequences(review_sequences, maxlen=max_len)


In [None]:
# Print the cleaned reviews as plain text (same column that was tokenized).
print(data['Review'])

0         good but nothing extraordinary
1         good but nothing extraordinary
2                late delivery ruined it
3           best meal ive had in a while
4                    mediocre experience
                      ...               
7995                my new favorite dish
7996    amazing taste and quick delivery
7997          nothing special but edible
7998                         it was okay
7999                 delicious and fresh
Name: Review, Length: 8000, dtype: object


In [None]:
# Target vector: the binary Sentiment label for every row of the full `data` frame.
# NOTE(review): this is the imbalanced frame, not `data_downsampled`.
y=data['Sentiment']

In [None]:
# Split the dataset: 20% is held out as the test set; random_state fixes the shuffle.
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.layers import GlobalAveragePooling1D  # not covered by the top import cell

# Build the model: embedding -> SimpleRNN -> average over time steps -> dense classifier.
model = Sequential([
    # `input_length` is deliberately omitted: the argument is deprecated in current
    # Keras, and the sequence length is supplied through model.build() below.
    Embedding(input_dim=max_features, output_dim=64),
    SimpleRNN(64, return_sequences=True),  # emit every time step so they can be averaged
    GlobalAveragePooling1D(),              # collapse the time axis into one vector per review
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),        # single sigmoid unit for the binary label
])

# The model consumes padded index sequences of length `max_len`. `max_features` is the
# vocabulary size and belongs only to the Embedding layer, not to the input shape.
model.build(input_shape=(None, max_len))



In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Compile for binary classification: Adam optimizer, binary cross-entropy loss,
# and accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model for 10 epochs in batches of 32; validation_split=0.2 reserves 20%
# of the training data for validation.
# NOTE(review): in the recorded run val_accuracy is 0.8578 on every epoch shown, which is
# close to the share of class 1 in the data (6857 of 8000) -- the model may simply be
# predicting the majority class; verify before trusting the accuracy figure.
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.8366 - loss: 0.4329 - val_accuracy: 0.8578 - val_loss: 0.4141
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.8619 - loss: 0.4063 - val_accuracy: 0.8578 - val_loss: 0.4101
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - accuracy: 0.8554 - loss: 0.4151 - val_accuracy: 0.8578 - val_loss: 0.4091
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.8654 - loss: 0.3963 - val_accuracy: 0.8578 - val_loss: 0.4091
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.8595 - loss: 0.4078 - val_accuracy: 0.8578 - val_loss: 0.4103
Epoch 6/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.8490 - loss: 0.4275 - val_accuracy: 0.8578 - val_loss: 0.4099
Epoch 7/10
[1m160/160

<keras.src.callbacks.history.History at 0x7ed2ec96bc90>

In [None]:
# Score the model on the held-out test split and report loss and accuracy.
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8541 - loss: 0.4171
Test Loss: 0.4081, Test Accuracy: 0.8587


In [None]:
def predict_sentiment(review):
    """Classify a single review string with the current global `model`.

    The text receives the same cleaning as the training data (lower-casing, removal
    of everything except letters, digits and whitespace), is converted to an index
    sequence with the global `tokenizer`, and is padded to `max_len`.

    Args:
        review: Raw review text.

    Returns:
        A tuple ``(label, score)``. ``score`` is the raw model output for the review
        and ``label`` is ``'Positive'`` when ``score >= 0.5``, else ``'Negative'``.
        Note that ``score`` is the model output itself, not the confidence in
        whichever label was returned.
    """
    # Raw string: `\s` must reach the regex engine as-is; in a plain string literal
    # it is an invalid escape sequence that newer Python versions warn about.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', review.lower())
    review_seq = tokenizer.texts_to_sequences([cleaned])
    padded_review = pad_sequences(review_seq, maxlen=max_len)
    prediction = model.predict(padded_review)
    score = prediction[0][0]
    sentiment = 'Positive' if score >= 0.5 else 'Negative'
    return sentiment, score

In [None]:
# Smoke test with clearly positive wording.
sample_review = 'The food was delicious and the service was excellent.'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
Sentiment: Positive, Confidence: 0.8562


In [None]:
# Smoke test with clearly negative wording.
# NOTE(review): the recorded output is "Positive" with a score of 0.8538, almost the
# same score as the positive example -- the model does not appear to react to the text.
sample_review = 'The food was terrible and the service was awful.'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Sentiment: Positive, Confidence: 0.8538


In [None]:
# Smoke test with a negated, negative review (recorded output: Positive, 0.8539).
sample_review = 'The food was not tasty , i will not recomended.'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Sentiment: Positive, Confidence: 0.8539


In [None]:
# Smoke test with a single negative word (recorded output: Positive, 0.8740).
sample_review = 'bad'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Sentiment: Positive, Confidence: 0.8740


In [None]:
!pip install nltk

import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from sklearn.utils import resample
import nltk
from nltk.corpus import wordnet

# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('omw-1.4')

# 1. Data Loading and Cleaning
data = pd.read_csv('swiggy.csv')
data['Review'] = data['Review'].apply(lambda x: x.lower())
data['Review'] = data['Review'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))
data['Sentiment'] = data['Avg Rating'].apply(lambda x: 1 if x >= 3.5 else 0)

# 2. Advanced Data Augmentation (Synonym Replacement)
def synonym_replacement(text, prob=0.1, rng=None):
    """Randomly swap words of `text` for a WordNet synonym.

    Each purely alphabetic word is considered for replacement with probability
    `prob`. A selected word is replaced by the first lemma of its first WordNet
    synset that actually differs from the word; if there is no synset or no
    differing lemma the word is kept. Multi-word lemmas, which WordNet joins with
    underscores, are written with spaces, and lemmas are lower-cased to match the
    cleaned review text.

    Args:
        text: Whitespace-separated input text.
        prob: Per-word replacement probability in [0, 1].
        rng: Optional random source exposing ``random()`` (for example
            ``numpy.random.default_rng(seed)``). When omitted, NumPy's global
            generator is used, as before.

    Returns:
        The augmented text, with words joined by single spaces.
    """
    draw = np.random.rand if rng is None else rng.random
    new_words = []
    for word in text.split():
        replacement = word
        # The random draw is only made for alphabetic words (short-circuit `and`).
        if word.isalpha() and draw() < prob:
            synsets = wordnet.synsets(word)
            if synsets:
                # Stay within the first synset, but skip lemmas equal to the word
                # itself so that a replacement actually happens.
                for lemma in synsets[0].lemmas():
                    candidate = lemma.name().replace('_', ' ').lower()
                    if candidate != word:
                        replacement = candidate
                        break
        new_words.append(replacement)
    return ' '.join(new_words)

# 3. Downsampling
# Balance the classes by shrinking the majority class (Sentiment == 1) to the size
# of the minority class; the fixed random_state makes the draw repeatable.
df_majority = data[data['Sentiment'] == 1]
df_minority = data[data['Sentiment'] == 0]
df_majority_downsampled = resample(df_majority,
                                 replace=False,
                                 n_samples=len(df_minority),
                                 random_state=123)
data_downsampled = pd.concat([df_majority_downsampled, df_minority])

# 4. Apply Augmentation to Downsampled Data
# synonym_replacement draws from NumPy's global random generator. Seed it so the
# augmented text, and therefore the tokenizer vocabulary and the trained model, are
# the same on every run.
np.random.seed(42)
data_downsampled['augmented_review'] = data_downsampled['Review'].apply(synonym_replacement)

# 5. Tokenization and Padding
max_features = 5000  # vocabulary cap handed to the Tokenizer (num_words)
max_len = 150        # sequence length handed to pad_sequences (maxlen)
tokenizer = Tokenizer(num_words=max_features, oov_token='<oov>')
tokenizer.fit_on_texts(data_downsampled['augmented_review'])
x = pad_sequences(tokenizer.texts_to_sequences(data_downsampled['augmented_review']), maxlen=max_len)
y = data_downsampled['Sentiment']

# 6. Splitting the Dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# 7. Model Building
# Two stacked bidirectional LSTMs with dropout, followed by a small dense classifier.
# `input_length` is not passed to Embedding: the argument is deprecated in current
# Keras and the layer infers the sequence length from the data.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=True)),  # full sequence feeds the next LSTM
    Dropout(0.2),
    Bidirectional(LSTM(32)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# 8. Model Compilation
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 9. Model Training
# validation_split=0.2 reserves 20% of the training data for validation.
model.fit(x_train, y_train, epochs=15, batch_size=64, validation_split=0.2)

# 10. Evaluation and Prediction (similar to your original code)
# ...



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch 1/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 558ms/step - accuracy: 0.5089 - loss: 0.6944 - val_accuracy: 0.4727 - val_loss: 0.7001
Epoch 2/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 583ms/step - accuracy: 0.5379 - loss: 0.6915 - val_accuracy: 0.5273 - val_loss: 0.6925
Epoch 3/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 557ms/step - accuracy: 0.5088 - loss: 0.6928 - val_accuracy: 0.4781 - val_loss: 0.6942
Epoch 4/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 539ms/step - accuracy: 0.5212 - loss: 0.6924 - val_accuracy: 0.4781 - val_loss: 0.6943
Epoch 5/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 640ms/step - accuracy: 0.5515 - loss: 0.6896 - val_accuracy: 0.5055 - val_loss: 0.6948
Epoch 6/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 621ms/step - accuracy: 0.5333 - loss: 0.6887 - val_accuracy: 0.4918 - val_loss: 0.6989
Epoch 7/15
[1m23/23[

<keras.src.callbacks.history.History at 0x78fa204867d0>

In [None]:
# Score the bidirectional-LSTM model on its held-out test split.
# NOTE(review): the recorded test accuracy is 0.5131 on class-balanced data.
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.4921 - loss: 0.7080
Test Loss: 0.7025, Test Accuracy: 0.5131


In [None]:
# prompt: save the model

model.save('sentiment_analysis_model.h5')
!ls -l sentiment_analysis_model.h5




-rw-r--r-- 1 root root 9455816 Apr 25 15:04 sentiment_analysis_model.h5


In [None]:
def predict_sentiment(review):
    """Classify a single review string with the current global `model`.

    This repeats the helper defined earlier in the notebook; because `model`,
    `tokenizer` and `max_len` are looked up when the function is called, it uses
    whichever objects are bound to those names at that moment.

    Args:
        review: Raw review text.

    Returns:
        A tuple ``(label, score)``. ``score`` is the raw model output for the review
        and ``label`` is ``'Positive'`` when ``score >= 0.5``, else ``'Negative'``.
    """
    # Raw string: `\s` must reach the regex engine as-is; in a plain string literal
    # it is an invalid escape sequence that newer Python versions warn about.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', review.lower())
    review_seq = tokenizer.texts_to_sequences([cleaned])
    padded_review = pad_sequences(review_seq, maxlen=max_len)
    prediction = model.predict(padded_review)
    score = prediction[0][0]
    sentiment = 'Positive' if score >= 0.5 else 'Negative'
    return sentiment, score

In [None]:
# Smoke test with clearly negative wording (recorded output: Negative, 0.4931).
sample_review = 'The food was terrible and the service was awful.'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Sentiment: Negative, Confidence: 0.4931


In [None]:
# Smoke test with a short negative review (recorded output: Negative, 0.4638).
sample_review = 'i dont recommended.'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Sentiment: Negative, Confidence: 0.4638


In [None]:
# Smoke test with positive wording (recorded output: Positive, 0.5111).
# NOTE(review): all three recorded scores for this model lie close to 0.5.
sample_review = 'tasty food , goog service.'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
Sentiment: Positive, Confidence: 0.5111


In [None]:
# prompt: call the saved model

from pathlib import Path

from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the saved model
model = load_model('sentiment_analysis_model.h5')

# These must equal the values used when the model was trained.
max_features = 5000
max_len = 150

# The model only understands the word -> index mapping of the tokenizer it was trained
# with. Fitting a fresh tokenizer on a few example sentences assigns unrelated indices
# and turns every prediction into noise, so the training-time mapping is restored here.
TOKENIZER_PATH = Path('sentiment_analysis_tokenizer.json')
if TOKENIZER_PATH.exists():
    # Preferred: read a tokenizer that was exported with Tokenizer.to_json().
    tokenizer = tokenizer_from_json(TOKENIZER_PATH.read_text(encoding='utf-8'))
else:
    # Fallback: rebuild the mapping from the exact corpus the training tokenizer was
    # fitted on. This needs `data_downsampled` from the training cell to still be in
    # the kernel; in a fresh session, export the tokenizer to TOKENIZER_PATH first.
    tokenizer = Tokenizer(num_words=max_features, oov_token='<oov>')
    tokenizer.fit_on_texts(data_downsampled['augmented_review'])

def predict_sentiment(review):
    """Classify one review with the reloaded `model` and the restored `tokenizer`.

    Args:
        review: Raw review text.

    Returns:
        A tuple ``(label, score)``. ``score`` is the raw model output and ``label``
        is ``'Positive'`` when ``score >= 0.5``, else ``'Negative'``.
    """
    # Same cleaning as at training time; raw string so `\s` is a regex class.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', review.lower())
    review_seq = tokenizer.texts_to_sequences([cleaned])
    padded_review = pad_sequences(review_seq, maxlen=max_len)
    prediction = model.predict(padded_review)
    score = prediction[0][0]
    sentiment = 'Positive' if score >= 0.5 else 'Negative'
    return sentiment, score

# Example usage
sample_review = 'superb packaging and presentation'
sentiment, confidence = predict_sentiment(sample_review)
print(f'Sentiment: {sentiment}, Confidence: {confidence:.4f}')



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 652ms/step
Sentiment: Positive, Confidence: 0.5114
