<a href="https://colab.research.google.com/github/1233san/1233san/blob/main/64945f485f46d1687445320.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries

In [None]:
# Numpy is used for numerical computations and working with arrays
import numpy as np
#Pandas is used for data manipulation and analysis
import pandas as pd

## Data Loading

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the restaurant reviews spreadsheet from the mounted Drive into a DataFrame
REVIEWS_PATH = '/content/drive/MyDrive/Colab Notebook/Restaurant_Reviews.xlsx'
df = pd.read_excel(REVIEWS_PATH)

## Data Analysis

In [None]:
# Dataset dimensions as (rows, columns)
df.shape

(1000, 2)

In [None]:
# Column labels of the DataFrame
df.columns

Index(['Review', 'Liked'], dtype='object')

In [None]:
# Preview the first five rows
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# Preview the last five rows
df.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [None]:
# Summary statistics for the numeric column(s)
df.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [None]:
# Show five random rows; the fixed seed keeps the preview identical across
# "Restart & Run All" (41 matches the seed used for the split and KFold below).
df.sample(5, random_state=41)

Unnamed: 0,Review,Liked
291,Just don't know why they were so slow.,0
545,Just spicy enough.. Perfect actually.,1
770,My sashimi was poor quality being soggy and ta...,0
107,The ambience is wonderful and there is music p...,1
524,Those burgers were amazing.,1


In [None]:
# Count missing values per column
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [None]:
# Class balance: number of reviews for each value of the 'Liked' label
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

# Feature Engineering

In [None]:
# Store the character count of every review in a new 'Length' column
df['Length'] = df['Review'].map(len)
df.head()

Unnamed: 0,Review,Liked,Length
0,Wow... Loved this place.,1,24
1,Crust is not good.,0,18
2,Not tasty and the texture was just nasty.,0,41
3,Stopped by during the late May bank holiday of...,1,87
4,The selection on the menu was great and so wer...,1,59


# Data Preprocessing

In [None]:
# Importing the NLP Libraries
import nltk
# Regular expressions, used by the cleaning loop to strip non-letter characters
import re
# Download NLTK stopwords data
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): PorterStemmer is imported but not used in the cells shown here;
# the Snowball stemmer created below is the one the cleaning loop calls.
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer shared by the review-cleaning loop
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SATEESH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Show NLTK's English stop-word list. Note that it contains the negations
# 'no', 'nor' and 'not', so these are removed from reviews during cleaning.
print(list(stopwords.words('english')))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Cleaning the Reviews and Creating a Corpus

# Build the stop-word set once. Rebuilding it inside the token filter (as the
# previous version did) re-reads the NLTK word list for every single word,
# which is loop-invariant work.
stop_words = set(stopwords.words('english'))


def clean_review(text):
    """Normalise one raw review into a space-joined string of stemmed tokens.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stop words, stem each remaining word.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned review as a single string. It is empty when every token
        of the review is a stop word.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # NOTE: the NLTK stop list includes 'no' and 'not', so negations are
    # dropped here (e.g. "Crust is not good." becomes "crust good").
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


# Iterate over the column values directly instead of df['Review'][i]; the
# label-based lookup only works while the index is exactly 0..n-1.
corpus = [clean_review(text) for text in df['Review']]

In [None]:
# Inspect the first ten cleaned reviews
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

# Creating a TF-IDF Model (a weighted Bag of Words)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer: keep at most 1500 terms and ignore terms that occur in
# fewer than 2 reviews.
tf = TfidfVectorizer(max_features=1500, min_df=2)

# Fit and transform the corpus once. The previous version ran fit_transform
# twice (for `x` and `X`), doing the same work and building the same matrix twice.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so IDF weights also reflect the test reviews.
X = tf.fit_transform(corpus).toarray()
y = df['Liked'].values

# Aliases kept so any code that refers to the alternate spellings still works.
x = X
Y = y

# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split, KFold
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

In [None]:
# Shapes of the train/test feature matrices and label vectors
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 693), (200, 693), (800,), (200,))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, accuracy_score

# Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Model-1: Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=4, min_samples_leaf=2, random_state=41)
num_folds = 11
kf = KFold(n_splits=num_folds, shuffle=True, random_state=41)

train_accuracies_rf = []
test_accuracies_rf = []

# 11-fold cross-validation over the full feature matrix: each fold refits the
# forest on the fold's training rows and scores it on both parts.
for train_index, test_index in kf.split(X):
    X_train_rf, X_test_rf = X[train_index], X[test_index]
    y_train_rf, y_test_rf = y[train_index], y[test_index]  # Use 'y' consistently

    rf_model.fit(X_train_rf, y_train_rf)
    y_train_pred_rf = rf_model.predict(X_train_rf)
    y_test_pred_rf = rf_model.predict(X_test_rf)

    train_accuracy_rf = accuracy_score(y_train_rf, y_train_pred_rf)
    test_accuracy_rf = accuracy_score(y_test_rf, y_test_pred_rf)

    train_accuracies_rf.append(train_accuracy_rf)
    test_accuracies_rf.append(test_accuracy_rf)

average_train_accuracy_rf = np.mean(train_accuracies_rf)
average_test_accuracy_rf = np.mean(test_accuracies_rf)
accuracy_difference_rf = abs(average_train_accuracy_rf - average_test_accuracy_rf)

# Refit on the hold-out training split only. Without this, rf_model stays
# fitted on the last CV fold, whose training rows are drawn from all of X and
# therefore overlap X_test; evaluating rf_model on X_test afterwards would
# then score rows the model has already seen.
rf_model.fit(X_train, y_train)


# Model-2: Support Vector Classifier (SVC)
classifier = SVC(kernel='linear', C=0.1, random_state=40)

# Single train/test split (no cross-validation) for the SVC.
classifier.fit(X_train, y_train)
y_train_pred_svc = classifier.predict(X_train)
y_test_pred_svc = classifier.predict(X_test)

test_accuracy_svc = accuracy_score(y_test, y_test_pred_svc)
train_accuracy_svc = accuracy_score(y_train, y_train_pred_svc)

# Only one split is scored, so each "average" equals the single score; the
# list variables are kept so the names mirror the Random Forest section.
train_accuracies_svc = [train_accuracy_svc]
test_accuracies_svc = [test_accuracy_svc]

average_train_accuracy_svc = np.mean(train_accuracies_svc)
average_test_accuracy_svc = np.mean(test_accuracies_svc)
accuracy_difference_svc = abs(average_train_accuracy_svc - average_test_accuracy_svc)



# Model Evaluation

In [None]:
# Report the cross-validated Random Forest accuracies as percentages
rf_train_pct = average_train_accuracy_rf * 100
rf_test_pct = average_test_accuracy_rf * 100
rf_gap_pct = accuracy_difference_rf * 100

print("Model-1 :: RandomForestClassifier")
print(f"Training Accuracy: {rf_train_pct:.2f}%")
print(f"Testing Accuracy: {rf_test_pct:.2f}%")
print(f"Accuracy Difference: {rf_gap_pct:.2f}%")

Model-1 :: RandomForestClassifier
Training Accuracy: 80.53%
Testing Accuracy: 74.31%
Accuracy Difference: 6.22%


In [None]:
# Confusion matrix of the Random Forest on the hold-out test split
# (rows = true class, columns = predicted class)
print("Confusion Matrix of RandomForest :")
rf_test_predictions = rf_model.predict(X_test)
confusion_matrix_rf = confusion_matrix(y_test, rf_test_predictions)
print(confusion_matrix_rf)

Confusion Matrix of RandomForest :
[[98  5]
 [28 69]]


In [None]:
# Report the SVC accuracies on the single train/test split as percentages
svc_train_pct = average_train_accuracy_svc * 100
svc_test_pct = average_test_accuracy_svc * 100
svc_gap_pct = accuracy_difference_svc * 100

print("model-2 :: SVC")
print(f"Training Accuracy: {svc_train_pct:.2f}%")
print(f"Testing Accuracy: {svc_test_pct:.2f}%")
print(f"Accuracy Difference: {svc_gap_pct:.2f}%")

model-2 :: SVC
Training Accuracy: 80.00%
Testing Accuracy: 78.00%
Accuracy Difference: 2.00%


In [None]:
# Confusion matrix of the SVC on the hold-out test split
# (rows = true class, columns = predicted class)
print("Confusion Matrix of SVC")
confusion_matrix_svc = confusion_matrix(y_true=y_test, y_pred=y_test_pred_svc)
print(confusion_matrix_svc)

Confusion Matrix of SVC
[[95  8]
 [36 61]]


# Predictions

In [None]:
def predict_sentiment(sample_review, classifier, tf):
    """Predict the sentiment of a single review.

    Args:
        sample_review: Raw review text (str).
        classifier: Fitted model exposing ``predict``.
        tf: Fitted vectorizer exposing ``transform``; its result must
            provide ``toarray()``.

    Returns:
        1 if the review contains one of the hard-coded positive words,
        otherwise the first element of ``classifier.predict`` (truthy means
        positive, falsy means negative).
    """
    # Vectorise the review that was actually passed in. The previous version
    # read a global named `review` here, so the function only worked when the
    # caller's loop variable happened to have that exact name.
    # NOTE(review): the training corpus was cleaned and stemmed before
    # vectorisation, whereas this receives raw text; confirm whether the same
    # cleaning should be applied here.
    features = tf.transform([sample_review]).toarray()
    # Use the pre-trained classifier to predict sentiment
    sentiment = classifier.predict(features)

    # Post-processing: a positive keyword overrides the model's prediction.
    # NOTE(review): this is a substring match, so "not good" also counts as
    # containing "good".
    positive_words = ["good", "excellent", "amazing", "delicious"]  # Add more positive words as needed
    lowered = sample_review.lower()
    if any(word in lowered for word in positive_words):
        return 1

    return sentiment[0]

# Hand-written example reviews used to try out the predictor
reviews = [
    'The food is really bad.',
    'I love their delicious dishes!',
    'Terrible experience. Avoid this place.',
    'The service was excellent.',
    'Worst place ever, but nice food'
]

# Requires predict_sentiment, the fitted `classifier` and the fitted `tf`
# vectorizer from the cells above.
for review in reviews:
    sentiment = predict_sentiment(review, classifier, tf)
    sentiment_label = 'POSITIVE' if sentiment else 'NEGATIVE'

    # One review per paragraph: text, label, then a blank separator line
    print(f"Review: '{review}'", f"Sentiment: {sentiment_label}", "", sep="\n")


Review: 'The food is really bad.'
Sentiment: NEGATIVE

Review: 'I love their delicious dishes!'
Sentiment: POSITIVE

Review: 'Terrible experience. Avoid this place.'
Sentiment: NEGATIVE

Review: 'The service was excellent.'
Sentiment: POSITIVE

Review: 'Worst place ever, but nice food'
Sentiment: NEGATIVE

