In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings


In [None]:
# Suppress every Python warning for the rest of the session. This keeps the
# notebook output clean, but it also hides deprecation notices from any library.
warnings.filterwarnings('ignore')

In [None]:
# Read the tab-separated reviews file into a DataFrame. The path is relative,
# so the file must sit in the notebook's working directory.
data=pd.read_csv('Restaurant_Reviews.tsv',sep='\t')

In [None]:
# Preview the first rows of the freshly loaded data.
data.head()

In [None]:
# (rows, columns) of the loaded DataFrame.
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of reviews for each distinct value of the 'Liked' label.
data['Liked'].value_counts()

In [None]:
# Preview the data again before the length features are added.
data.head()

In [None]:
# Review length in characters. len() is applied to each entry directly, so
# every value in 'Review' must be a sized object such as a string.
data['char_count']=data['Review'].apply(len)

In [None]:
# Preview the data with the new 'char_count' column.
data.head()

In [None]:
# Number of whitespace-separated tokens in each review; the value is
# converted to str before splitting.
data['word_count']=data['Review'].apply(lambda x: len(str(x).split()))

In [None]:
# Preview the data with the new 'word_count' column.
data.head()

In [None]:
import nltk
# Fetch the Punkt sentence-tokenizer resources, then store the number of
# sentences found in each review.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)
data['sent_count'] = data['Review'].map(lambda text: len(nltk.sent_tokenize(text)))

In [None]:
# Report the mean review length (in characters) for each sentiment class:
# Liked == 1 is printed as Positive, Liked == 0 as Negative.
for report_label, liked_value in (
    ("Avg char count for Positive:", 1),
    ("Avg char count for Negative:", 0),
):
    class_mean = data.loc[data['Liked'] == liked_value, 'char_count'].mean()
    print(report_label, class_mean)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk

nltk.download('stopwords')

# Negation-bearing tokens that are kept in the text (i.e. excluded from the
# stop-word list) because dropping them would remove the sentiment signal.
custom_stopwords = [
    'not', 'no', 'don', 'ain', 'aren', 'couldn', 'didn',
    'doesn', 'hadn', 'hasn', 'haven', 'isn', 'mightn', 'mustn',
    'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn',
]

# Stemmer used by the cleaning loop, and the effective stop-word set:
# NLTK's English list minus the tokens kept above.
ps = PorterStemmer()
stop_words = set(stopwords.words('english')).difference(custom_stopwords)

# Accumulator for the cleaned reviews.
Corpus = list()

In [None]:
# Build the cleaned corpus, one processed string per review.
#
# The list is re-created here so that re-running this cell is idempotent:
# appending to a list created in another cell would double its length on a
# second run and make the column assignment below fail.
Corpus = []

# Iterate over the column values instead of looking rows up by label, so the
# loop does not depend on `data` having a default 0..n-1 index.
for raw_review in data['Review']:
    # 1. Replace every non-alphabetic character with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)

    # 2. Convert to lowercase and split into words
    review = review.lower().split()

    # 3. Drop stop words, then stem what remains
    review = [ps.stem(word) for word in review if word not in stop_words]

    # 4. Join the stems back into a single string and store it
    Corpus.append(' '.join(review))

data['Processed_Text'] = Corpus
data.head()

In [None]:
from wordcloud import WordCloud

# Word-cloud renderer; the same object is used again for the negative reviews.
wc = WordCloud(background_color='white', min_font_size=8, width=800, height=800)

# Concatenate the processed text of every positive review (Liked == 1) and
# render it as a word cloud.
positive_text = ' '.join(data.loc[data['Liked'] == 1, 'Processed_Text'])
pos = wc.generate(positive_text)

plt.figure(figsize=(10, 5))
plt.imshow(pos)
plt.title("Word Cloud for Positive Reviews")
plt.axis("off")
plt.show()

In [None]:
# Concatenate the processed text of every negative review (Liked == 0) and
# render it with the word-cloud object created for the positive reviews.
negative_text = ' '.join(data.loc[data['Liked'] == 0, 'Processed_Text'])
negative = wc.generate(negative_text)

plt.figure(figsize=(10, 5))
plt.imshow(negative)
plt.title("Word Cloud for Negative Reviews")
plt.axis("off")
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer capped at 1500 features
cv = CountVectorizer(max_features=1500)

# Fit on the cleaned corpus, then materialise the result as a dense array (X)
bag_of_words = cv.fit_transform(Corpus)
X = bag_of_words.toarray()

# Target variable (y): the 'Liked' label of each review
y = data['Liked']

X.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit Gaussian Naive Bayes on the training split (fit() returns the estimator)
gnb = GaussianNB().fit(X_train, y_train)

# Score the held-out split
y_pred_gnb = gnb.predict(X_test)
gnb_accuracy = accuracy_score(y_test, y_pred_gnb)
print("Gaussian Naive Bayes Accuracy:", gnb_accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training split (fit() returns the estimator)
lr = LogisticRegression().fit(X_train, y_train)

# Score the held-out split
y_pred_lr = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train
# A random forest is stochastic (bootstrap samples and random feature
# subsets), so the seed is pinned: the accuracy printed below and the model
# saved later are then the same on every Restart & Run All. The value matches
# the seed used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict and Evaluate on the held-out split
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

In [None]:
import joblib

# Persist the Random Forest (the model chosen as best) together with the
# fitted vectorizer, so new text can be transformed the same way at
# prediction time.
for artifact, artifact_path in (
    (rf, 'restaurant_review_model.sav'),
    (cv, 'count_vectorizer.sav'),
):
    joblib.dump(artifact, artifact_path)

print("Random Forest model and CountVectorizer saved.")

In [None]:
import joblib
import ipywidgets as widgets
from IPython.display import display, clear_output
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk

# Download punkt if it is not already available.
# nltk.data.find() signals a missing resource by raising LookupError, so that
# is the exception that must trigger the download.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# Load the saved model and vectorizer.
# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only load artifacts written by this notebook or another trusted source.
try:
    rf_model = joblib.load('restaurant_review_model.sav')
    cv_vectorizer = joblib.load('count_vectorizer.sav')
    print("Model and Vectorizer loaded successfully.")
except FileNotFoundError:
    # Best effort: report the problem and carry on. rf_model / cv_vectorizer
    # stay undefined in that case, so cells that use them will fail.
    print("Error: Model or Vectorizer files not found. Ensure 'restaurant_review_model.sav' and 'count_vectorizer.sav' are in the correct directory.")

In [None]:
def preprocess_review(review_text):
    """Clean one raw review with the same steps used to build the training corpus.

    Steps: replace non-alphabetic characters with spaces, lowercase, split on
    whitespace, drop English stop words (negation tokens are kept), stem each
    remaining word with the Porter stemmer, and re-join with single spaces.

    Parameters
    ----------
    review_text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated stems; empty when no token survives the filtering.
    """
    # Negation tokens stay in the text; this list must match the training setup.
    custom_stopwords = ['not', 'no', 'don', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn']
    ps = PorterStemmer()

    # Fetch the stop-word corpus only when NLTK reports it missing. Catching
    # LookupError specifically (instead of a bare except) lets unrelated
    # errors, including KeyboardInterrupt, propagate.
    try:
        english_stopwords = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords', quiet=True)
        english_stopwords = stopwords.words('english')

    stop_words = set(english_stopwords) - set(custom_stopwords)

    # Clean the text
    review = re.sub('[^a-zA-Z]', ' ', review_text)
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    return review

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Optional: make plots pretty
# Applies matplotlib's 'seaborn-v0_8-darkgrid' style sheet and sets "coolwarm"
# as seaborn's current palette for the plots that follow.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("coolwarm")

# Inputs expected from earlier cells:
#   X             - bag-of-words feature matrix covering every review
#   y             - sentiment label of every review
#   y_pred_rf     - Random Forest predictions for the held-out test split
#   cv_vectorizer - fitted CountVectorizer reloaded from disk

# Use the predictions from the Random Forest model
y_pred = y_pred_rf

# 1. Correlation Analysis
try:
    # X is produced with .toarray() and is therefore already dense; only
    # densify when a sparse matrix is passed in. (Calling .toarray() on an
    # ndarray raises AttributeError, which made this plot get skipped.)
    feature_matrix = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    feature_df = pd.DataFrame(feature_matrix, columns=cv_vectorizer.get_feature_names_out())

    # X spans the whole dataset, so it is correlated with the full label
    # vector y (not y_test). The labels are re-indexed positionally because
    # corrwith() aligns the two operands on their index.
    labels = pd.Series(np.asarray(y), index=feature_df.index)
    corr_matrix = feature_df.corrwith(labels)

    corr_df = pd.DataFrame({
        'Feature': corr_matrix.index,
        'Correlation': corr_matrix.values
    }).sort_values(by='Correlation', ascending=False)

    # Top 20 correlated words (largest positive correlation first)
    plt.figure(figsize=(10,5))
    sns.barplot(data=corr_df.head(20), x='Correlation', y='Feature', palette='viridis')
    plt.title("Top 20 Features Most Correlated with Sentiment", fontsize=14, fontweight='bold')
    plt.xlabel("Correlation with Sentiment (1=Positive, -1=Negative)")
    plt.ylabel("Word Feature")
    plt.show()
except Exception as e:
    # Best effort: the remaining evaluation plots do not depend on this one,
    # so report the problem and continue.
    print("Skipping feature correlation plot ‚Äî feature extraction not vectorized as DataFrame.")
    print(e)


# 2. Confusion Matrix Heatmap
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm', cbar=False, ax=ax)
ax.set_title(f"Confusion Matrix (Accuracy = {acc:.2f})", fontsize=14, fontweight='bold')
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

# 3. Classification Report (Text Summary)
report_text = classification_report(y_test, y_pred, target_names=["Negative", "Positive"])
print("\nüìã Classification Report:\n")
print(report_text)

# 4. Sentiment Distribution Graph (true labels of the test split)
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x=y_test, palette="mako", ax=ax)
ax.set_title("Distribution of True Sentiment Labels", fontsize=13, fontweight='bold')
ax.set_xlabel("Sentiment (0=Negative, 1=Positive)")
ax.set_ylabel("Count")
plt.show()

# 5. Test accuracy of each trained model
# None of the classifiers in this notebook is trained in epochs, so there is
# no learning curve to show. This chart plots the accuracy measured on the
# held-out test split for the three fitted models, so every number in the
# figure comes from the data rather than from illustrative placeholders.
model_accuracy = {
    "Gaussian NB": accuracy_score(y_test, y_pred_gnb),
    "Logistic Regression": accuracy_score(y_test, y_pred_lr),
    "Random Forest": accuracy_score(y_test, y_pred_rf),
}

plt.figure(figsize=(7,4))
plt.bar(list(model_accuracy.keys()), list(model_accuracy.values()))
plt.ylim(0, 1)  # accuracy is a fraction; a fixed axis keeps bars comparable between runs
plt.title("Test Accuracy by Model", fontsize=13, fontweight='bold')
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.show()

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from textblob import TextBlob

# --- Custom Enhanced CSS ---
# Stylesheet for the review-classifier UI, displayed once as a raw <style>
# element through an HTML widget. The selectors match the class names that the
# widgets attach with add_class() ("sentiment-textarea", "sentiment-button")
# or emit in their own markup ("top-card", "app-header", "subheader",
# "output-card" and its positive/negative/neutral/warning variants).
# NOTE(review): the `body` rule is not scoped to the app, so it presumably
# restyles the whole page that hosts the output - confirm in the target front end.
custom_css = """
<style>
    body {
        background: linear-gradient(135deg, #caf0f8, #ade8f4);
        font-family: 'Poppins', sans-serif;
    }

    .app-container {
        background: rgba(255, 255, 255, 0.55);
        border-radius: 25px;
        padding: 45px 35px;
        width: 75%;
        margin: 50px auto;
        text-align: center;
        box-shadow: 0 8px 35px rgba(0,0,0,0.1);
    }

    .top-card {
        background: rgba(255, 255, 255, 0.9);
        border-radius: 20px;
        padding: 30px;
        margin-bottom: 35px;
        box-shadow: 0 6px 25px rgba(173, 216, 230, 0.6);
        text-align: center;
    }

    .top-card h1 {
        font-size: 30px;
        font-weight: 800;
        color: #0077b6;
        margin-bottom: 10px;
    }

    .top-card p {
        font-size: 16px;
        color: #023e8a;
        margin-top: 5px;
        font-weight: 500;
    }

    .app-header {
        background: linear-gradient(90deg, #0096c7, #0077b6);
        color: white;
        padding: 16px 0;
        border-radius: 14px;
        font-size: 25px;
        font-weight: 700;
        margin-bottom: 25px;
    }

    .subheader {
        color: #1e293b;
        font-size: 15px;
        margin-top: -10px;
        margin-bottom: 25px;
    }

    .sentiment-textarea textarea {
        border-radius: 12px !important;
        border: 1.5px solid #b0d9ff !important;
        background-color: #f9fcff !important;
        font-size: 15px;
        color: #1e293b !important;
        padding: 12px;
        width: 100%;
    }

    /* --- Classify Button --- */
    .sentiment-button {
        background: linear-gradient(90deg, #00b4d8, #0077b6);
        color: white !important;
        font-weight: 700;
        border-radius: 40px;
        border: none !important;
        padding: 16px 32px;
        font-size: 18px;
        width: 60%;
        margin-top: 25px;
        cursor: pointer;
        transition: transform 0.25s ease, box-shadow 0.3s ease;
        display: flex;
        justify-content: center;
        align-items: center;
        height: 60px;
        box-shadow: 0 6px 15px rgba(0, 119, 182, 0.3);
    }

    .sentiment-button:hover {
        transform: scale(1.05);
        box-shadow: 0 8px 25px rgba(0, 183, 255, 0.4);
    }

    /* --- Output Cards --- */
    .output-card {
        border-radius: 15px;
        padding: 20px;
        margin-top: 30px;
        color: #1e293b;
        font-size: 16px;
        font-weight: 500;
        box-shadow: 0 3px 15px rgba(0,0,0,0.08);
        animation: fadeIn 0.6s ease-in-out;
        text-align: left;
    }

    .positive-card {
        background: linear-gradient(135deg, #e0ffe8, #b8ffd0);
        border-left: 6px solid #2ecc71;
    }

    .negative-card {
        background: linear-gradient(135deg, #ffdcdc, #ffb3b3);
        border-left: 6px solid #e74c3c;
    }

    .neutral-card {
        background: linear-gradient(135deg, #fffbe5, #fff6c9);
        border-left: 6px solid #f1c40f;
    }

    /* --- Warning (empty input) --- */
    .warning-card {
        background: linear-gradient(135deg, #d9fbee, #b3f4d0);
        border-left: 6px solid #00b894;
        color: #065f46;
        font-weight: 600;
    }

    @keyframes fadeIn {
        from { opacity: 0; transform: translateY(12px); }
        to { opacity: 1; transform: translateY(0); }
    }
</style>
"""
# Emit the stylesheet into the cell output so the rules are present in the page.
display(widgets.HTML(custom_css))

# --- Widgets ---
# Welcome banner shown above the app header.
top_card = widgets.HTML(value='''
<div class="top-card">
    <h1>üç¥ Welcome to the Food Sentiment Analyzer</h1>
    <p>Discover how your customers truly feel about their dining experience.</p>
</div>
''')

# Title bar and tagline.
header = widgets.HTML(value='<div class="app-header">üçΩÔ∏è Restaurant Review Classifier</div>')
subheader = widgets.HTML(value='<div class="subheader">AI-powered Sentiment Detection for Restaurant Reviews</div>')

# Free-text box for the review to classify.
textarea_layout = widgets.Layout(height='120px', width='100%')
text_input = widgets.Textarea(
    placeholder='Write your restaurant review here...',
    value='',
    layout=textarea_layout,
)
text_input.add_class("sentiment-textarea")

# Button that triggers the classification.
button_layout = widgets.Layout(align_self='center', width='60%')
classify_button = widgets.Button(
    layout=button_layout,
    description='üöÄ  Classify Sentiment',
)
classify_button.add_class("sentiment-button")

# Area that receives the result card.
output_widget = widgets.Output()

# --- Real Sentiment Logic ---
def on_classify_button_clicked(b):
    """Classify the text in ``text_input`` and render a result card.

    The review is scored with TextBlob's polarity. A polarity above 0.1 is
    shown as positive, below -0.1 as negative, and anything in between as
    neutral. Empty or whitespace-only input produces a warning card instead.
    All output goes to ``output_widget``, replacing what was shown before.

    NOTE(review): the prediction comes from TextBlob, not from the Random
    Forest model saved earlier in this notebook - confirm this is intended.

    Parameters
    ----------
    b : widgets.Button
        The button that fired the click event (unused).
    """
    # Local import keeps this cell self-contained.
    from html import escape

    with output_widget:
        clear_output(wait=True)
        review = text_input.value.strip()
        if not review:
            display(widgets.HTML('<div class="output-card warning-card">‚ö†Ô∏è Please enter a review first.</div>'))
            return

        polarity = TextBlob(review).sentiment.polarity

        if polarity > 0.1:
            sentiment = 'POSITIVE üòäüëç'
            card_class = 'positive-card'
            color = '#2ecc71'
        elif polarity < -0.1:
            sentiment = 'NEGATIVE üò†üëé'
            card_class = 'negative-card'
            color = '#e74c3c'
        else:
            sentiment = 'NEUTRAL üòê'
            card_class = 'neutral-card'
            color = '#f1c40f'

        # The review is free-form user input that is echoed back inside HTML:
        # escape it so that characters such as <, > and & are displayed as
        # text instead of being interpreted as markup.
        safe_review = escape(review)

        sentiment_html = f"""
        <div class="output-card {card_class}">
            <b>Review:</b> "{safe_review}"<br><br>
            <b>Predicted Sentiment:</b>
            <span style="color:{color}; font-weight:700;">{sentiment}</span>
        </div>
        """
        display(widgets.HTML(sentiment_html))

# Run the classifier whenever the button is clicked.
classify_button.on_click(on_classify_button_clicked)

# --- Layout ---
# Children are stacked vertically in the order listed.
app_children = [
    widgets.HTML('<div class="app-container"></div>'),
    top_card,
    header,
    subheader,
    text_input,
    classify_button,
    output_widget,
]
app_layout = widgets.VBox(app_children)

display(app_layout)
