In [110]:
##-----Part B-----## 
##Predictive modelling##

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression as SkLR


import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics




In [112]:
print("scikit-learn version:", sklearn.__version__)
print("PySpark version:", pyspark.__version__)

scikit-learn version: 1.7.1
PySpark version: 4.0.1


In [113]:
df=pd.read_csv("C:/Users/aravi/Downloads/zomato_df_final_data.csv")
print(f"Dataset dimensions: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())


Dataset dimensions: (10500, 17)

First 5 rows:


Unnamed: 0,address,cost,cuisine,lat,link,lng,phone,rating_number,rating_text,subzone,title,type,votes,groupon,color,cost_2,cuisine_color
0,"371A Pitt Street, CBD, Sydney",50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,https://www.zomato.com/sydney/sydney-madang-cbd,151.207605,02 8318 0406,4.0,Very Good,CBD,Sydney Madang,['Casual Dining'],1311.0,False,#e15307,5.243902,#6f706b
1,"Shop 7A, 2 Huntley Street, Alexandria, Sydney",80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,https://www.zomato.com/sydney/the-grounds-of-a...,151.193793,02 9699 2225,4.6,Excellent,"The Grounds of Alexandria, Alexandria",The Grounds of Alexandria Cafe,['Café'],3236.0,False,#9c3203,7.560976,#6f706b
2,"Level G, The Darling at the Star, 80 Pyrmont ...",120.0,['Japanese'],-33.867971,https://www.zomato.com/sydney/sokyo-pyrmont,151.19521,1800 700 700,4.9,Excellent,"The Star, Pyrmont",Sokyo,['Fine Dining'],1227.0,False,#7f2704,10.650407,#6f706b
3,"Sydney Opera House, Bennelong Point, Circular...",270.0,['Modern Australian'],-33.856784,https://www.zomato.com/sydney/bennelong-restau...,151.215297,02 9240 8000,4.9,Excellent,Circular Quay,Bennelong Restaurant,"['Fine Dining', 'Bar']",278.0,False,#7f2704,22.235772,#4186f4
4,"20 Campbell Street, Chinatown, Sydney",55.0,"['Thai', 'Salad']",-33.879035,https://www.zomato.com/sydney/chat-thai-chinatown,151.206409,02 8317 4811,4.5,Excellent,Chinatown,Chat Thai,['Casual Dining'],2150.0,False,#a83703,5.630081,#6f706b


In [114]:
print(df.columns.tolist())

['address', 'cost', 'cuisine', 'lat', 'link', 'lng', 'phone', 'rating_number', 'rating_text', 'subzone', 'title', 'type', 'votes', 'groupon', 'color', 'cost_2', 'cuisine_color']


In [115]:
print("\nData types:")
print(df.dtypes)


Data types:
address           object
cost             float64
cuisine           object
lat              float64
link              object
lng              float64
phone             object
rating_number    float64
rating_text       object
subzone           object
title             object
type              object
votes            float64
groupon             bool
color             object
cost_2           float64
cuisine_color     object
dtype: object


In [116]:
#Question 1

In [117]:
##1.Handling missing values
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
address             0
cost              346
cuisine             0
lat               192
link                0
lng               192
phone               0
rating_number    3316
rating_text      3316
subzone             0
title               0
type               48
votes            3316
groupon             0
color               0
cost_2            346
cuisine_color       0
dtype: int64


In [118]:
#Now we are going to handle the missing the data here

In [119]:
#we impute numeric columns with -1 and categorical values with 'unknown'
for col in df.columns:
     # categorical values
    if df[col].dtype == "object":  
        df[col] = df[col].fillna("Unknown")
    else:            
        # numeric values               
        df[col] = df[col].fillna(-1)

In [120]:
#Now we are dropping values by detecting outliers using the IQR method
Q1 = df['cost'].quantile(0.25)
Q3 = df['cost'].quantile(0.75)
IQR = Q3 - Q1

outliers = df[(df['cost'] < (Q1 - 1.5 * IQR)) | (df['cost'] > (Q3 + 1.5 * IQR))]
print(f"Number of outliers in cost: {len(outliers)}")

Number of outliers in cost: 361


In [121]:
#since we have these many outliers we need to drop them
df = df[~((df['cost'] < (Q1 - 1.5 * IQR)) | (df['cost'] > (Q3 + 1.5 * IQR)))]
print("Dataset after removing outliers:", df.shape)

Dataset after removing outliers: (10139, 17)


In [122]:
##Encoding categorical features##

In [123]:
#label encoding

In [124]:
le = LabelEncoder()

In [125]:
df['rating_text_encoded'] = le.fit_transform(df['rating_text'])
print(dict(zip(le.classes_, le.transform(le.classes_))))

{'Average': np.int64(0), 'Excellent': np.int64(1), 'Good': np.int64(2), 'Poor': np.int64(3), 'Unknown': np.int64(4), 'Very Good': np.int64(5)}


In [126]:
##creating useful features##
##some of the useful features includes, cuisine diversity,cost,votes,ratings in number and texts, subzone etc

In [127]:
#analysing the number of cuisines, the restaurant serves
df['cuisine_count'] = df['cuisine'].apply(lambda x: len(str(x).split(',')))


In [128]:
print(df['cuisine_count'])

0        4
1        4
4        2
7        1
9        2
        ..
10495    1
10496    1
10497    1
10498    1
10499    1
Name: cuisine_count, Length: 10139, dtype: int64


In [129]:
##finding the number of people voted on the basis of voting
df['rating_strength'] = df['rating_number'] * np.log1p(df['votes'])


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [130]:
print(df['rating_strength'])

0        28.717232
1        37.179050
4        34.531597
7        33.368528
9        33.831289
           ...    
10495          inf
10496          inf
10497          inf
10498          inf
10499          inf
Name: rating_strength, Length: 10139, dtype: float64


In [131]:
#finding out the restaurants which are popular and not. 
df['is_popular'] = (df['votes'] > df['votes'].median()).astype(int)


In [132]:
print(df['is_popular'])
#if it shows 1 it is popular and if it show 0 its not popular

0        1
1        1
4        1
7        1
9        1
        ..
10495    0
10496    0
10497    0
10498    0
10499    0
Name: is_popular, Length: 10139, dtype: int64


In [133]:
##finding the cost bins
bins = [0, 50, 100, 200, 400, 800, 1600, float('inf')]
labels = ['Very Low', 'Low', 'Mid-Low', 'Mid', 'Mid-High', 'High', 'Luxury']
df['cost_bin'] = pd.cut(df['cost'], bins=bins, labels=labels, include_lowest=True)


In [134]:
df['cost_bin']

0        Very Low
1             Low
4             Low
7             Low
9             Low
           ...   
10495    Very Low
10496    Very Low
10497    Very Low
10498    Very Low
10499         Low
Name: cost_bin, Length: 10139, dtype: category
Categories (7, object): ['Very Low' < 'Low' < 'Mid-Low' < 'Mid' < 'Mid-High' < 'High' < 'Luxury']

In [135]:
##----2.Regression Models------##

In [136]:
#--Model A-- Linear Regression---#

##Here we are predicting rating_number(target variable) using Scikit-learn
##For this we are using features like cuisine_count, cost, votes etc

In [137]:
# make dummy variables for rating_text (if not already created)
if 'rating_text' in df.columns:
    dummies = pd.get_dummies(df['rating_text'], prefix='rating_text')
    df = pd.concat([df, dummies], axis=1)

In [138]:
# features chosen
feature_cols = ['cost','votes','cuisine_count',
                'rating_text_Excellent','rating_text_Good',
                'rating_text_Poor','rating_text_Very Good']

In [139]:
X = df[feature_cols]
y = df['rating_number']  

In [140]:
# Target variable
y = df['rating_number']

In [141]:
# Step 2: Train-test split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [142]:
# Step 3: Train Linear Regression
# -------------------------
model = LinearRegression()
model.fit(X_train, y_train)

AssertionError: 

In [None]:
#making predictions
y_prediction_modelA = modelA.predict(X_test)

In [None]:
print(y_prediction_modelA)

In [None]:
#Calculating Mean squared error(MSE)
mse_modelA = mean_squared_error(y_test, y_prediction_modelA)

In [None]:
print('MSE value for Linear Regression using Scikit-learn is: ',mse_modelA)

In [None]:
## Model B- Gradient Descent Regression - implementing linear regression with gradient descent##

In [None]:
#Here we are using the features implemented for model A as well#

X = df[['cost', 'votes', 'cuisine_count', 
        'rating_text_Excellent', 'rating_text_Good', 
        'rating_text_Poor', 'rating_text_Very Good']].values
y = df['rating_number'].values

In [None]:
 # implementing 80/20 split for train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
##implementing feature scaling needed for gradient descent
##for this we are doing scaler for generalising features needed for gradient descent.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:

X_train_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_test_b = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

In [None]:
#applying gradient descent 

def gradient_descent(X, y, lr=0.01, n_iter=1000):
    m, n = X.shape
    theta = np.zeros(n)  # initialize weights
    for _ in range(n_iter):
        gradients = (1/m) * X.T.dot(X.dot(theta) - y)
        theta -= lr * gradients
    return theta

In [None]:
#training the model
theta = gradient_descent(X_train_b, y_train, lr=0.01, n_iter=1000)

In [None]:
print(theta)

In [None]:
#making predictions

y_prediction_modelB = X_test_b.dot(theta)


In [None]:
#calcuating Mean Squared Value(MSE)
mse_modelB = mean_squared_error(y_test, y_prediction_modelB)

In [None]:
print('MSE value for Gradient descent regression: ',mse_modelB)

In [None]:
### 3. CLASSIFICATION MODELS ###
# here we dont have rating _text column. Here we are are going to have 2 classes.
#class 1 : Poor + Average
#class 2 : Good + Very Good + Excellent

#we set class1 as the default class here
df['rating_binary'] = 1


In [None]:
#We set those having rating as goog or very good or excellent as class 2
df.loc[(df['rating_text_Good'] == 1) | 
       (df['rating_text_Very Good'] == 1) | 
       (df['rating_text_Excellent'] == 1), 'rating_binary'] = 2

In [None]:
print(df['rating_binary'].value_counts())

In [None]:
## Applying logistic regression keeping rating_binary as target variable

X = df[['cost', 'votes', 'cuisine_count']]   
y = df['rating_binary']       

In [None]:
#we are doing train-test split for (80/20) here
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
#implementing training for logistic regression
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)


In [None]:
#making predictions
y_prediction = logistic_regression_model.predict(X_test)

In [None]:
#evaluation with confusion matrix which shows how many restaurants were classified properly
confusion_matrix = confusion_matrix(y_test, y_prediction)


In [None]:
print("Confusion Matrix: ",confusion_matrix)

In [None]:
#Evaluation using precision which depicts the to what extent the predicition of classification was proper
precision = precision_score(y_test, y_prediction, pos_label=2)

In [None]:
print("PRECISION : ",precision)

In [None]:
#Evaluation for recall which explains how much the model correctly identified.
recall = recall_score(y_test, y_prediction, pos_label=2)

In [None]:
print("Recall: ", recall)

In [None]:
#Evaluation for F1 which explains how balanced, the values of precision and recall are
f1_Score = f1_score(y_test, y_pred, pos_label=2)



In [None]:
print("F1 Score : ",f1_Score)

In [None]:
## TRAININIG 3 MORE MODELS##

#Initiailizing a variable to store the results of each model and show it as a table

results_models = {}

In [None]:
##Random Forest model
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_random_forest = random_forest_model.predict(X_test)

In [None]:
results_models['Random Forest'] = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred_random_forest).tolist(),
    'Precision': round(precision_score(y_test, y_pred_random_forest, pos_label=2), 3),
    'Recall': round(recall_score(y_test, y_pred_random_forest, pos_label=2), 3),
    'F1': round(f1_score(y_test, y_pred_random_forest, pos_label=2), 3)
}

In [None]:
print(results_models['Random Forest'])

In [None]:
###Gradient Boosted trees##
gradient_boost_model = GradientBoostingClassifier(random_state=42)
gradient_boost_model.fit(X_train, y_train)
y_pred_gradient_boost = gradient_boost_model.predict(X_test)

In [None]:
results_models['Gradient Boosted Trees'] = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred_gradient_boost).tolist(),
    'Precision': round(precision_score(y_test, y_pred_gradient_boost, pos_label=2), 3),
    'Recall': round(recall_score(y_test, y_pred_gradient_boost, pos_label=2), 3),
    'F1': round(f1_score(y_test, y_pred_gradient_boost, pos_label=2), 3)
}

In [None]:
print(results_models['Gradient Boosted Trees'])

In [None]:
### SVM Model ###
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm_model= svm_model.predict(X_test)

In [None]:
results_models['SVM'] = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred_svm_model).tolist(),
    'Precision': round(precision_score(y_test, y_pred_svm_model, pos_label=2), 3),
    'Recall': round(recall_score(y_test, y_pred_svm_model, pos_label=2), 3),
    'F1': round(f1_score(y_test, y_pred_svm_model, pos_label=2), 3)
}

In [None]:
print(results_models['SVM'] )

In [None]:
## converting the data we got into a table form for comparison

results_data_frame = pd.DataFrame(results_models).T
print(results_data_frame)

In [None]:
from pyspark.sql import SparkSession

In [None]:
from pyspark.sql import SparkSession

# Start Spark session for this notebook
spark = SparkSession.builder.appName("Zomato-MLlib").getOrCreate()
print("Spark started ✓")

In [None]:
#first we are bringing the data to spark
df_spark = spark.createDataFrame(df)
