In [None]:
# This cell is intentionally left blank. See the new notebook structure below.

In [None]:
# This cell is intentionally left blank. See the new notebook structure below.

# Predictive Modelling of Eating-Out Problem

## Assignment Overview

This notebook addresses the requirements for the Predictive Modelling of Eating-Out Problem assignment. We will perform exploratory data analysis (EDA), build regression and classification models using Scikit-Learn and PySpark MLlib, and ensure reproducibility using Git, Git LFS, and DVC. The dataset is 'zomato_df_final_data.csv' with restaurant details from Sydney (2018), and 'sydney.geojson' for geospatial analysis.



### Objectives

- Perform EDA and visualize key insights.

- Build and evaluate regression and classification models.

- Use PySpark MLlib for scalable modeling.

- Ensure reproducibility with Git, Git LFS, and DVC.



**Note**: Run this notebook in an environment with required libraries installed (see requirements.txt). Replace file paths if necessary.

## Deliverables
- This notebook with code and outputs.
- PDF report summarizing findings (export this notebook to PDF).
- GitHub link: `<insert_link>` (replace with the repository URL before submission)

### Reflection
PySpark is better for scalability but Scikit-Learn is simpler for small data. Models perform reasonably; Random Forest had best F1.

### Git & GitHub
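Initialize Git (the branch is renamed to `main` so the final push works even when `git init` creates `master` by default):
```bash
git init
git lfs install
git lfs track "*.csv" "*.geojson" "*.pkl"
git add .
git commit -m "Initial commit"
git branch -M main
git remote add origin <github_repo_url>
git push -u origin main
```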

### README.md
- Install dependencies: `pip install -r requirements.txt`
- Run: `jupyter notebook EatingOut_Analysis.ipynb`
- Expected: EDA plots, model MSE/F1 scores.

### Data Version Control (DVC)
```bash
dvc init
dvc add zomato_df_final_data.csv sydney.geojson
dvc remote add -d storage <remote>
dvc push
```

Pipeline definition (`dvc.yaml`):

```yaml
stages:
  preprocess:
    cmd: python preprocess.py
    deps:
      - zomato_df_final_data.csv
    outs:
      - processed_data.csv
  model:
    cmd: python model.py
    deps:
      - processed_data.csv
    outs:
      - model.pkl
  evaluate:
    cmd: python evaluate.py
    deps:
      - model.pkl
      - processed_data.csv
```

Run the pipeline with `dvc repro`.

## Part C – Reproducibility and Workflow (10 marks)

In [None]:
# Start (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName('EatingOut').getOrCreate()

# Send only the modelling columns to Spark. The full pandas frame also carries
# parsed list columns and other object columns that the models never read and
# that may break Spark's schema inference in createDataFrame.
feature_cols = ['cost', 'votes', 'subzone_encoded', 'cuisine_diversity']
label_cols = ['rating_number', 'class']
spark_df = spark.createDataFrame(df[feature_cols + label_cols])

# Features: pack the numeric predictors into the single vector column MLlib expects.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Fixed seed so the 80/20 split (and therefore the reported metrics) is
# reproducible across runs.
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)

# Regression with PySpark
lr_spark = SparkLinearRegression(featuresCol='features', labelCol='rating_number')
pipeline_reg = SparkPipeline(stages=[assembler, lr_spark])
model_reg = pipeline_reg.fit(train)
pred_reg = model_reg.transform(test)

evaluator = RegressionEvaluator(labelCol='rating_number', metricName='mse')
mse_spark = evaluator.evaluate(pred_reg)
print('Spark MSE:', mse_spark)

# Classification with PySpark
log_reg_spark = SparkLogisticRegression(featuresCol='features', labelCol='class')
pipeline_class = SparkPipeline(stages=[assembler, log_reg_spark])
model_class = pipeline_class.fit(train)
pred_class = model_class.transform(test)

evaluator_class = MulticlassClassificationEvaluator(labelCol='class', metricName='f1')
f1_spark = evaluator_class.evaluate(pred_class)
print('Spark F1:', f1_spark)

# Comparison: PySpark is more scalable for large data, but similar accuracy; Scikit-Learn is faster for small datasets.

### 4. PySpark Models

In [None]:
# Encode rating_text as the classification target:
#   1 = 'Poor' / 'Average'
#   2 = 'Good' / 'Very Good' / 'Excellent'
#   0 = any other (or missing) label
RATING_CLASS = {'Poor': 1, 'Average': 1, 'Good': 2, 'Very Good': 2, 'Excellent': 2}
df['class'] = df['rating_text'].map(RATING_CLASS).fillna(0).astype(int)
y_class = df['class']

# Split (same seed and proportions as the regression split)
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)


def weighted_scores(y_true, y_pred):
    """Return weighted-average precision, recall and F1 for one set of predictions."""
    return {
        'Precision': precision_score(y_true, y_pred, average='weighted'),
        'Recall': recall_score(y_true, y_pred, average='weighted'),
        'F1': f1_score(y_true, y_pred, average='weighted'),
    }


def scaled(estimator):
    """Wrap an estimator in a pipeline that standardises the features first.

    The features are on very different scales, and logistic regression, SVMs
    and neural networks are sensitive to that. The scaler is fit on the
    training fold only, so no test information leaks into it.
    """
    return Pipeline([('scaler', StandardScaler()), ('model', estimator)])


# Logistic Regression
log_reg = scaled(LogisticRegression())
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

print(confusion_matrix(y_test, y_pred_log))
for metric, value in weighted_scores(y_test, y_pred_log).items():
    print(f'{metric}:', value)

# Other models. Stochastic learners get a fixed random_state so the comparison
# table is reproducible; tree ensembles do not need feature scaling.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosted Trees': GradientBoostingClassifier(random_state=42),
    'SVM': scaled(SVC()),
    'Neural Net': scaled(MLPClassifier(random_state=42)),
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append({'Model': name, **weighted_scores(y_test, y_pred)})

# Compare in table
results_df = pd.DataFrame(results)
print(results_df)

### 3. Classification Models

In [None]:
# Features and target
X = df[['cost', 'votes', 'subzone_encoded', 'cuisine_diversity']]
y_reg = df['rating_number']

# Split: 80/20 with a fixed seed so both models see the same folds on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# Model A: Linear Regression (closed-form least squares; scale-invariant)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print('MSE Linear Regression:', mse_lr)

# Model B: Gradient Descent Regression (using SGDRegressor)
# SGD is sensitive to feature scale, so the features are standardised inside a
# pipeline (scaler fit on the training fold only). random_state fixes the
# shuffling so the reported MSE is reproducible.
sgd = Pipeline([
    ('scaler', StandardScaler()),
    ('sgd', SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)),
])
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict(X_test)
mse_sgd = mean_squared_error(y_test, y_pred_sgd)
print('MSE Gradient Descent:', mse_sgd)

### 2. Regression Models

In [None]:
# Handle missing values: drop rows with a missing target, impute the others.
# .copy() gives an independent frame, which avoids a possible
# SettingWithCopyWarning on the column assignments below.
df = df.dropna(subset=['rating_number', 'rating_text']).copy()
df['cost'] = df['cost'].fillna(df['cost'].median())
df['votes'] = df['votes'].fillna(df['votes'].median())

# Justify: Dropping missing targets as they are essential; median impute for numeric to avoid bias.

# Encode categorical: integer code per subzone (NaN becomes the string 'nan').
le = LabelEncoder()
df['subzone_encoded'] = le.fit_transform(df['subzone'].astype(str))

# Create features: cuisine diversity = number of cuisines listed for the restaurant.
# Expects df['cuisine'] to already hold parsed lists.
df['cuisine_diversity'] = df['cuisine'].apply(len)

# Remove dummy columns left by a previous run of this cell so that re-running
# it does not append a second set of identically named cost_bin_* columns.
stale_dummies = [col for col in df.columns if str(col).startswith('cost_bin_')]
df = df.drop(columns=stale_dummies)

# include_lowest=True keeps cost == 0 in the 'Low' bin; without it the first
# interval is (0, 50] and a zero cost would be left unbinned.
df['cost_bin'] = pd.cut(
    df['cost'],
    bins=[0, 50, 100, 150, np.inf],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)

# One-hot for cost_bin
df = pd.get_dummies(df, columns=['cost_bin'])

### 1. Feature Engineering

## Part B – Predictive Modelling (20 marks)

In [None]:
# Interactive scatter: cost on x, numeric rating on y, points coloured by the
# rating label, with the suburb shown on hover.
scatter_options = dict(
    x='cost',
    y='rating_number',
    color='rating_text',
    hover_data=['subzone'],
)
fig = px.scatter(df, **scatter_options)
fig.show()

# Explanation: Interactive version allows hovering for details and zooming, better than static for exploring outliers.

### Interactive Visualisation

In [None]:
# Load geojson
gdf = gpd.read_file('sydney.geojson')

# Choose a cuisine, e.g., 'Japanese': flag each restaurant, then count per suburb.
# Expects df['cuisine'] to already hold parsed lists.
df['has_japanese'] = df['cuisine'].apply(lambda x: 1 if 'Japanese' in x else 0)
suburb_counts = df.groupby('subzone')['has_japanese'].sum().reset_index()
suburb_counts.columns = ['SSC_NAME', 'count']

# Merge with geo. Left join keeps every suburb polygon; suburbs with no
# restaurants get count 0. Only the 'count' column is filled, so geometry and
# the other geojson attributes are left untouched.
merged = gdf.merge(suburb_counts, on='SSC_NAME', how='left')
merged['count'] = merged['count'].fillna(0)

# Plot
ax = merged.plot(column='count', cmap='OrRd', legend=True)
ax.set_title('Japanese Cuisine Density per Suburb')
plt.show()

### Geospatial Analysis

In [None]:
# Distribution of cost, ratings, restaurant types
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sns.histplot(df['cost'], ax=axes[0])
axes[0].set_title('Cost Distribution')

sns.histplot(df['rating_number'], ax=axes[1])
axes[1].set_title('Rating Distribution')

# Parse the stringified lists in 'type'. Values that are already lists are kept
# as they are, so re-running this cell does not wipe the column to empty lists.
# Anything else (e.g. NaN) becomes an empty list.
df['type'] = df['type'].apply(
    lambda x: x if isinstance(x, list)
    else (ast.literal_eval(x) if isinstance(x, str) else [])
)
all_types = [t for sublist in df['type'] for t in sublist]
sns.countplot(y=all_types, ax=axes[2])
axes[2].set_title('Restaurant Types')

plt.show()

# Correlation between cost and votes
sns.scatterplot(x='cost', y='votes', data=df)
plt.title('Correlation between Cost and Votes')
plt.show()

print('Correlation:', df['cost'].corr(df['votes']))

# Insights: Higher cost restaurants tend to have more votes, indicating popularity. Casual Dining is the most common type.

### Explore Key Variables

In [None]:
def parse_cuisine_cell(value):
    """Return the cuisine list for one cell of the 'cuisine' column.

    Strings are parsed with ast.literal_eval, and values that are already lists
    are returned unchanged. This makes the cell safe to re-run: previously a
    second run turned every parsed list into []. Anything else (e.g. NaN)
    becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return ast.literal_eval(value)
    return []


# Parse cuisine lists
df['cuisine'] = df['cuisine'].apply(parse_cuisine_cell)

# How many unique cuisines?
all_cuisines = [c for sublist in df['cuisine'] for c in sublist]
unique_cuisines = set(all_cuisines)
print(f'Number of unique cuisines: {len(unique_cuisines)}')

# Top 3 suburbs with most restaurants
top_suburbs = df['subzone'].value_counts().head(3)
print('Top 3 suburbs:', top_suburbs)

# Plot top suburbs
sns.barplot(x=top_suburbs.index, y=top_suburbs.values)
plt.title('Top 3 Suburbs with Most Restaurants')
plt.show()

# Are 'Excellent' ratings more expensive than 'Poor'?
excellent_cost = df[df['rating_text'] == 'Excellent']['cost']
poor_cost = df[df['rating_text'] == 'Poor']['cost']

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(excellent_cost, ax=ax[0])
ax[0].set_title('Cost Distribution for Excellent Ratings')
sns.histplot(poor_cost, ax=ax[1])
ax[1].set_title('Cost Distribution for Poor Ratings')
plt.show()

# Insights: Excellent rated restaurants tend to have higher average costs than Poor rated ones, as seen in the histograms.

### Answer Specific Questions with Plots

In [None]:
# Load the dataset
df = pd.read_csv('zomato_df_final_data.csv')

# Check missing values, data types, and summary statistics.
# df.info() prints its report itself and returns None, so it is called directly
# rather than wrapped in print(), which would add a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())

# Insights: The dataset has over 10,000 records with some missing values in rating_number, lat, lng, etc. We will handle them in feature engineering.

### Load and Explore the Dataset

## Part A – Exploratory Data Analysis (20 marks)

In [None]:
!pip install pandas matplotlib seaborn geopandas plotly scikit-learn pyspark dvc



import ast
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from pyspark.ml import Pipeline as SparkPipeline
from pyspark.ml.classification import LogisticRegression as SparkLogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder as SparkOneHotEncoder
from pyspark.ml.regression import LinearRegression as SparkLinearRegression
from pyspark.sql import SparkSession
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Setup
Install required libraries if needed and import them.