In [91]:
import warnings
warnings.filterwarnings("ignore")

In [92]:
import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point

In [93]:
df_full = pd.read_csv("data/zomato_df_final_data.csv")

In [94]:
gdf = geopandas.read_file("data/sydney.geojson")

In [95]:
# haven't decided what to do with cuisine
df = df_full.drop(columns = ["address", "link", "phone", "title"], axis = 1)

In [96]:
df.dtypes

cost             float64
cuisine           object
lat              float64
lng              float64
rating_number    float64
rating_text       object
subzone           object
type              object
votes            float64
groupon             bool
color             object
cost_2           float64
cuisine_color     object
dtype: object

In [97]:
# for regression 1 we wont use rating_text
df = df.drop(columns = ["rating_text"], axis = 1)

In [98]:
len(df.cuisine_color.unique())
# not sure why we have this

5

In [99]:
# do some feature engineering
df.groupon = df.groupon.astype(int)

In [100]:
df = df.drop(columns = ["color", "cuisine_color"], axis = 1)

In [101]:
df.isna().sum()

cost              346
cuisine             0
lat               192
lng               192
rating_number    3316
subzone             0
type               48
votes            3316
groupon             0
cost_2            346
dtype: int64

In [102]:
# bringing address to see in helps
df["address"] = df_full["address"]

In [103]:
temp = df[df.lat.isna()]
temp.address

156       Level 4 & 5, Citymark, 683-689 George Street,...
317       Level 3, North End, QVB, 455 George Street, \...
438       The Cannery, Shop 1/34 Morley Avenue, Roseber...
468       Shop 5, 302 Church Street (enter via Phillip ...
573                    Lower Ground, 55 York Street, \tCBD
                               ...                        
10387     Ground Level, Forestville Centre, Starkey Str...
10477     Food Court, Greenway Plaza, 210 Church Street...
10483                  14 Bunlunga Lane, Mona Vale, Sydney
10491     Shop 2, Coogee Village Shopping Centre, 184 C...
10499            136 Sailors Bay Road, Northbridge, Sydney
Name: address, Length: 192, dtype: object

In [104]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

In [105]:
# i = 0
# for address in temp.address : 
#     print(f"{i}. address : {address}")
#     i += 1
#     try : 
#         geolocator = Nominatim(user_agent="address_to_ssc_code")
#         location = geolocator.geocode(address)
#         if location:
#             latitude, longitude = location.latitude, location.longitude
#             print(f"Latitude: {latitude}, Longitude: {longitude}")
#         else:
#             print("Address not found.")
#     except GeocoderTimedOut:
#         print("Geocoding service timed out. Address not found.")
#     except Exception as e:
#         print(f"An error occurred: Max retries exceeded")

In [106]:
# missing_suburbs = temp.address.apply(lambda x : x.split(",")[-1].strip())
# missing_suburbs = [suburb.strip() for suburb in missing_suburbs]

In [107]:
# missing_suburbs.unique()

In [108]:
# df.loc[df['lat'].isna(), "address"] = df['address'].str.split().str[-1].str.strip()


In [109]:
# subzone is not suburb or at leats it's not correct for 1359 entries
# suburb name dont have comma
len(df[df.subzone.str.contains(",")])

1359

In [110]:
df['subzone'] = df['subzone'].apply(lambda x: x.split(',')[-1].strip() if ',' in x else x)


In [111]:
len(df_full.subzone.unique())

572

In [112]:
len(df.subzone.unique())

339

In [128]:
df.isna().sum()

cost              0
cuisine           0
rating_number     0
subzone           0
type             21
votes             0
groupon           0
cost_2            0
dtype: int64

In [114]:
df = df.drop(columns = ["lat", "lng", "address"])

In [115]:
# will drop row if rating is missing for now

In [116]:
df.dropna(subset=['rating_number'], inplace=True)

In [118]:
df[df.cost.isna()]

Unnamed: 0,cost,cuisine,rating_number,subzone,type,votes,groupon,cost_2
396,,['Japanese'],3.9,CBD,['Casual Dining'],44.0,0,
793,,"['Chinese', 'Sichuan']",3.8,Chinatown,['Casual Dining'],40.0,0,
918,,"['Sushi', 'Japanese']",3.5,Crows Nest,['Casual Dining'],18.0,0,
983,,['Korean BBQ'],3.6,Lidcombe,['Casual Dining'],56.0,0,
1042,,"['Japanese', 'Tea']",3.4,Redfern,['Casual Dining'],29.0,0,
...,...,...,...,...,...,...,...,...
8653,,['Chinese'],3.0,Eastwood,['Casual Dining'],14.0,0,
8733,,"['Indian', 'Pakistani']",2.8,Liverpool,['Casual Dining'],6.0,0,
8865,,['Modern Australian'],2.7,Liverpool,['Café'],6.0,0,
8955,,['Chinese'],2.7,Frenchs Forest,['Casual Dining'],8.0,0,


In [126]:
df['cost'] = df.groupby('subzone')['cost'].transform(lambda x: x.fillna(x.mean()))
df['cost_2'] = df.groupby('subzone')['cost_2'].transform(lambda x: x.fillna(x.mean()))

In [127]:
df.loc[793]

cost                          51.441718
cuisine          ['Chinese', 'Sichuan']
rating_number                       3.8
subzone                       Chinatown
type                  ['Casual Dining']
votes                              40.0
groupon                               0
cost_2                         5.355255
Name: 793, dtype: object

In [129]:
df = df.drop(columns = ["type", "cuisine"])

In [131]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['subzone_encoded'] = label_encoder.fit_transform(df['subzone'])
df.drop(columns = ["subzone"], axis = 1)


cost             float64
rating_number    float64
subzone           object
votes            float64
groupon            int64
cost_2           float64
dtype: object

In [130]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# X,y
X = df.drop(columns=['rating_number'])  # Features
y = df['rating_number']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model_regression_1 = LinearRegression()
model_regression_1.fit(X_train, y_train)

# predictions
y_pred = model_regression_1.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


ValueError: could not convert string to float: 'Willoughby'