# 📡 NYC Wi-Fi Hotspot Regression Project

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error


## 📥 Load the Dataset

In [None]:
# Read the raw hotspot table; the relative path is resolved against the
# notebook's working directory.
wifi_df = pd.read_csv(filepath_or_buffer="NYC_Wifi_Data_Dictionary.csv")

# Show the first five rows as the cell's output.
wifi_df.head(n=5)

Unnamed: 0,OBJECTID,Borough,Type,Provider,Name,Location,Latitude,Longitude,X,Y,...,Neighborhood Tabulation Area (NTA),Council Distrcit,Postcode,BoroCD,Census Tract,BCTCB2010,BIN,BBL,DOITT_ID,"Location (Lat, Long)"
0,10604,4,Limited Free,SPECTRUM,Baisley Pond Park,Park Perimeter,40.67486,-73.78412,1044132.0,185219.892077,...,Springfield Gardens North,28,11434,412,294,294,0,0,1408,"(40.6748599999, -73.7841200005)"
1,10555,4,Limited Free,SPECTRUM,Kissena Park,Park Perimeter,40.74756,-73.81815,1034638.0,211685.217755,...,Flushing,20,11355,407,845,845,0,0,1359,"(40.7475599996, -73.8181499997)"
2,12370,3,Free,Transit Wireless,Grand St (L),Grand St (L),40.711926,-73.94067,1000698.0,198655.90884,...,East Williamsburg,34,11206,301,495,495,0,0,1699,"(40.7119259997, -73.9406699994)"
3,9893,3,Free,Downtown Brooklyn,,125 Court St.,40.689985,-73.991995,986470.0,190656.680416,...,Brooklyn Heights-Cobble Hill,33,11201,302,9,9,3388736,3002777501,298,"(40.6899850001, -73.9919950004)"
4,10169,1,Free,Transit Wireless,Lexington Av-63 St (F),Lexington Av-63 St (F),40.76463,-73.966115,993636.6,217853.888161,...,Upper East Side-Carnegie Hill,4,10065,108,120,120,0,0,599,"(40.7646300002, -73.9661150001)"


## 🧹 Preprocessing & Feature Engineering

In [None]:
# Columns removed up front. DataFrame.drop raises KeyError if any of them is
# missing from the loaded CSV.
columns_to_drop = [
    'OBJECTID', 'Location (Lat, Long)', 'BIN', 'BBL', 'DOITT_ID', 'Activated',
    'BCTCB2010', 'BoroCD', 'BoroCode', 'Census Tract'
]
wifi_cleaned = wifi_df.drop(columns=columns_to_drop)

# The raw preview already shows a 'Borough' column holding numeric codes.
# Renaming 'Borough Name' onto that label would leave two columns that are
# both called 'Borough', which makes wifi_cleaned['Borough'] return a
# DataFrame instead of a Series. Drop the code column first, but only when
# both columns are present, so the borough names become the single
# 'Borough' column.
# NOTE(review): presumably the codes and the names describe the same
# grouping, so no information is lost - confirm against the data dictionary.
if {'Borough', 'Borough Name'}.issubset(wifi_cleaned.columns):
    wifi_cleaned = wifi_cleaned.drop(columns='Borough')

# Rename to the labels used by the rest of the notebook, then discard rows
# that lack any of the categorical values encoded below.
wifi_cleaned = (
    wifi_cleaned
    .rename(columns={'Borough Name': 'Borough', 'Type': 'WiFi_Type'})
    .dropna(subset=['WiFi_Type', 'Provider', 'Location'])
)

categorical_cols = ['Borough', 'City', 'WiFi_Type', 'Provider', 'Location']
wifi_cleaned = wifi_cleaned.astype({col: 'category' for col in categorical_cols})

# Request an explicit numeric dtype for the indicator columns. pandas >= 2.0
# returns bool columns by default, and a later numeric-only selection
# (select_dtypes with np.number) would silently throw every one of them away.
# uint8 matches the pre-2.0 default and keeps the wide 'Location' encoding
# compact.
wifi_encoded = pd.get_dummies(
    wifi_cleaned,
    columns=categorical_cols,
    drop_first=True,
    dtype=np.uint8,
)


## 🎯 Define Features and Target Variable

In [None]:
# Column the regression is asked to predict.
target = 'Latitude'

# Columns excluded from the predictors: the other coordinate columns and
# every column whose name starts with 'Location_'.
cols_to_remove = ['Longitude', 'X', 'Y'] + [
    col for col in wifi_encoded.columns if col.startswith('Location_')
]
features = wifi_encoded.drop(columns=cols_to_remove + [target])

# Keep numeric AND boolean columns. pandas >= 2.0 emits one-hot columns as
# bool by default, and np.number does not cover bool, so selecting on
# np.number alone silently removes every encoded categorical from X.
# Casting to float gives the estimator one homogeneous numeric matrix
# (True/False become 1.0/0.0); remaining object columns are still excluded.
X = features.select_dtypes(include=[np.number, 'bool']).astype(float)
y = wifi_encoded[target]


## 🤖 Model Training and Evaluation

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions: R² and RMSE (square root of the MSE).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

for label, value in (("R² Score:", r2), ("RMSE:", rmse)):
    print(label, value)


R² Score: 0.5135468923994284
RMSE: 0.04729884331786411


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Load dataset
wifi_df = pd.read_csv("NYC_Wifi_Data_Dictionary.csv")

# The raw table already has a 'Borough' column (numeric codes in the preview
# near the top of the notebook). Renaming 'Borough Name' onto the same label
# would create two columns called 'Borough', so wifi_df['Borough'] would
# return a DataFrame and the names would have to be recovered by position.
# Drop the code column first - only when both columns exist - so that
# 'Borough' is a single, unambiguous column of borough names.
if {'Borough', 'Borough Name'}.issubset(wifi_df.columns):
    wifi_df = wifi_df.drop(columns='Borough')
wifi_df = wifi_df.rename(columns={'Borough Name': 'Borough', 'Type': 'WiFi_Type'})

# 'Borough_Name' is the column the borough plots in the following cells use.
wifi_df['Borough_Name'] = wifi_df['Borough']


## 🔹 Wi-Fi Hotspots by Borough

In [None]:
# value_counts() sorts descending, so the bars run from the borough with the
# most hotspots to the one with the fewest.
borough_order = wifi_df['Borough_Name'].value_counts().index

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=wifi_df, x='Borough_Name', order=borough_order, ax=ax)
ax.set_title("Number of Wi-Fi Hotspots by Borough")
ax.set_xlabel("Borough")
ax.set_ylabel("Count")
# Tilt the category labels so long names do not overlap.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


## 🔹 Wi-Fi Type Distribution

In [None]:
# Most common Wi-Fi type first (value_counts() sorts descending).
type_order = wifi_df['WiFi_Type'].value_counts().index

fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=wifi_df, x='WiFi_Type', order=type_order, ax=ax)
ax.set(
    title="Distribution of Wi-Fi Types in NYC",
    xlabel="Wi-Fi Type",
    ylabel="Count",
)
# Tilt the category labels so they stay readable.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


## 🔹 Correlation of Features with Latitude

In [None]:
import numpy as np

# One-hot encode categorical columns to access numeric correlations.
# get_dummies encodes the listed columns directly, so wifi_df itself is left
# unmodified. An explicit numeric dtype is requested because pandas >= 2.0
# returns bool indicator columns by default, and the np.number filter below
# would drop all of them, leaving the encoding without any effect.
categorical_cols = ['Borough', 'City', 'WiFi_Type', 'Provider', 'Location']
wifi_encoded = pd.get_dummies(
    wifi_df,
    columns=categorical_cols,
    drop_first=True,
    dtype=np.uint8,
)

# Numeric columns only. The 'Location_*' indicators (one per distinct
# Location value) are left out, mirroring the feature-selection cell, which
# also discards them; they would otherwise swamp the figure.
numeric_cols = wifi_encoded.select_dtypes(include=[np.number])
keep_mask = [not str(col).startswith('Location_') for col in numeric_cols.columns]
numeric_cols = numeric_cols.loc[:, keep_mask]
corr_matrix = numeric_cols.corr()

# Correlation heatmap: a single column (correlation with Latitude), sorted
# from the most positive to the most negative value.
latitude_corr = corr_matrix[['Latitude']].sort_values(by='Latitude', ascending=False)

# Keep the original 12x8 canvas as a minimum and allow roughly 0.3 inch per
# row so the annotations stay legible when many features are shown.
plt.figure(figsize=(12, max(8, 0.3 * len(latitude_corr))))
sns.heatmap(latitude_corr, annot=True, cmap='coolwarm')
plt.title("Correlation of Features with Latitude")
plt.tight_layout()
plt.show()


## 🔹 Boxplot: Latitude by Borough

In [None]:
# One box per borough showing how hotspot latitudes are spread within it.
fig, ax = plt.subplots(figsize=(9, 6))
sns.boxplot(data=wifi_df, x='Borough_Name', y='Latitude', ax=ax)
ax.set(
    title="Distribution of Wi-Fi Latitude by Borough",
    xlabel="Borough",
    ylabel="Latitude",
)
# Tilt the borough labels so they do not collide.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()
