In [1]:
# Import necessary libraries
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the raw Zomato CSV (expected in the working directory) into a DataFrame
dataset_path = 'Zomato Dataset.csv'
zomato_data = pd.read_csv(dataset_path)

In [None]:
# Preview both ends of the dataset: the first 10 rows, then the last 10 rows
for preview in (zomato_data.head(10), zomato_data.tail(10)):
    print(preview)

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_shape = zomato_data.shape
print("Shape of the dataset:", dataset_shape)

In [None]:
# Show the dtype pandas inferred for every column
print("Data Types and Info:")
column_dtypes = zomato_data.dtypes
print(column_dtypes)

In [None]:
# List the column labels exactly as they were read from the CSV
print("Column Names:")
column_index = zomato_data.columns
print(column_index)

In [None]:
# Count missing entries per column and print only the columns that have any
print("Missing Values:")
missing_values = zomato_data.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

In [8]:
# Remove every row that has a missing value in any column.
# NOTE(review): this drops all incomplete rows, whatever share of the data they represent.
zomato_data = zomato_data.dropna()

In [None]:
# Report how many fully duplicated rows exist, then keep only the first occurrence of each
duplicate_row_count = zomato_data.duplicated().sum()
print("Duplicate Rows:", duplicate_row_count)
zomato_data = zomato_data.drop_duplicates(keep='first')

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) as produced by DataFrame.describe()
print("Summary Statistics for Numerical Columns:")
numeric_summary = zomato_data.describe()
print(numeric_summary)

In [11]:
# Handle missing values with specific strategies.
# The results are assigned back to the frame instead of calling fillna(inplace=True) on a
# selected column: that chained in-place form is deprecated and, with pandas Copy-on-Write
# enabled, only updates a temporary object and leaves zomato_data unchanged.
column_fill_values = {
    # Numeric person attributes: replace gaps with the column mean
    'Delivery_person_Age': zomato_data['Delivery_person_Age'].mean(),
    'Delivery_person_Ratings': zomato_data['Delivery_person_Ratings'].mean(),
    # A missing multiple_deliveries entry is filled with 1
    'multiple_deliveries': 1,
}
zomato_data = zomato_data.fillna(column_fill_values)

# Anything still missing in the remaining columns is labelled 'Unknown'
zomato_data = zomato_data.fillna('Unknown')

In [None]:
# Per-column count of missing values remaining after the cleaning steps
zomato_data.isna().sum()

In [13]:
# Feature Engineering: great-circle distance between restaurant and delivery location.
# The coordinates are latitude/longitude in degrees, so a plain Euclidean difference yields
# degrees rather than kilometres; the haversine formula gives the distance in km on a sphere.
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius


def haversine_km(lat_a, lon_a, lat_b, lon_b):
    """Return the haversine distance in kilometres between two sets of points.

    Parameters
    ----------
    lat_a, lon_a : array-like or scalar
        Latitude and longitude of the first point(s), in decimal degrees.
    lat_b, lon_b : array-like or scalar
        Latitude and longitude of the second point(s), in decimal degrees.

    Returns
    -------
    Same shape as the broadcast inputs: distance in kilometres.
    """
    lat_a_rad = np.radians(lat_a)
    lat_b_rad = np.radians(lat_b)
    half_dlat = (lat_b_rad - lat_a_rad) / 2.0
    half_dlon = np.radians(lon_b - lon_a) / 2.0
    hav = np.sin(half_dlat) ** 2 + np.cos(lat_a_rad) * np.cos(lat_b_rad) * np.sin(half_dlon) ** 2
    # Clip guards against floating-point overshoot slightly outside [0, 1] before sqrt/arcsin
    return 2.0 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))


zomato_data['Distance_km'] = haversine_km(
    zomato_data['Restaurant_latitude'],
    zomato_data['Restaurant_longitude'],
    zomato_data['Delivery_location_latitude'],
    zomato_data['Delivery_location_longitude'],
)

In [14]:
# Parse the Order_Date strings (day-month-year) into pandas datetime values
parsed_order_dates = pd.to_datetime(zomato_data['Order_Date'], format='%d-%m-%Y')
zomato_data['Order_Date'] = parsed_order_dates

In [None]:
# Clean column names for easier SQL handling: spaces become underscores and parentheses
# are removed. regex=False makes every replacement a literal substitution, so the result
# does not depend on the pandas version's default for `regex` ('(' and ')' are regex
# metacharacters).
zomato_data.columns = (
    zomato_data.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


In [None]:
# SQL PART: Store cleaned data into an SQLite database in the specified folder
# NOTE(review): this is an absolute, machine-specific path; consider a configurable,
# project-relative location so the notebook runs on other machines.
DB_PATH = r"C:\Users\amrut\Zomato Prediction Project\SQL\zomato.db"  # raw string keeps the Windows backslashes literal
conn = sqlite3.connect(DB_PATH)
try:
    # Replace any existing table so re-running the cell does not append duplicate rows
    zomato_data.to_sql('zomato_data', conn, if_exists='replace', index=False)

    # Check if the data was successfully saved by reading a few rows back
    query = "SELECT * FROM zomato_data LIMIT 5;"
    df_sql = pd.read_sql(query, conn)
    print(df_sql)
finally:
    # Always release the database handle, even if the write or the read-back raises
    conn.close()


In [None]:
# Exploratory Data Analysis (EDA)
# a. Histogram (30 bins) with KDE overlay of the delivery time column
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(zomato_data['Time_taken_min'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Delivery Time (min)')
plt.show()

In [None]:
# b. Histogram (20 bins) with KDE overlay of the delivery person's age
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(zomato_data['Delivery_person_Age'], bins=20, kde=True, color='lightgreen', ax=ax)
ax.set_title('Distribution of Delivery Person Age')
plt.show()

In [None]:
# c. Categorical Variables: number of deliveries per weather condition
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Weather_conditions', data=zomato_data, palette='Set2', ax=ax)
ax.set_title('Deliveries by Weather Conditions')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

In [None]:
# Number of deliveries per road traffic density level
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Road_traffic_density', data=zomato_data, palette='Set1', ax=ax)
ax.set_title('Deliveries by Traffic Density')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

In [None]:
# Correlation Heatmap
# Only numeric columns are correlated: the frame also holds string and datetime columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(10,8))
numeric_columns = zomato_data.select_dtypes(include='number')
corr_matrix = numeric_columns.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [22]:
# Data Analytics on Different Cases
# 1. Delivery Person Performance Analysis:
#    mean rating and mean delivery time per delivery person, one row per ID
performance_aggregations = {
    'Delivery_person_Ratings': 'mean',
    'Time_taken_min': 'mean',
}
performance_by_person = zomato_data.groupby('Delivery_person_ID').agg(performance_aggregations)
zomato_data_performance = performance_by_person.reset_index()


In [None]:
# Rank delivery persons: shortest mean delivery time first, ties broken by higher mean rating
zomato_data_performance = zomato_data_performance.sort_values(
    by=['Time_taken_min', 'Delivery_person_Ratings'],
    ascending=[True, False],
)
print("Top 5 Delivery Person Performance:")
top_performers = zomato_data_performance.head()
print(top_performers)


In [None]:
# 2. Multiple Deliveries Impact: bar plot of delivery time per multiple_deliveries value
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='multiple_deliveries', y='Time_taken_min', data=zomato_data, palette='coolwarm', ax=ax)
ax.set_title('Impact of Multiple Deliveries on Time Taken')
plt.show()

In [None]:
# 3. Demand Forecasting
# Make sure 'Order_Date' is datetime; values that cannot be parsed become NaT
order_dates = pd.to_datetime(zomato_data['Order_Date'], errors='coerce')
zomato_data['Order_Date'] = order_dates

# Number of deliveries per calendar date, as a two-column frame (Order_Date, Deliveries)
deliveries_per_day = zomato_data.groupby('Order_Date').size()
zomato_data_demand = deliveries_per_day.reset_index(name='Deliveries')

# Inspect the first few dates
print(zomato_data_demand.head())





In [None]:
# Line chart of the number of deliveries per order date
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='Order_Date', y='Deliveries', data=zomato_data_demand, marker='o', ax=ax)
ax.set_title('Daily Delivery Demand Forecasting')
plt.xticks(rotation=45)  # tilt the date labels
ax.set_xlabel('Order Date')
ax.set_ylabel('Number of Deliveries')
plt.show()


In [None]:
# 4. Impact of Weather and Traffic on Delivery Times
# Bar plot of delivery time for each weather condition
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Weather_conditions', y='Time_taken_min', data=zomato_data, palette='Set2', ax=ax)
ax.set_title('Impact of Weather on Delivery Time')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()


In [None]:
# Bar plot of delivery time for each road traffic density level
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Road_traffic_density', y='Time_taken_min', data=zomato_data, palette='Set1', ax=ax)
ax.set_title('Impact of Traffic on Delivery Time')
plt.show()


In [29]:
# 9. CITY-WISE PERFORMANCE ANALYSIS
# Mean delivery time and mean delivery-person rating per city, one row per city
city_aggregations = {
    'Time_taken_min': 'mean',
    'Delivery_person_Ratings': 'mean',
}
performance_by_city = zomato_data.groupby('City').agg(city_aggregations)
zomato_data_city_performance = performance_by_city.reset_index()

In [None]:
# Horizontal bar chart: mean delivery time for each city
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Time_taken_min', y='City', data=zomato_data_city_performance, palette='Set3', ax=ax)
ax.set_title('City-wise Average Delivery Time')
ax.set_xlabel('Average Time Taken (min)')
ax.set_ylabel('City')
plt.show()

In [None]:

# 10. DELIVERY VEHICLE CONDITION IMPACT

# Bar plot of delivery time per vehicle condition value
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Vehicle_condition', y='Time_taken_min', data=zomato_data, palette='Set1', ax=ax)
ax.set_title('Impact of Vehicle Condition on Delivery Time')
ax.set_ylabel('Average Time Taken (min)')
plt.show()

# The same comparison as numbers: mean delivery time per vehicle condition
mean_time_by_vehicle = zomato_data.groupby('Vehicle_condition')['Time_taken_min'].mean()
print(mean_time_by_vehicle)

In [None]:
# 11. FESTIVAL IMPACT ON DELIVERY EFFICIENCY

# Bar plot of delivery time per Festival value
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Festival', y='Time_taken_min', data=zomato_data, palette='Set2', ax=ax)
ax.set_title('Festival Impact on Delivery Time')
ax.set_ylabel('Average Time Taken (min)')
plt.show()

# The same comparison as numbers: mean delivery time per Festival value
mean_time_by_festival = zomato_data.groupby('Festival')['Time_taken_min'].mean()
print(mean_time_by_festival)

In [33]:
# Close the database connection
# NOTE(review): `conn` is already closed at the end of the SQL cell that created it, and no
# cell in between reopens it, so this second close() is redundant (presumably a harmless
# no-op for sqlite3 - confirm, or drop this cell).
conn.close()

In [34]:
# Delivery Time Prediction

# Predictor columns (numeric and categorical) and the regression target
regression_feature_columns = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Weather_conditions',
    'Road_traffic_density',
    'Vehicle_condition',
    'multiple_deliveries',
    'City',
]
X = zomato_data[regression_feature_columns]
y = zomato_data['Time_taken_min']

In [35]:
# One-hot encode categorical variables
categorical_features = ['Weather_conditions', 'Road_traffic_density', 'City']

# A dense array is required for the DataFrame below. The keyword that requests it was
# renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was later
# removed, so try the new spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Reuse X's index: X still carries the row labels left after dropna/drop_duplicates, and a
# fresh 0..n-1 index here would make the later concat(axis=1) pair up unrelated rows and
# introduce NaNs.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[categorical_features]),
    columns=encoder.get_feature_names_out(categorical_features),
    index=X.index,
)

In [36]:
# Swap the raw categorical columns for their one-hot encoded counterparts
X_numeric_only = X.drop(columns=['Weather_conditions', 'Road_traffic_density', 'City'])
X = pd.concat([X_numeric_only, X_encoded], axis=1)

In [None]:
# Total count of missing cells in the feature matrix and in the target
missing_cells_in_X = X.isna().sum().sum()
missing_cells_in_y = y.isna().sum()
print("Number of missing values in X:", missing_cells_in_X)
print("Number of missing values in y:", missing_cells_in_y)


In [38]:
# Keep only complete feature rows, then select the matching target entries by row label
X = X.dropna()
surviving_rows = X.index
y = y.loc[surviving_rows]


In [None]:
# Sanity check: X and y should contain the same number of rows
n_feature_rows = len(X)
n_target_rows = len(y)
print(f"Length of X: {n_feature_rows}")
print(f"Length of y: {n_target_rows}")


In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Model 1: Linear Regression
# Fit on the training split, predict the held-out split, report MAE and R^2
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)
print(f"Linear Regression MAE: {lr_mae}")
print(f"Linear Regression R^2: {lr_r2}")

In [None]:
# Model 2: Random Forest Regressor
# Default hyperparameters with a fixed seed; evaluated with the same metrics as Model 1
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)
print(f"Random Forest MAE: {rf_mae}")
print(f"Random Forest R^2: {rf_r2}")

In [43]:
# Model 3: Decision Tree Classifier (classification variant of the task)
# Binary label: True when the delivery time is at or below the median delivery time
median_delivery_time = zomato_data['Time_taken_min'].median()
zomato_data['Fast_Delivery'] = zomato_data['Time_taken_min'].le(median_delivery_time)

# Same predictor columns as the regression models
classification_feature_columns = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Weather_conditions',
    'Road_traffic_density',
    'Vehicle_condition',
    'multiple_deliveries',
    'City',
]
X_classification = zomato_data[classification_feature_columns]
y_classification = zomato_data['Fast_Delivery']

In [44]:
# One-hot encode the categorical predictors of the classification feature matrix
class_categorical_features = ['Weather_conditions', 'Road_traffic_density', 'City']

# Reuse X_classification's index: it carries the row labels of the filtered zomato_data, and
# a fresh 0..n-1 index would make concat(axis=1) pair up unrelated rows and introduce NaNs.
X_encoded_class = pd.DataFrame(
    encoder.fit_transform(X_classification[class_categorical_features]),
    columns=encoder.get_feature_names_out(class_categorical_features),
    index=X_classification.index,
)

# Replace the raw categorical columns with the encoded ones
X_classification = X_classification.drop(columns=class_categorical_features)
X_classification = pd.concat([X_classification, X_encoded_class], axis=1)

In [None]:
# Sanity check: classification features and labels should have the same number of rows
n_class_feature_rows = len(X_classification)
n_class_label_rows = len(y_classification)
print("Length of X_classification:", n_class_feature_rows)
print("Length of y_classification:", n_class_label_rows)


In [None]:
# One-hot encoding as before.
# This cell repeats the earlier classification encoding step. Once that step has run, the raw
# categorical columns no longer exist and selecting them raises KeyError, so the encoding is
# applied only while those columns are still present. That makes the cell safe to run in
# either state.
raw_categorical_features = ['Weather_conditions', 'Road_traffic_density', 'City']
if set(raw_categorical_features).issubset(X_classification.columns):
    # Keep X_classification's index so concat(axis=1) aligns each encoded row with its source row
    X_encoded_class = pd.DataFrame(
        encoder.fit_transform(X_classification[raw_categorical_features]),
        columns=encoder.get_feature_names_out(raw_categorical_features),
        index=X_classification.index,
    )
    X_classification = X_classification.drop(columns=raw_categorical_features)
    X_classification = pd.concat([X_classification, X_encoded_class], axis=1)

In [46]:
# Keep only complete feature rows
X_classification = X_classification.dropna()

# Select the labels for exactly those rows, matched by row label
retained_rows = X_classification.index
y_classification = y_classification.loc[retained_rows]


In [None]:
print(f"Length of X_classification after cleaning: {len(X_classification)}")
print(f"Length of y_classification after cleaning: {len(y_classification)}")


In [48]:
# Train-test split for classification: 20% held out, fixed seed for reproducibility
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_classification,
    y_classification,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Decision Tree Classifier
# Fit on the classification training split and report accuracy on the held-out split
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_class, y_train_class)
y_pred_dt = dt_model.predict(X_test_class)
dt_accuracy = accuracy_score(y_test_class, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy}")

In [None]:
# Model Evaluation
print("Linear Regression - MAE:", mean_absolute_error(y_test, y_pred_lr))
print("Random Forest - MAE:", mean_absolute_error(y_test, y_pred_rf))
# The classifier's predictions are scored against the classification labels (y_test_class);
# y_test holds the regression target (delivery time) and is not comparable to them.
print("Decision Tree - Accuracy:", accuracy_score(y_test_class, y_pred_dt))

# Grid Search for best parameters on RandomForest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None]
}

# 5-fold cross-validated search over param_grid. This definition must be active:
# the fit call below raises NameError without it.
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model from grid search, evaluated on the held-out regression split
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
print(f"Best Random Forest MAE: {mean_absolute_error(y_test, y_pred_best_rf)}")
print(f"Best Random Forest R^2: {r2_score(y_test, y_pred_best_rf)}")

In [51]:
# Write the processed DataFrame to a CSV file in the working directory, omitting the row index
cleaned_csv_path = 'zomato_cleaned_dataset.csv'
zomato_data.to_csv(cleaned_csv_path, index=False)