<a href="https://colab.research.google.com/github/AlwinJose-21/Data-Science-Projects/blob/main/SwiggyFoodAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Read the Swiggy restaurant CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='/content/swiggy_file.csv')
df

In [None]:
# Convert 'Average Price' from text such as '₹50 for one' to the integer 50.
#  - raw string: '\d' inside a normal string literal is an invalid escape
#  - expand=False: get a Series back instead of a one-column DataFrame
#  - astype(str): lets this cell be re-run after the column is already numeric
# Values that contain no digits (including missing values) become 0.
df['Average Price'] = (
    df['Average Price']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .fillna(0)
    .astype(int)
)

In [None]:
# Display the frame to check the converted 'Average Price' column.
df

In [None]:
# First two rows.
df.head(2)

In [None]:
# Last two rows.
df.tail(2)

In [None]:
# Ratings recorded as '--' are replaced with the string '0'
# (the column is converted to numbers in a later cell).
df['Rating'] = df['Rating'].replace(to_replace='--', value='0')

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# How many rows each restaurant name appears in.
df['Restaurant Name'].value_counts()

In [None]:
# In 'Number of Ratings', the placeholder 'Too Few Ratings' is replaced with the string '0'
df['Number of Ratings'] = df['Number of Ratings'].replace(to_replace='Too Few Ratings', value='0')

In [None]:
# Drop the first '+' and everything that follows it in each value.
df['Number of Ratings'] = df['Number of Ratings'].str.replace(pat=r'\+.*', repl='', regex=True)

In [None]:
# Display the frame to check the trimmed 'Number of Ratings' column.
df

In [None]:
# Remove every remaining non-digit character, then display the frame.
# NOTE(review): a suffix such as 'K' would be dropped silently here — confirm
# the raw values never use one.
df['Number of Ratings'] = df['Number of Ratings'].str.replace(pat=r'\D', repl='', regex=True)
df

In [None]:
# Replace newline characters inside 'Offer Name' with commas, then display the frame.
df['Offer Name'] = df['Offer Name'].str.replace(pat='\n', repl=',')
df

In [None]:

# Count restaurants per 'Pure Veg' value and draw the split as a pie chart.
pure_veg_counts = df['Pure Veg'].value_counts().reset_index()
pure_veg_counts.columns = ['Pure Veg', 'Count']

fig = px.pie(
    pure_veg_counts,
    names='Pure Veg',
    values='Count',
    title='Distribution of Pure Vegetarian Restaurants',
    color_discrete_sequence=px.colors.qualitative.Dark2,
    # `labels` maps *column names* to the text shown in hover/legend titles;
    # it does not rename category values, so it is keyed by the two columns
    # actually plotted.
    labels={'Pure Veg': 'Pure Veg', 'Count': 'Restaurants'},
    template='seaborn'
)

fig.show()


In [None]:
# Bucket the numeric rating into three bands and plot the share of each band.
def _rating_band(value):
    """Return '0-3', '3-4' or '4+' for a rating; a missing rating counts as '0-3'."""
    if pd.isna(value) or value < 3:
        return '0-3'
    if value < 4:
        return '3-4'
    return '4+'


# Non-numeric ratings become NaN.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating Category'] = df['Rating'].apply(_rating_band)
rating_counts = df['Rating Category'].value_counts()
colors = ['gold', 'lightcoral', 'lightskyblue']

plt.figure(figsize=(4, 4))
plt.pie(
    rating_counts,
    labels=rating_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
plt.title('Distribution of Ratings')
plt.show()

In [None]:
# Hard-coded list of the unique 'Average Price' values, to be grouped into
# price buckets. Note: this counts distinct price points, not restaurants.
unique_prices = [50, 100, 150, 200, 250, 300, 500, 400, 350, 450, 40, 0, 550, 48, 1, 600, 32, 900, 750, 800, 650, 10, 12, 850, 2]

# Price buckets as label -> membership test. The buckets are contiguous
# (closed on the left, open on the right) so every price falls into exactly
# one of them. Previously 50-99 and exactly 700 matched no bucket.
price_ranges = {
    'Less than 50': lambda x: x < 50,
    '50-100': lambda x: 50 <= x < 100,
    '100-200': lambda x: 100 <= x < 200,
    '200-300': lambda x: 200 <= x < 300,
    '300-400': lambda x: 300 <= x < 400,
    '400-500': lambda x: 400 <= x < 500,
    '500-700': lambda x: 500 <= x < 700,
    '700 and above': lambda x: x >= 700
}
# Number of distinct prices that fall into each bucket.
price_counts = {range_name: sum(price_range(price) for price in unique_prices) for range_name, price_range in price_ranges.items()}


# Draw the per-bucket counts as a pie chart.
labels = list(price_counts)
sizes = list(price_counts.values())

fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

ax.set_title('Distribution of Average Prices')
plt.show()

In [None]:
# Box plot of 'Average Price' within each rating category.
sns.boxplot(data=df, x="Rating Category", y="Average Price")
plt.title("Average Price by Rating Category")
plt.show()


In [None]:
def remove_outliers(df, column, s=1.5):
  """Return the rows of `df` whose `column` value lies inside the IQR fences.

  The fences are Q1 - s*IQR and Q3 + s*IQR (both inclusive), where Q1/Q3 are
  the 25th/75th percentiles of `column`. The input frame is not modified.
  """
  values = df[column]
  first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
  spread = third_quartile - first_quartile

  fence_low = first_quartile - s * spread
  fence_high = third_quartile + s * spread

  inside_fences = (values >= fence_low) & (values <= fence_high)
  return df[inside_fences]

# Same box plot, restricted to rows whose 'Average Price' is inside the IQR fences.
df_cleaned = remove_outliers(df, column='Average Price')
sns.boxplot(data=df_cleaned, x="Rating Category", y="Average Price")
plt.title("Average Price by Rating Category (Outliers Removed)")
plt.show()


In [None]:
# Collect every distinct cuisine name that appears in the 'Cuisine' column.
cuisine_column = df['Cuisine']

# Each entry is a ', '-separated list of cuisines; missing entries are skipped.
all_cuisines = [entry.split(', ') for entry in cuisine_column if pd.notna(entry)]
unique_cuisines = set().union(*all_cuisines)

print("All Possible Cuisines:", unique_cuisines)

In [None]:
# Horizontal bar chart of the 15 most frequent cuisines.
cuisine_counts = (
    df['Cuisine']
    .str.split(', ')
    .explode()          # one row per individual cuisine
    .value_counts()
)
top_15_cuisines = cuisine_counts.iloc[:15]
plot_data = pd.DataFrame({
    'Cuisine': top_15_cuisines.index,
    'Frequency': top_15_cuisines.values,
})
colors = px.colors.qualitative.Set3

fig = px.bar(
    plot_data,
    x='Frequency',
    y='Cuisine',
    color='Cuisine',
    color_discrete_sequence=colors,
    orientation='h',
    labels={'Frequency': 'Frequency'},
    title='Top 15 Cuisines',
    width=800,
    height=500,
)

fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Make sure 'Rating' is numeric (non-numeric entries become NaN) and keep a
# handle on the price column for the scatter plot.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
average_price = df['Average Price']

# Define rating categories
def categorize_rating(rating):
    """Map a numeric rating to one of the labels '<3', '3-4' or '4+'.

    A missing rating (NaN/None) is placed in the lowest band '<3'.
    """
    # NaN compares False against everything, so without this guard a missing
    # rating would fall through both range checks and be labelled '4+'.
    if pd.isna(rating) or rating < 3:
        return '<3'
    if rating < 4:
        return '3-4'
    return '4+'
# Label every row with its rating category.
df['Rating Category'] = df['Rating'].apply(categorize_rating)

# One fixed colour per category.
palette = dict(zip(('<3', '3-4', '4+'), ('red', 'orange', 'green')))

plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=average_price,
    y=df['Rating'],
    hue=df['Rating Category'],
    palette=palette,
    alpha=0.7,
)
plt.title('Relation between Average Price and Rating')
plt.xlabel('Average Price')
plt.ylabel('Rating')
plt.legend(title='Rating Category')
plt.show()

In [None]:
# Display the frame, which now includes the 'Rating Category' column.
df

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill missing ratings with the most frequent rating.
# The result is assigned back rather than using
# df['Rating'].fillna(..., inplace=True): that form works on an intermediate
# object and does not update `df` under pandas Copy-on-Write.
df['Rating'] = df['Rating'].fillna(df['Rating'].mode()[0])
df['Rating']

In [None]:
# Column names of the frame.
df.columns

In [None]:
# Five randomly chosen rows, transposed so each column name is shown as a row.
df.sample(5).T

In [None]:
# Fill missing rating counts with 1.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the selected column, which does not update `df` under pandas Copy-on-Write.
df['Number of Ratings'] = df['Number of Ratings'].fillna(1)
df['Number of Ratings']

In [None]:
# Store the rating counts as 64-bit integers.
df['Number of Ratings'] = df['Number of Ratings'].astype('int64')

In [None]:
# Fill missing offer names with the most frequent offer name.
# mode() returns a Series; passing it straight to fillna aligns it on the
# index, so only rows whose index label appears in that Series (label 0, ...)
# would be filled. Use the first mode as a scalar, and assign the result back
# instead of relying on a chained inplace call.
offer_modes = df['Offer Name'].mode()
if not offer_modes.empty:  # empty when the whole column is missing
    df['Offer Name'] = df['Offer Name'].fillna(offer_modes.iloc[0])
df['Offer Name']

In [None]:
# Preview of the frame with the 'Rating' column removed
# (`x` is reassigned in a later cell).
x = df.drop(['Rating'], axis=1)
x

In [None]:
# Number of missing values still left in each column.
df.isna().sum()

In [None]:
# Fill the remaining gaps with one value per column.
# A single DataFrame.fillna with a column -> value mapping replaces the six
# chained `df[col].fillna(..., inplace=True)` calls, which do not update `df`
# under pandas Copy-on-Write.
df = df.fillna({
    'Number of Offers': 2.0,
    'Area': 'Circular Road',
    'Pure Veg': 'No',
    'Location': 'Bhopal',
    'Offer Name': 'None',
    'Cuisine': df['Cuisine'].mode()[0],  # most frequent cuisine string
})

In [None]:
# Re-check the per-column missing-value counts after filling.
df.isna().sum()

In [None]:
# Column dtypes and non-null counts after filling.
df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label-encode several columns of a DataFrame
def label_encode_multiple(df, columns):
    """Replace each listed column of `df` with its LabelEncoder integer codes.

    The frame is modified in place and also returned. One encoder object is
    reused; it is re-fitted for every column.
    """
    encoder = LabelEncoder()
    for name in columns:
        encoded = encoder.fit_transform(df[name])
        df[name] = encoded
    return df

# Columns to convert to integer codes.
label_encode_columns = [
    'Restaurant Name',
    'Cuisine',
    'Offer Name',
    'Area',
    'Pure Veg',
    'Location',
    'Rating Category',
]
df = label_encode_multiple(df, columns=label_encode_columns)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardization of the feature columns only.
# The target 'Rating' is left in its original units so that the MAE/RMSE
# reported by the models can be read as rating points; previously the whole
# frame, target included, was standardized.
# NOTE(review): the scaler is still fitted on all rows before the train/test
# split; fitting it on the training rows only would avoid that leakage.
feature_columns = [col for col in df.columns if col != 'Rating']
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df[feature_columns])
df = pd.DataFrame(
    df_standardized, columns=feature_columns, index=df.index
).assign(Rating=df['Rating'])

In [None]:
# Feature matrix: everything except the target 'Rating' and
# 'Rating Category'. The category is computed directly from 'Rating', so
# keeping it would leak the target into the features.
x = df.drop(columns=['Rating', 'Rating Category']).values
x

In [None]:
# Target vector: the 'Rating' column as a NumPy array.
y = df['Rating'].to_numpy()
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
x_train

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid-search a linear regression with 5-fold cross-validation, then score the
# best estimator on the held-out test set.
param_grid = dict(
    fit_intercept=[True, False],
    copy_X=[True, False],
)

linear_regression = LinearRegression()
grid_search = GridSearchCV(estimator=linear_regression, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Test-set metrics
y_pred = best_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

for report_line in (
    f'R-squared: {r2:.2f}',
    f'Mean Absolute Error (MAE): {mae:.2f}',
    f'Root Mean Squared Error (RMSE): {rmse:.2f}',
):
    print(report_line)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the random forest model.
# random_state is fixed so the fitted forests, the selected hyper-parameters
# and the reported metrics are the same on every run.
random_forest = RandomForestRegressor(random_state=42)

# Define the hyperparameter grid (3 * 3 * 3 * 3 = 81 candidates)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search cross-validation.
# n_jobs=-1 runs the 81 candidates x 5 folds on all available CPU cores; it
# affects run time only, not the results.
grid_search = GridSearchCV(random_forest, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Get the best model (refitted on the whole training set by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'R-squared: {r2:.2f}')
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
