<div style="background-color:#e2f0d9; padding:15px; border-radius:15px; color:#003366; font-family:Arial; font-size:18px;">

<span style="font-size:30px; font-weight:bold;">`libraries` Import, `Data` Import & `MYSQL` Setup</span><br><br>
- Import libraries
- Load CSVs
- Create in-memory MYSQL database
- Load tables into SQL
- View schema, count rows
</div>

<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:17px;">
<b>Import libraries</b>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from tabulate import tabulate
from sqlalchemy import create_engine, text, Table, Column, Integer, Float, String, MetaData, ForeignKey

<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:17px;">
<b>Data import (Load CSVs)</b>

In [2]:
menu = pd.read_csv('Uber_Eats_USA_Restaurants_and_Menus/menus.csv')
restro = pd.read_csv('Uber_Eats_USA_Restaurants_and_Menus/restaurants.csv')

In [3]:
restro.isnull().sum()

id                  0
position            0
name                0
score           28167
ratings         28167
category           85
price_range     10617
full_address      453
zip_code          517
lat                 0
lng                 0
dtype: int64

In [4]:
menu.isnull().sum()

restaurant_id          0
category               0
name                   4
description      1452145
price                  0
dtype: int64

In [5]:
len(restro), len(menu)

(63469, 5117217)

In [None]:
# Remove ' USD' from the price column 
menu['price'] = menu['price'].str.replace(' USD', '', regex=False)
# Convert to float
menu['price'] = menu['price'].astype(float)

In [None]:
# Dictionary of 2-letter codes to full state names
state_code_to_name = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
    'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire',
    'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina',
    'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
    'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee',
    'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
    'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
    'DC': 'District of Columbia', 'PR': 'Puerto Rico'
}

# Reverse mapping: full names (uppercase) to canonical form
state_fullnames_upper = {name.upper(): name for name in state_code_to_name.values()}

def extract_state_name(address):
    if not isinstance(address, str):
        return None
    parts = [part.strip() for part in address.split(',')]
    for part in parts:
        upper_part = part.upper()
        
        # Case 1: If full name is present
        if upper_part in state_fullnames_upper:
            return state_fullnames_upper[upper_part]
        
        # Case 2: If 2-letter code is present
        if len(part) == 2 and upper_part in state_code_to_name:
            return state_code_to_name[upper_part]
        
    return None

restro['state'] = restro['full_address'].apply(extract_state_name)

In [6]:
menu = menu[:50212]
restro = restro[:687]

In [7]:
len(restro), len(menu)

(687, 50212)

<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:25px;">
<b>Create in-memory MYSQL database</b>

In [8]:
from sqlalchemy import create_engine, Table, Column, Integer, BigInteger, Text, Float, MetaData, ForeignKey

# 1. MySQL connection details
username = "root"
password = "password"
host     = "localhost"
port     = "3306"
database = "sql_project_1"

# 2. Create engine for MySQL
engine = create_engine(f"mysql+pymysql://{username}:{password}@{host}:{port}/{database}")

# 3. Define metadata
metadata = MetaData()

# 4. Define restro table schema
restro_table = Table(
    'restro', metadata,
    Column('id', Integer, primary_key=True, nullable=False),
    Column('position', Integer),
    Column('name', Text, nullable=False),
    Column('score', Float),
    Column('ratings', Float),
    Column('category', Text),
    Column('price_range', Text),
    Column('full_address', Text),
    Column('zip_code',String(15)),
    Column('lat', Float),
    Column('lng', Float)
)

# 5. Define menu table schema with FOREIGN KEY
menu_table = Table(
    'menu', metadata,
    Column('restaurant_id', Integer, ForeignKey('restro.id')),  # âœ… fixed here
    Column('category', Text),
    Column('name', Text),
    Column('description', Text),
    Column('price', Float)
)

# 6. Create both tables in MySQL
metadata.create_all(engine)

<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:25px;">
<b>Load tables into SQL</b>

In [9]:
restro.to_sql('restro', con=engine, if_exists='append', index=False, chunksize=10000)

687

In [10]:
menu.to_sql('menu', con=engine, if_exists='replace', index=False, chunksize=10000)

50212

<div style="background-color:#e2f0d9; padding:15px; border-radius:15px; color:#003366; font-family:Arial; font-size:18px;">

<span style="font-size:30px; font-weight:bold;">Checking for `duplicates` and `null values`</span><br>
- Checking Duplicates.
- Checking null values.

</div>

<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:17px;">
<b>View schema</b>

In [11]:
df = pd.read_sql("""
show tables;
"""
, con=engine)
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+---------------------------+
| Tables_in_sql_project_1   |
|---------------------------|
| menu                      |
| restro                    |
+---------------------------+


In [12]:
df = pd.read_sql("""
SHOW COLUMNS FROM restro;
""", con=engine)
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+--------------+-------------+--------+-------+-----------+----------------+
| Field        | Type        | Null   | Key   | Default   | Extra          |
|--------------+-------------+--------+-------+-----------+----------------|
| id           | int         | NO     | PRI   |           | auto_increment |
| position     | int         | YES    |       |           |                |
| name         | text        | NO     |       |           |                |
| score        | float       | YES    |       |           |                |
| ratings      | float       | YES    |       |           |                |
| category     | text        | YES    |       |           |                |
| price_range  | text        | YES    |       |           |                |
| full_address | text        | YES    |       |           |                |
| zip_code     | varchar(15) | YES    |       |           |                |
| lat          | float       | YES    |       |           |                |

In [13]:
df = pd.read_sql("""
SHOW COLUMNS FROM menu;
""", con=engine)
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+---------------+--------+--------+-------+-----------+---------+
| Field         | Type   | Null   | Key   | Default   | Extra   |
|---------------+--------+--------+-------+-----------+---------|
| restaurant_id | bigint | YES    |       |           |         |
| category      | text   | YES    |       |           |         |
| name          | text   | YES    |       |           |         |
| description   | text   | YES    |       |           |         |
| price         | text   | YES    |       |           |         |
+---------------+--------+--------+-------+-----------+---------+


<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:17px;">
<b>Count rows</b>

In [14]:
df = pd.read_sql("""
SELECT count(*) as Rows_count
FROM restro;
""", con=engine)
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+--------------+
|   Rows_count |
|--------------|
|          687 |
+--------------+


In [15]:
df = pd.read_sql("""
SELECT count(*) as Rows_count
FROM menu;
""", con=engine)
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+--------------+
|   Rows_count |
|--------------|
|        50212 |
+--------------+


<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:17px;">
<b>Checking for Duplicates.</b>

In [20]:

df = pd.read_sql("""
SELECT COUNT(*) AS fully_duplicated_rows
FROM (
    SELECT COUNT(*) AS cnt
    FROM restro
    GROUP BY id, position, name, score, ratings, category, price_range,
             full_address, zip_code, lat, lng
    HAVING COUNT(*) > 1
) AS sub;
""", con=engine)

print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+-------------------------+
|   fully_duplicated_rows |
|-------------------------|
|                       0 |
+-------------------------+


In [18]:
df = pd.read_sql("""

SELECT COUNT(*) AS fully_duplicated_rows
FROM (
    SELECT COUNT(*) AS cnt
    FROM menu
    GROUP BY restaurant_id, category, name, description, price
    HAVING COUNT(*) > 1
) AS sub;

""", con=engine)
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+-------------------------+
|   fully_duplicated_rows |
|-------------------------|
|                      53 |
+-------------------------+


<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:17px;">
<b>Checking null values in restro.</b>

#### null values count in `restro`

In [22]:
df = pd.read_sql("""
SELECT
  SUM(id IS NULL) AS null_id,
  SUM(position IS NULL) AS null_position,
  SUM(name IS NULL) AS null_name,
  SUM(score IS NULL) AS null_score,
  SUM(ratings IS NULL) AS null_ratings,
  SUM(category IS NULL) AS null_category,
  SUM(price_range IS NULL) AS null_price_range,
  SUM(full_address IS NULL) AS null_full_address,
  SUM(zip_code IS NULL) AS null_zip_code,
  SUM(lat IS NULL) AS null_lat,
  SUM(lng IS NULL) AS null_lng
FROM restro;
""", con=engine)
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+-----------+-----------------+-------------+--------------+----------------+-----------------+--------------------+---------------------+-----------------+------------+------------+
|   null_id |   null_position |   null_name |   null_score |   null_ratings |   null_category |   null_price_range |   null_full_address |   null_zip_code |   null_lat |   null_lng |
|-----------+-----------------+-------------+--------------+----------------+-----------------+--------------------+---------------------+-----------------+------------+------------|
|         0 |               0 |           0 |          344 |            344 |               0 |                116 |                   4 |               4 |          0 |          0 |
+-----------+-----------------+-------------+--------------+----------------+-----------------+--------------------+---------------------+-----------------+------------+------------+


#### null values % in `restro`

In [24]:
df = pd.read_sql("""
SELECT
  ROUND(SUM(score IS NULL) * 100 / COUNT(*), 2) AS null_score_pct,
  ROUND(SUM(ratings IS NULL) * 100 / COUNT(*), 2) AS null_ratings_pct,
  ROUND(SUM(category IS NULL) * 100 / COUNT(*), 2) AS null_category_pct,
  ROUND(SUM(price_range IS NULL) * 100 / COUNT(*), 2) AS null_price_range_pct,
  ROUND(SUM(full_address IS NULL) * 100 / COUNT(*), 2) AS null_full_address_pct,
  ROUND(SUM(zip_code IS NULL) * 100 / COUNT(*), 2) AS null_zip_code_pct
FROM restro;
""", con=engine)

print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+------------------+--------------------+---------------------+------------------------+-------------------------+---------------------+
|   null_score_pct |   null_ratings_pct |   null_category_pct |   null_price_range_pct |   null_full_address_pct |   null_zip_code_pct |
|------------------+--------------------+---------------------+------------------------+-------------------------+---------------------|
|            50.07 |              50.07 |                   0 |                  16.89 |                    0.58 |                0.58 |
+------------------+--------------------+---------------------+------------------------+-------------------------+---------------------+


<div style="background-color:#f0f8ff; padding:15px; border-radius:12px; color:#003366; font-family:Arial; font-size:17px;">
<b>Checking null values in menu.</b>

#### null values count in `menu` table.

In [25]:
df = pd.read_sql("""
SELECT
  SUM(restaurant_id IS NULL) AS null_restaurant_id,
  SUM(category IS NULL) AS null_category,
  SUM(name IS NULL) AS null_name,
  SUM(description IS NULL) AS null_description,
  SUM(price IS NULL) AS null_price
FROM menu;
""", con=engine)

print(tabulate(df, headers="keys", tablefmt="psql", showindex=False)) 

+----------------------+-----------------+-------------+--------------------+--------------+
|   null_restaurant_id |   null_category |   null_name |   null_description |   null_price |
|----------------------+-----------------+-------------+--------------------+--------------|
|                    0 |               0 |           0 |              13133 |            0 |
+----------------------+-----------------+-------------+--------------------+--------------+


#### null values percentage in `menu` table.

In [26]:
df = pd.read_sql("""
SELECT
  ROUND(SUM(name IS NULL) * 100.0 / COUNT(*), 2) AS null_name_pct,
  ROUND(SUM(description IS NULL) * 100.0 / COUNT(*), 2) AS null_description_pct  
FROM menu;
""", con=engine)

print(tabulate(df, headers="keys", tablefmt="psql", showindex=False))

+-----------------+------------------------+
|   null_name_pct |   null_description_pct |
|-----------------+------------------------|
|               0 |                  26.16 |
+-----------------+------------------------+
