In [101]:
import sqlite3
import pandas as pd
from contextlib import closing

In [75]:
# Read CSV file 
csv_file = '/Users/owner/Desktop/data_science/ml_course/aipi510/aipi510-fall24/data/tips.csv'
df = pd.read_csv(csv_file)

# Create an SQLite connection (a new database file)
conn = sqlite3.connect('tip')  

# Insert the DataFrame into an SQLite table
df.to_sql('tip', conn, if_exists='replace', index=False)

244

In [76]:
cursor = conn.cursor()
cursor.execute("SELECT * FROM tip LIMIT 5")

<sqlite3.Cursor at 0x12626c340>

In [77]:
# Fetch all rows
rows = cursor.fetchall()
display = pd.DataFrame()
for i,row in enumerate(rows):
    display = pd.concat([display, pd.DataFrame(list(row)).T], axis="rows")
display.columns = df.columns
display["id"] = range(display.shape[0])
display
conn.close()

In [78]:
display

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,id
0,16.99,1.01,Female,No,Sun,Dinner,2,0
0,10.34,1.66,Male,No,Sun,Dinner,3,1
0,21.01,3.5,Male,No,Sun,Dinner,3,2
0,23.68,3.31,Male,No,Sun,Dinner,2,3
0,24.59,3.61,Female,No,Sun,Dinner,4,4


Make the main function that performs the tasks with built-in features in the docstring

In [121]:
def make_query(conn, sql_query, columns, commit=False):
    """_summary_
    Run a SQL query into a connected database, with option to commit. 
    If there is no connection, make one linking to the directory of the csv file.
    If there are sources of error, it will be displayed. Finally, close the SQL query
    Args:
        conn (_type_): a sqlite connection, if already done
        sql_query (_type_): _description_
    """
    if conn is None:
        # Read CSV file 
        csv_file = '/Users/owner/Desktop/data_science/ml_course/aipi510/aipi510-fall24/data/tips.csv'
        df = pd.read_csv(csv_file)
        df["id"] = range(df.shape[0])

        # Create an SQLite connection "tips" (a new database file)
        with closing(sqlite3.connect("tips")) as conn:
            # Create a cursor 
            with closing(conn.cursor()) as cursor: 
                # Insert the DataFrame into an SQLite table
                df.to_sql('tips', conn, if_exists='replace', index=False)

                # Run sql query
                try:
                    cursor.execute(sql_query)
                except ValueError as ve:
                    print(f"There is something wrong with the input value(s). Check data type and values. {ve}")
                except SyntaxError as se:
                    print(f"There is a syntax error within query {se}")

                # fetch all matching rows
                rows = cursor.fetchall()

                display = pd.DataFrame()

                for row in rows:
                    display = pd.concat([display, pd.DataFrame(list(row)).T], axis="rows")
                try: 
                    if columns is None:
                        display.columns = df.columns
                    else:
                        display.columns = columns
                except AttributeError: # find the exception later 
                    print("DataFrame not generated. This may be caused by an error with columns or an empty query.")

            # user option to save changes
            if commit is True:
                conn.commit()

            return display

In [124]:
cols

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size', 'id']

In [125]:
cols = list(df.columns) + ["id"]
make_query(None, "SELECT * FROM tips LIMIT 5", cols)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,id
0,16.99,1.01,Female,No,Sun,Dinner,2,0
0,10.34,1.66,Male,No,Sun,Dinner,3,1
0,21.01,3.5,Male,No,Sun,Dinner,3,2
0,23.68,3.31,Male,No,Sun,Dinner,2,3
0,24.59,3.61,Female,No,Sun,Dinner,4,4


1. Retrieve the average tip percentage for each day of the week

In [226]:
query = "SELECT day, 100*avg(tip/total_bill) FROM tips GROUP BY day"
make_query(None, query, ["day", "avg_tip_percentage(%)"])

Unnamed: 0,day,avg_tip_percentage(%)
0,Fri,16.991303
0,Sat,15.315172
0,Sun,16.689729
0,Thur,16.127563


3. Count the number of parties for each size

In [126]:
query = "SELECT size, COUNT(*) FROM tips GROUP BY size"
make_query(None, query, ["size", "counts"])

Unnamed: 0,size,counts
0,1,4
0,2,156
0,3,38
0,4,37
0,5,5
0,6,4


5. Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order

In [150]:
query = """
SELECT day, time, total_bill, tip, 100*tip/total_bill AS tip_percentage
FROM tips 
GROUP BY day, time
ORDER BY tip_percentage DESC
"""
make_query(None, query, ["day", "time", "total_bill", "tip", "tip_percentage(%)"])

Unnamed: 0,day,time,total_bill,tip,tip_percentage(%)
0,Fri,Lunch,12.16,2.2,18.092105
0,Sat,Dinner,20.65,3.35,16.22276
0,Thur,Dinner,18.78,3.0,15.974441
0,Thur,Lunch,27.2,4.0,14.705882
0,Fri,Dinner,28.97,3.0,10.35554
0,Sun,Dinner,16.99,1.01,5.944673


7. Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records

For Males:

In [220]:
query = """
SELECT sex, total_bill, tip, 100*tip/total_bill AS tip_percentage, RANK() OVER(PARTITION BY sex ORDER BY tip/total_bill DESC) AS ranking
FROM tips
WHERE sex="Male"
LIMIT 5;
"""
make_query(None, query, ["sex", "total_bill", "tip", "tip_percentage(%)", "ranking"])

Unnamed: 0,sex,total_bill,tip,tip_percentage(%),ranking
0,Male,7.25,5.15,71.034483,1
0,Male,11.61,3.39,29.198966,2
0,Male,23.17,6.5,28.053517,3
0,Male,7.51,2.0,26.631158,4
0,Male,23.33,5.65,24.217745,5


For Females

In [185]:
query = """
SELECT sex, total_bill, tip, 100*tip/total_bill AS tip_percentage, RANK() OVER(PARTITION BY sex ORDER BY tip/total_bill DESC) AS ranking
FROM tips
WHERE sex="Female"
LIMIT 5;
"""
make_query(None, query, ["sex", "total_bill", "tip", "tip_percentage(%)", "ranking"])

Unnamed: 0,sex,total_bill,tip,tip_percentage(%),ranking
0,Female,9.6,4.0,41.666667,1
0,Female,3.07,1.0,32.57329,2
0,Female,14.31,4.0,27.952481,3
0,Female,16.32,4.3,26.348039,4
0,Female,13.42,3.48,25.931446,5


9. Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100

In [233]:
query = """
SELECT total_bill, tip, 100*tip/total_bill AS tip_percentage, size
FROM tips
WHERE (size>=4 AND total_bill>=50 AND total_bill<=100) OR (size>=4 AND tip_percentage>15);
"""
make_query(None, query, ["total_bill", "tip", "tip_percentage(%)", "size"])

Unnamed: 0,total_bill,tip,tip_percentage(%),size
0,25.29,4.71,18.623962,4.0
0,18.43,3.0,16.277808,4.0
0,39.42,7.58,19.228818,4.0
0,30.4,5.6,18.421053,4.0
0,32.4,6.0,18.518519,4.0
0,25.56,4.34,16.979656,4.0
0,18.29,3.76,20.557682,4.0
0,29.93,5.07,16.939526,4.0
0,34.3,6.7,19.533528,6.0
0,27.05,5.0,18.484288,6.0


Extra Queries

1. Compare average tip percentages given by smokers versus non-smokers, for each unique party size

In [12]:
query = """
SELECT smoker, size, AVG(100*tip/total_bill) AS mean_tip_percentage
FROM tips
GROUP BY smoker, size;
"""
make_query(None, query, ["smoker", "size", "mean_tip_percentage(%)"])

Unnamed: 0,smoker,size,mean_tip_percentage(%)
0,No,1,15.982947
0,No,2,16.499568
0,No,3,14.967111
0,No,4,14.760449
0,No,5,17.841521
0,No,6,15.62292
0,Yes,1,27.475456
0,Yes,2,16.67058
0,Yes,3,15.754264
0,Yes,4,14.203605


2. Compare average total bills and tip percentages given by customers, for different days and party sizes, at dinners

In [15]:
query = """
SELECT day, size, AVG(total_bill) AS mean_total_bill, AVG(100*tip/total_bill) AS mean_tip_percentage
FROM tips
WHERE time="Dinner"
GROUP BY day, size
ORDER BY mean_tip_percentage DESC;
"""
make_query(None, query, ["day", "size", "mean_total_bill", "mean_tip_percentage(%)"])

Unnamed: 0,day,size,mean_total_bill,mean_tip_percentage(%)
0,Sat,1,5.16,23.183197
0,Sun,2,17.56,18.08696
0,Fri,2,17.799091,16.265852
0,Sun,5,27.0,15.983894
0,Thur,2,18.78,15.974441
0,Sat,2,16.83717,15.528884
0,Sun,4,26.688333,15.31682
0,Sun,3,22.184,15.266238
0,Sat,3,25.509444,15.143861
0,Sat,4,29.876154,13.828925


3. Get the total tip amount per day

In [135]:
query = ''' 
SELECT day, SUM(tip) as tip_sum
FROM tips
GROUP BY day
'''
make_query(None, query, ["day", "tip_sum"])

Unnamed: 0,day,tip_sum
0,Fri,51.96
0,Sat,260.4
0,Sun,247.39
0,Thur,171.83


4. Using a combination of total bill and tips, generate the split amount depending on the number of people

In [138]:
query = '''
SELECT day, time, smoker, size, ROUND((total_bill + tip)/size, 2) as split_amount
FROM tips
'''
make_query(None, query, ["day", "time", "smoker", "size", "split_amount"])

Unnamed: 0,day,time,smoker,size,split_amount
0,Sun,Dinner,No,2,9.0
0,Sun,Dinner,No,3,4.0
0,Sun,Dinner,No,3,8.17
0,Sun,Dinner,No,2,13.5
0,Sun,Dinner,No,4,7.05
...,...,...,...,...,...
0,Sat,Dinner,No,3,11.65
0,Sat,Dinner,Yes,2,14.59
0,Sat,Dinner,Yes,2,12.34
0,Sat,Dinner,No,2,9.79


5. Find the average group size for each combination of day, time, and smoker status

In [139]:
query = '''
SELECT day, time, smoker, ROUND(AVG(size),2) AS tip_percent_avg
FROM tips
GROUP BY day, time, smoker
'''
make_query(None, query, ["day", "time", "smoker", "tip_percent_avg"])

Unnamed: 0,day,time,smoker,tip_percent_avg
0,Fri,Dinner,No,2.0
0,Fri,Dinner,Yes,2.22
0,Fri,Lunch,No,3.0
0,Fri,Lunch,Yes,1.83
0,Sat,Dinner,No,2.56
0,Sat,Dinner,Yes,2.48
0,Sun,Dinner,No,2.93
0,Sun,Dinner,Yes,2.58
0,Thur,Dinner,No,2.0
0,Thur,Lunch,No,2.5


UPDATE (id=10 and set smoker to Yes)

In [133]:
query = """
UPDATE tips SET
smoker = "Yes"
WHERE id=9; 
"""
# id=10 means =9

with closing(sqlite3.connect("tips")) as conn:
    with closing(conn.cursor()) as cursor:
        cursor.execute(query)
    conn.commit()

DELETE

In [134]:
#Delete
query = '''
DELETE FROM tips 
WHERE total_bill < 10
'''
with closing(sqlite3.connect("tips")) as conn:
    with closing(conn.cursor()) as cursor:
        cursor.execute(query)
    conn.commit()