# AIPI 510: Team Assignment 2 - Data Storage and Access (SQL)

## Owais Kamdar, Rajiv Raman

We are submitting our code for this assignment in the Python notebook format. **To run the code, click "Run All" to evaluate all the cells.** Comments throughout the code will indicate where the CREATE, READ, UPDATE, and DELETE requirements are satisfied.

In [2]:
import sqlite3
import pandas as pd
import csv
import numpy as np
import os

# define the sqlquery function to execute a given SQL query on a given SQL database connection

def sqlquery(connection, prompt, params=()):
    """Execute one SQL statement on an open SQLite connection and return its rows.

    Args:
        connection: an open ``sqlite3`` connection.
        prompt: the SQL text to execute.
        params: optional values bound to ``?`` / named placeholders in
            ``prompt``; defaults to no parameters.

    Returns:
        A list of row tuples when the statement produced at least one row;
        the string ``"No results found."`` when it produced none (this is
        also what write statements such as UPDATE or DELETE yield); ``None``
        when sqlite3 raised an error, which is printed rather than re-raised.
    """
    try:
        cur = connection.cursor()  # cursor used for this single statement
        try:
            cur.execute(prompt, params)  # prompt parameter is fed in as SQL query
            results = cur.fetchall()  # collect every row the statement produced
        finally:
            cur.close()  # release the cursor even if execution failed

        # sqlite3 opens an implicit transaction for INSERT/UPDATE/DELETE; if it
        # is not committed, the change is discarded when the connection closes.
        if connection.in_transaction:
            connection.commit()
    except sqlite3.Error as e:
        print(f"SQL query execution error: {e}")  # report the failure to the user
        return None

    if results:  # the query returned a nonzero number of rows
        return results
    return "No results found."  # no data entry matched the query

# Location of tips.csv. The original author's path stays the default, but it can
# be overridden with the TIPS_CSV_PATH environment variable on other machines.
tips_csv_path = os.environ.get("TIPS_CSV_PATH", "/Users/rajivraman/Downloads/tips.csv")

# Bind conn before the try block so the finally clause can test it safely even
# when sqlite3.connect itself raises.
conn = None

try:
    conn = sqlite3.connect("test1") # open the connection to an SQL database named test1
    print("Connected to the SQL Database! Executing all queries...\n")

    data = pd.read_csv(tips_csv_path) # store the data from tips.csv in this variable

    try:
        data.to_sql('table1',conn,index=False,if_exists='replace') # the CREATE step - forms the SQL database from tips.csv
    except Exception as e:
        print(f"Data upload error: {e}") # print error message if data cannot be uploaded to the SQL database

    # the READ step is fully satisfied below - all 10 requested SQL queries plus 5 custom queries have been written

    # SQL query for #1. Retrieve the average tip percentage for each day of the week.

    sql1 = sqlquery(conn,"SELECT day, AVG(tip/total_bill * 100) FROM table1 GROUP BY day")

    # SQL query for #2. Find the minimum and maximum total bill amounts.

    sql2 = sqlquery(conn,"SELECT MIN(total_bill), MAX(total_bill) FROM table1")

    # SQL query for #3. Count the number of parties for each size.

    sql3 = sqlquery(conn,"SELECT size, COUNT(total_bill) FROM table1 GROUP BY size")

    # SQL query for #4. Retrieve the total bill and tip for parties of size 4 or more, where the tip percentage is greater than 15%.

    sql4 = sqlquery(conn,"SELECT size, total_bill, tip, tip/total_bill * 100 FROM table1 WHERE size >= 4 AND tip/total_bill * 100 > 15")

    # SQL query for #5. Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order.
    # Rows are grouped by day and time first; within each group they are ordered by tip percentage, highest first.

    sql5 = sqlquery(conn,"SELECT day, time, tip, total_bill, tip/total_bill * 100 FROM table1 ORDER BY day, time, tip/total_bill * 100 DESC")

    # SQL query for #6. Find the average tip percentage for each combination of day, time, and smoker status.

    sql6 = sqlquery(conn,"SELECT day, time, smoker, AVG(tip/total_bill * 100) FROM table1 GROUP BY day, time, smoker")

    # SQL query for #7. Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records.

    sql7 = sqlquery(conn,"SELECT sex, total_bill, tip, tip/total_bill * 100 FROM table1 ORDER BY total_bill DESC LIMIT 5")

    # SQL query for #8. Find the maximum and minimum tip percentage for each day and time combination, along with the corresponding total bill and tip amount.
    # The two correlated subqueries compute the max and min tip percentage within the current row's (day, time) group.

    sql8 = sqlquery(conn,"SELECT day, time, tip/total_bill * 100, tip, total_bill FROM table1 WHERE tip/total_bill * 100 = (SELECT MAX(tip/total_bill * 100) FROM table1 AS temp WHERE temp.day = table1.day AND temp.time = table1.time) OR tip/total_bill * 100 = (SELECT MIN(tip/total_bill * 100) FROM table1 AS temp WHERE temp.day = table1.day AND temp.time = table1.time) ORDER BY day, time, tip/total_bill * 100")

    # SQL query for #9. Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100.

    sql9 = sqlquery(conn,"SELECT total_bill, tip, tip/total_bill * 100 FROM table1 WHERE size >= 4 AND tip/total_bill * 100 > 15 AND total_bill <= 100 AND total_bill >= 50")

    # SQL query for #10. Find the average tip percentage for each combination of day, time, and smoker status, but only include combinations with more than 5 records.

    sql10 = sqlquery(conn,"SELECT day, time, smoker, AVG(tip/total_bill * 100) FROM table1 GROUP BY day, time, smoker HAVING COUNT(*) > 5")

    # Extra SQL query #1. Compute the total tip per day.

    xsql1 = sqlquery(conn,"SELECT day, SUM(tip) AS total_tip FROM table1 GROUP BY day ORDER BY total_tip DESC")

    # Extra SQL query #2. Compute the highest total bill for each party size.

    xsql2 = sqlquery(conn,"SELECT size, MAX(total_bill) AS max_total_bill FROM table1 GROUP BY size ORDER BY size")

    # Extra SQL query #3. Compute the average tip for each sex.

    xsql3 = sqlquery(conn,"SELECT sex, AVG(tip) AS avg_tip FROM table1 GROUP BY sex")

    # Extra SQL query #4. Compute the top 3 highest tips for each day.

    xsql4 = sqlquery(conn,"WITH ranked_tips AS (SELECT day, tip, ROW_NUMBER() OVER (PARTITION BY day ORDER BY tip DESC) AS rank FROM table1) SELECT day, tip FROM ranked_tips WHERE rank <= 3 ORDER BY day, tip DESC")

    # Extra SQL query #5. Compute the number of smokers vs. number of non-smokers by time.

    xsql5 = sqlquery(conn, "SELECT time, smoker, COUNT(*) AS count FROM table1 GROUP BY time, smoker ORDER BY time, smoker")

    # the UPDATE step - changes the smoker entry at ID 10 (not defined inherently in SQL database, so had to pull the $14.78 from entry 10 in spreadsheet) in tips.csv to 'Yes'
    # NOTE(review): the WHERE clause matches on the bill amount, so every row whose total_bill equals 14.78 is updated - confirm that only one such row exists.
    sqlquery(conn,"UPDATE table1 SET smoker = 'Yes' WHERE total_bill = 14.78")

    sqlquery(conn,"DELETE FROM table1 WHERE total_bill < 10") # the DELETE step - removes all data entries with a total bill less than $10

    # sqlite3 runs UPDATE/DELETE inside an implicit transaction; commit here so
    # the changes are not discarded when the connection is closed below.
    conn.commit()

except sqlite3.Error as e:
    print(f"Database connection error: {e}") # print error message if we cannot connect to the database

finally: # runs whether or not the "try" block succeeded
    if conn is not None: # only close a connection that was actually opened
        conn.close() # close the connection to the SQL database
        print("\nYour database connection has been closed.")

Connected to the SQL Database! Executing all queries...


Your database connection has been closed.


In [3]:
# Question corresponding to SQL query 1.
print("1. Retrieve the average tip percentage for each day of the week.")
# Wrap the query rows in a table with descriptive column labels.
df1 = pd.DataFrame(
    data=sql1,
    columns=["Day", "Mean Tip Percent"],
)
# Render the table in the notebook.
display(df1)

1. Retrieve the average tip percentage for each day of the week.


Unnamed: 0,Day,Mean Tip Percent
0,Fri,16.991303
1,Sat,15.315172
2,Sun,16.689729
3,Thur,16.127563


In [4]:
# Question corresponding to SQL query 2.
print("2. Find the minimum and maximum total bill amounts.")
# Wrap the query rows in a table with descriptive column labels.
df2 = pd.DataFrame(
    data=sql2,
    columns=["Minimum Total Bill", "Maximum Total Bill"],
)
# Render the table in the notebook.
display(df2)

2. Find the minimum and maximum total bill amounts.


Unnamed: 0,Minimum Total Bill,Maximum Total Bill
0,3.07,50.81


In [5]:
# Question corresponding to SQL query 3.
print("3. Count the number of parties for each size.")
# Wrap the query rows in a table with descriptive column labels.
df3 = pd.DataFrame(
    data=sql3,
    columns=["Size", "Number of Parties"],
)
# Render the table in the notebook.
display(df3)

3. Count the number of parties for each size.


Unnamed: 0,Size,Number of Parties
0,1,4
1,2,156
2,3,38
3,4,37
4,5,5
5,6,4


In [6]:
# Question corresponding to SQL query 4.
print("4. Retrieve the total bill and tip for parties of size 4 or more, where the tip percentage is greater than 15%.")
# Wrap the query rows in a table with descriptive column labels.
df4 = pd.DataFrame(
    data=sql4,
    columns=["Size", "Total Bill", "Tip", "Tip Percent"],
)
# Render the table in the notebook.
display(df4)

4. Retrieve the total bill and tip for parties of size 4 or more, where the tip percentage is greater than 15%.


Unnamed: 0,Size,Total Bill,Tip,Tip Percent
0,4,25.29,4.71,18.623962
1,4,18.43,3.0,16.277808
2,4,39.42,7.58,19.228818
3,4,30.4,5.6,18.421053
4,4,32.4,6.0,18.518519
5,4,25.56,4.34,16.979656
6,4,18.29,3.76,20.557682
7,4,29.93,5.07,16.939526
8,6,34.3,6.7,19.533528
9,6,27.05,5.0,18.484288


In [7]:
# Question corresponding to SQL query 5.
print("5. Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order.")
# Wrap the query rows in a table; labels follow the SELECT column order.
df5 = pd.DataFrame(
    data=sql5,
    columns=["Day", "Time", "Tip", "Total Bill", "Tip Percent"],
)
# Render the table in the notebook.
display(df5)

5. Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order.


Unnamed: 0,Day,Time,Tip,Total Bill,Tip Percent
0,Fri,Dinner,4.30,16.32,26.348039
1,Fri,Dinner,2.50,11.35,22.026432
2,Fri,Dinner,3.00,15.38,19.505852
3,Fri,Dinner,1.00,5.75,17.391304
4,Fri,Dinner,3.50,22.49,15.562472
...,...,...,...,...,...
239,Thur,Lunch,5.00,43.11,11.598237
240,Thur,Lunch,2.18,22.82,9.553024
241,Thur,Lunch,2.56,28.44,9.001406
242,Thur,Lunch,1.50,19.08,7.861635


In [8]:
# Question corresponding to SQL query 6.
print("6. Find the average tip percentage for each combination of day, time, and smoker status.")
# Wrap the query rows in a table with descriptive column labels.
df6 = pd.DataFrame(
    data=sql6,
    columns=["Day", "Time", "Smoker", "Mean Tip Percent"],
)
# Render the table in the notebook.
display(df6)

6. Find the average tip percentage for each combination of day, time, and smoker status.


Unnamed: 0,Day,Time,Smoker,Mean Tip Percent
0,Fri,Dinner,No,13.962237
1,Fri,Dinner,Yes,16.534736
2,Fri,Lunch,No,18.773467
3,Fri,Lunch,Yes,18.893659
4,Sat,Dinner,No,15.804766
5,Sat,Dinner,Yes,14.790607
6,Sun,Dinner,No,16.011294
7,Sun,Dinner,Yes,18.725032
8,Thur,Dinner,No,15.974441
9,Thur,Lunch,No,16.031067


In [9]:
# Question corresponding to SQL query 7.
print("7. Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records.")
# Wrap the query rows in a table with descriptive column labels.
df7 = pd.DataFrame(
    data=sql7,
    columns=["Sex", "Total Bill", "Tip", "Tip Percent"],
)
# Render the table in the notebook.
display(df7)

7. Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records.


Unnamed: 0,Sex,Total Bill,Tip,Tip Percent
0,Male,50.81,10.0,19.681165
1,Male,48.33,9.0,18.621974
2,Male,48.27,6.73,13.942407
3,Male,48.17,5.0,10.379905
4,Male,45.35,3.5,7.717751


In [10]:
# Question corresponding to SQL query 8.
print("8. Find the maximum and minimum tip percentage for each day and time combination, along with the corresponding total bill and tip amount.")
# Wrap the query rows in a table; labels follow the SELECT column order.
df8 = pd.DataFrame(
    data=sql8,
    columns=["Day", "Time", "Max or Min Tip Percent", "Tip", "Total Bill"],
)
# Render the table in the notebook.
display(df8)

8. Find the maximum and minimum tip percentage for each day and time combination, along with the corresponding total bill and tip amount.


Unnamed: 0,Day,Time,Max or Min Tip Percent,Tip,Total Bill
0,Fri,Dinner,10.35554,3.0,28.97
1,Fri,Dinner,26.348039,4.3,16.32
2,Fri,Lunch,11.773472,1.58,13.42
3,Fri,Lunch,25.931446,3.48,13.42
4,Sat,Dinner,3.563814,1.17,32.83
5,Sat,Dinner,32.57329,1.0,3.07
6,Sun,Dinner,5.944673,1.01,16.99
7,Sun,Dinner,71.034483,5.15,7.25
8,Thur,Dinner,15.974441,3.0,18.78
9,Thur,Lunch,7.296137,1.36,18.64


In [138]:
# Question corresponding to SQL query 9.
print("9. Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100.\n")
# Show the value returned by the sqlquery function as-is (no DataFrame is built here).
display(sql9)

9. Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100.



'No results found.'

In [11]:
# Question corresponding to SQL query 10.
print("10. Find the average tip percentage for each combination of day, time, and smoker status, but only include combinations with more than 5 records.")
# Wrap the query rows in a table with descriptive column labels.
df10 = pd.DataFrame(
    data=sql10,
    columns=["Day", "Time", "Smoker", "Mean Tip Percent"],
)
# Render the table in the notebook.
display(df10)

10. Find the average tip percentage for each combination of day, time, and smoker status, but only include combinations with more than 5 records.


Unnamed: 0,Day,Time,Smoker,Mean Tip Percent
0,Fri,Dinner,Yes,16.534736
1,Fri,Lunch,Yes,18.893659
2,Sat,Dinner,No,15.804766
3,Sat,Dinner,Yes,14.790607
4,Sun,Dinner,No,16.011294
5,Sun,Dinner,Yes,18.725032
6,Thur,Lunch,No,16.031067
7,Thur,Lunch,Yes,16.386327


In [12]:
# Question corresponding to extra SQL query 1.
print("X1. Compute the total tip per day.")
# Wrap the query rows in a table with descriptive column labels.
dfx1 = pd.DataFrame(
    data=xsql1,
    columns=["Day", "Total Tip"],
)
# Render the table in the notebook.
display(dfx1)

X1. Compute the total tip per day.


Unnamed: 0,Day,Total Tip
0,Sat,260.4
1,Sun,247.39
2,Thur,171.83
3,Fri,51.96


In [13]:
# Question corresponding to extra SQL query 2.
print("X2. Compute the highest total bill for each party size.")
# Wrap the query rows in a table with descriptive column labels.
dfx2 = pd.DataFrame(
    data=xsql2,
    columns=["Size", "Highest Total Bill"],
)
# Render the table in the notebook.
display(dfx2)

X2. Compute the highest total bill for each party size.


Unnamed: 0,Size,Highest Total Bill
0,1,10.07
1,2,40.55
2,3,50.81
3,4,48.33
4,5,41.19
5,6,48.17


In [14]:
# Question corresponding to extra SQL query 3.
print("X3. Compute the average tip for each sex.")
# Wrap the query rows in a table with descriptive column labels.
dfx3 = pd.DataFrame(
    data=xsql3,
    columns=["Sex", "Mean Tip"],
)
# Render the table in the notebook.
display(dfx3)

X3. Compute the average tip for each sex.


Unnamed: 0,Sex,Mean Tip
0,Female,2.833448
1,Male,3.089618


In [15]:
# Question corresponding to extra SQL query 4.
print("X4. Compute the top 3 highest tips for each day.")
# Wrap the query rows in a table with descriptive column labels.
dfx4 = pd.DataFrame(
    data=xsql4,
    columns=["Day", "3 Highest Tips Per Day"],
)
# Render the table in the notebook.
display(dfx4)

X4. Compute the top 3 highest tips for each day.


Unnamed: 0,Day,3 Highest Tips Per Day
0,Fri,4.73
1,Fri,4.3
2,Fri,4.0
3,Sat,10.0
4,Sat,9.0
5,Sat,7.58
6,Sun,6.5
7,Sun,6.0
8,Sun,5.65
9,Thur,6.7


In [16]:
# Question corresponding to extra SQL query 5.
print("X5. Compute the number of smokers vs. number of non-smokers by time.")
# Wrap the query rows in a table with descriptive column labels.
dfx5 = pd.DataFrame(
    data=xsql5,
    columns=["Time", "Smoker", "Count"],
)
# Render the table in the notebook.
display(dfx5)

X5. Compute the number of smokers vs. number of non-smokers by time.


Unnamed: 0,Time,Smoker,Count
0,Dinner,No,106
1,Dinner,Yes,70
2,Lunch,No,45
3,Lunch,Yes,23
