# Team Assignment 2 - SQL

### Team Impasta - Dayeon Kang, Shaunak Badani

Links used:

- [List of dataframe headers](https://stackoverflow.com/questions/19482970/get-a-list-from-pandas-dataframe-column-headers)
- [Join an array of strings](https://www.geeksforgeeks.org/python-string-join-method/)
- [Sqlite3 Documentation](https://docs.python.org/3/library/sqlite3.html#sqlite3-connection-context-manager)
- [Dataframe to array of tuples](https://stackoverflow.com/questions/9758450/pandas-convert-dataframe-to-array-of-tuples)

In [1]:
import pandas as pd
import sqlite3

#### CREATE: Creating the SQLite database file from the CSV

In [2]:
# Load the tips dataset from its CSV file into a DataFrame.
tips_dataframe = pd.read_csv(filepath_or_buffer="data/tips.csv")

In [3]:
# The CSV's column names double as the column names of the SQL table.
database_column_headers = tips_dataframe.columns.values
# Builds "tips(col1,col2,...)": the table name followed by its column list.
table_headers_string = f"tips({','.join(database_column_headers)})"

In [4]:
# Open (or create) the SQLite database file and create the table schema.
con = sqlite3.connect("data/tips.db")

try:
    # Used as a context manager, the connection commits when the block
    # succeeds and rolls back when it raises.
    with con:
        con.execute(f"CREATE TABLE {table_headers_string}")
except sqlite3.Error as e:
    # Report database failures only (e.g. the table already exists when the
    # cell is re-run); unrelated bugs such as a NameError are not swallowed.
    print(e)
else:
    # Reached only after the CREATE TABLE has been committed.
    print("Database successfully created")

Database successfully created


In [5]:
# One "?" placeholder per column, wrapped in parentheses, e.g. "(?,?,?)".
questions_string = "(" + ",".join(["?"] * len(database_column_headers)) + ")"
# Every DataFrame row as a plain tuple (index dropped, no namedtuple wrapper).
database_rows = [*tips_dataframe.itertuples(index=False, name=None)]

In [6]:
# Bulk-insert all rows. Values are bound through "?" placeholders rather than
# being formatted into the SQL text.
try:
    # The connection context manager commits on success, rolls back on error.
    with con:
        con.executemany(
            f"INSERT INTO {table_headers_string} VALUES {questions_string}",
            database_rows,
        )
except sqlite3.Error as e:
    # Report database failures only; unrelated bugs are not swallowed.
    print(e)


#### READ: Reading from the database

In [20]:
# 1. Average tip percentage for each day of the week.
# The per-row ratio tip / total_bill is averaged within each day and scaled
# by 100; results are ordered by the day label.
average_tip_query = (
    "SELECT AVG(tip / total_bill) * 100, day "
    "FROM tips "
    "GROUP BY day "
    "ORDER BY day"
)
with con:
    rows = con.execute(average_tip_query)
    print("(avg_tip_percentage, day)")
    for record in rows:
        print(record)

(avg_tip_percentage, day)
(16.991302873347887, 'Fri')
(15.31517163877781, 'Sat')
(16.689728635113457, 'Sun')
(16.127563396664705, 'Thur')


In [21]:
# 2. Largest and smallest bill amounts across the whole table.
max_min_query = (
    "SELECT MAX(total_bill), MIN(total_bill) "
    "FROM tips"
)
with con:
    rows = con.execute(max_min_query)
    print("(max_bill_amount, min_bill_amount)")
    for record in rows:
        print(record)

(max_bill_amount, min_bill_amount)
(50.81, 3.07)


In [22]:
# 3. Number of parties for each party size.
number_of_parties_for_each_size_query = (
    "SELECT COUNT(*), size "
    "FROM tips "
    "GROUP BY size"
)
with con:
    rows = con.execute(number_of_parties_for_each_size_query)
    print("(number_of_parties, size)")
    for record in rows:
        print(record)

(number_of_parties, size)
(4, 1)
(156, 2)
(38, 3)
(37, 4)
(5, 5)
(4, 6)


In [24]:
# 4. Total bill and tip for parties of size 4 or more whose tip exceeds 15%
#    of the bill.
complex_bill_tip_query = (
    "SELECT total_bill, tip, size "
    "FROM tips "
    "WHERE size >= 4 AND (tip / total_bill) > 0.15"
)
with con:
    rows = con.execute(complex_bill_tip_query)
    print("(total_bill,tip, size)")
    for record in rows:
        print(record)

(total_bill,tip, size)
(25.29, 4.71, 4)
(18.43, 3.0, 4)
(39.42, 7.58, 4)
(30.4, 5.6, 4)
(32.4, 6.0, 4)
(25.56, 4.34, 4)
(18.29, 3.76, 4)
(29.93, 5.07, 4)
(34.3, 6.7, 6)
(27.05, 5.0, 6)
(29.85, 5.14, 5)
(21.5, 3.5, 4)
(23.17, 6.5, 4)
(20.69, 5.0, 5)
(20.53, 4.0, 4)
(25.89, 5.16, 4)
(48.33, 9.0, 4)


In [32]:
# 5. Total bill, total tip, and tip percentage for each (day, time)
#    combination, sorted by tip percentage in descending order.
#    The percentage here is the ratio of the group's sums
#    (SUM(tip) / SUM(total_bill)), not an average of per-row ratios.
query_5 = (
    "SELECT SUM(total_bill), SUM(tip), SUM(tip) * 100 / SUM(total_bill), day, time "
    "FROM tips "
    "GROUP BY day, time "
    "ORDER BY SUM(tip) / SUM(total_bill) DESC"
)
with con:
    rows = con.execute(query_5)
    # The header lists all five selected columns so it lines up with each row.
    print("(total_bill, tip_amount, tip_percentage, day, time)")
    for row in rows:
        print(row)

(total_bill, tip_amount, tip_percentage)
(89.92, 16.68, 18.54982206405694, 'Fri', 'Lunch')
(18.78, 3.0, 15.974440894568689, 'Thur', 'Dinner')
(1077.5499999999997, 168.83, 15.667950443134893, 'Thur', 'Lunch')
(1627.1600000000003, 247.39000000000007, 15.203790653654222, 'Sun', 'Dinner')
(235.95999999999998, 35.28, 14.951686726563826, 'Fri', 'Dinner')
(1778.3999999999996, 260.4, 14.64237516869096, 'Sat', 'Dinner')


In [33]:
# 6. Average tip percentage for each (day, time, smoker) combination.
# The per-row ratio tip / total_bill is averaged within each group and
# scaled by 100.
avg_tip_percentage_6 = (
    "SELECT AVG(tip / total_bill) * 100, day, time, smoker "
    "FROM tips "
    "GROUP BY day, time, smoker"
)
with con:
    rows = con.execute(avg_tip_percentage_6)
    print("(avg_tip_percent, day, time, smoker)")
    for record in rows:
        print(record)

(avg_tip_percent, day, time, smoker)
(13.962236590021224, 'Fri', 'Dinner', 'No')
(16.534735890795094, 'Fri', 'Dinner', 'Yes')
(18.773466833541928, 'Fri', 'Lunch', 'No')
(18.893659162141404, 'Fri', 'Lunch', 'Yes')
(15.804765754377618, 'Sat', 'Dinner', 'No')
(14.790606514920887, 'Sat', 'Dinner', 'Yes')
(16.011294305072806, 'Sun', 'Dinner', 'No')
(18.725031625235424, 'Sun', 'Dinner', 'Yes')
(15.974440894568689, 'Thur', 'Dinner', 'No')
(16.031066524359844, 'Thur', 'Lunch', 'No')
(16.38632721334178, 'Thur', 'Lunch', 'Yes')


#### Delete the db file after all execution is completed

In [7]:
import os

# Close the connection before deleting its file: it is otherwise never
# released, and some platforms (notably Windows) refuse to delete a file
# that is still open.
try:
    con.close()
except NameError:
    # The cell was run in a session where no connection was opened.
    pass

# Remove the file directly instead of checking for it first, so there is no
# gap between the existence check and the removal.
try:
    os.remove("data/tips.db")
except FileNotFoundError:
    pass