# Team Assignment 2 - SQL

### Team Impasta - Dayeon Kang, Shaunak Badani

Links used:

- [List of dataframe headers](https://stackoverflow.com/questions/19482970/get-a-list-from-pandas-dataframe-column-headers)
- [Join an array of strings](https://www.geeksforgeeks.org/python-string-join-method/)
- [Sqlite3 Documentation](https://docs.python.org/3/library/sqlite3.html#sqlite3-connection-context-manager)
- [Dataframe to array of tuples](https://stackoverflow.com/questions/9758450/pandas-convert-dataframe-to-array-of-tuples)

In [1]:
import sqlite3
from contextlib import closing

import pandas as pd

#### CREATE: Creating the sqlite3 file from the csv

In [2]:
# Load the raw tips dataset; the table schema and rows are derived from it below.
tips_dataframe = pd.read_csv(filepath_or_buffer="data/tips.csv")

In [3]:
# The CSV's column names double as the column list of the SQL table.
database_column_headers = tips_dataframe.columns.values
# Yields "tips(<col>,<col>,...)"; reused by both the CREATE TABLE and INSERT statements.
table_headers_string = f"tips({','.join(database_column_headers)})"

In [4]:
# Create the database file and the tips table (columns are declared by name only).
#
# sqlite3's connection context manager only commits or rolls back the
# transaction; it never closes the connection. closing() releases the handle
# so the database file is not held open after this cell.

try:
    with closing(sqlite3.connect("data/tips.db")) as con, con:
        con.execute(f"CREATE TABLE {table_headers_string}")
        print("Database successfully created")
except sqlite3.Error as e:
    # Report database problems by message only; anything else is a
    # programming error and should surface with its traceback.
    print(e)

Database successfully created


In [5]:
# One "?" placeholder per column, wrapped in parentheses, e.g. "(?,?,?)".
questions_string = "(" + ",".join("?" * len(database_column_headers)) + ")"
# Plain tuples (no index, no namedtuple wrapper) to feed executemany().
database_rows = [*tips_dataframe.itertuples(index=False, name=None)]

In [6]:
# Bulk-insert every CSV row into the tips table.
# The inner `con` context commits on success and rolls back on error;
# closing() then closes the connection, which the sqlite3 context manager
# alone does not do.
try:
    with closing(sqlite3.connect("data/tips.db")) as con, con:
        con.executemany(
            f"INSERT INTO {table_headers_string} VALUES {questions_string}",
            database_rows,
        )
except sqlite3.Error as e:
    print(e)


#### READ: Reading from the database

In [7]:
def execute_query(query, header=None, db_path="data/tips.db"):
    """Run *query* against the SQLite database and print every result row.

    Args:
        query: SQL statement to execute.
        header: Optional label line printed before the rows; nothing is
            printed for it when it is None.
        db_path: Path of the SQLite file to open. Defaults to the
            notebook's "data/tips.db".

    Returns:
        None. Each row is printed as a tuple on its own line. SQLite errors
        are printed (message only) rather than raised.
    """
    if header is not None:
        print(header)
    try:
        # `with con` only commits or rolls back; closing() is what actually
        # closes the connection once the rows have been printed.
        with closing(sqlite3.connect(db_path)) as con, con:
            for row in con.execute(query):
                print(row)
    except (sqlite3.Error, sqlite3.Warning) as e:
        print(e)

In [8]:
# 1. Average tip percentage for each day of the week.
# Computed as the day's total tips over its total bills, rounded to 3 decimals.
average_tip_query = """
            SELECT ROUND(SUM(tip) * 100 / SUM(total_bill), 3) , day 
            FROM tips 
            GROUP BY day 
            ORDER BY day
        """
execute_query(query=average_tip_query, header="(avg_tip_percentage, day)")

(avg_tip_percentage, day)
(15.945, 'Fri')
(14.642, 'Sat')
(15.204, 'Sun')
(15.673, 'Thur')


In [9]:
# 2. Largest and smallest bill amount across the whole table.
max_min_query = "SELECT MAX(total_bill), MIN(total_bill) FROM tips"
execute_query(query=max_min_query, header="(max_bill_amount, min_bill_amount)")

(max_bill_amount, min_bill_amount)
(50.81, 3.07)


In [10]:
# 3. How many parties there are of each party size.
number_of_parties_for_each_size_query = "SELECT COUNT(*), size FROM tips GROUP BY size"
execute_query(
    query=number_of_parties_for_each_size_query,
    header="(number_of_parties, size)",
)

(number_of_parties, size)
(4, 1)
(156, 2)
(38, 3)
(37, 4)
(5, 5)
(4, 6)


In [11]:
# 4. Total bill and tip (plus party size) for parties of 4 or more
# whose tip exceeds 15% of the bill.

complex_bill_tip_query = """
        SELECT total_bill, tip, size 
        FROM tips 
        WHERE size >= 4 
        AND (tip / total_bill) > 0.15
        """
execute_query(query=complex_bill_tip_query, header="(total_bill,tip, size)")

(total_bill,tip, size)
(25.29, 4.71, 4)
(18.43, 3.0, 4)
(39.42, 7.58, 4)
(30.4, 5.6, 4)
(32.4, 6.0, 4)
(25.56, 4.34, 4)
(18.29, 3.76, 4)
(29.93, 5.07, 4)
(34.3, 6.7, 6)
(27.05, 5.0, 6)
(29.85, 5.14, 5)
(21.5, 3.5, 4)
(23.17, 6.5, 4)
(20.69, 5.0, 5)
(20.53, 4.0, 4)
(25.89, 5.16, 4)
(48.33, 9.0, 4)


In [12]:
# 5. Total bill, total tip and tip percentage for every (day, time)
# combination, highest tip percentage first.

query_5 = (
    "SELECT SUM(total_bill), SUM(tip), SUM(tip) * 100 / SUM(total_bill), day, time "
    "FROM tips "
    "GROUP BY day, time "
    "ORDER BY SUM(tip) / SUM(total_bill) DESC"
)
# The header has to name every selected column, including day and time,
# otherwise the printed 5-tuples are mislabelled.
execute_query(query_5, "(total_bill, tip_amount, tip_percentage, day, time)")

(total_bill, tip_amount, tip_percentage)
(89.92, 16.68, 18.54982206405694, 'Fri', 'Lunch')
(18.78, 3.0, 15.974440894568689, 'Thur', 'Dinner')
(1077.5499999999997, 168.83, 15.667950443134893, 'Thur', 'Lunch')
(1627.1600000000003, 247.39000000000007, 15.203790653654222, 'Sun', 'Dinner')
(235.95999999999998, 35.28, 14.951686726563826, 'Fri', 'Dinner')
(1778.3999999999996, 260.4, 14.64237516869096, 'Sat', 'Dinner')


In [13]:
# 6. Average tip percentage (total tips over total bills, 3 decimals)
# for every combination of day, time and smoker status.
avg_tip_percentage_6 = """
    SELECT ROUND(SUM(tip) * 100 / SUM(total_bill), 3), day, time, smoker 
    FROM tips 
    GROUP BY day, time, smoker"""
execute_query(
    query=avg_tip_percentage_6,
    header="(avg_tip_percent, day, time, smoker)",
)

(avg_tip_percent, day, time, smoker)
(14.298, 'Fri', 'Dinner', 'No')
(15.163, 'Fri', 'Dinner', 'Yes')
(18.773, 'Fri', 'Lunch', 'No')
(18.501, 'Fri', 'Lunch', 'Yes')
(15.781, 'Sat', 'Dinner', 'No')
(13.515, 'Sat', 'Dinner', 'Yes')
(15.448, 'Sun', 'Dinner', 'No')
(14.581, 'Sun', 'Dinner', 'Yes')
(15.974, 'Thur', 'Dinner', 'No')
(15.615, 'Thur', 'Lunch', 'No')
(15.789, 'Thur', 'Lunch', 'Yes')


In [14]:
# 7. Total bill, total tip and tip percentage per sex, largest total bill
# first, capped at the top 5 rows.
query_7 = """
    SELECT ROUND(SUM(total_bill), 3), ROUND(SUM(tip), 3), ROUND(SUM(tip) * 100 / SUM(total_bill), 3), sex
    FROM tips 
    GROUP BY sex 
    ORDER BY SUM(total_bill) DESC 
    LIMIT 5
    """
execute_query(query=query_7, header="(total_bill, tip_amount, tip_percentage, sex)")

(total_bill, tip_amount, tip_percentage, sex)
(3256.82, 485.07, 14.894, 'Male')
(1570.95, 246.51, 15.692, 'Female')


In [15]:
# 8. Highest and lowest per-bill tip percentage for each (day, time)
# combination, along with the tip and total bill shown next to it.
# NOTE: tip and total_bill are bare (non-aggregated) columns here; the query
# relies on SQLite taking them from the row that holds the MAX()/MIN() value.

query_8_1 = "SELECT MAX(tip * 100 / total_bill) , tip, total_bill, day, time FROM tips GROUP BY day, time"
query_8_2 = "SELECT MIN(tip * 100 / total_bill) , tip, total_bill, day, time FROM tips GROUP BY day, time"

execute_query(query=query_8_1, header="(max_tip_percentage, tip, total_bill, day, time)")
print("-----------------")
execute_query(query=query_8_2, header="(min_tip_percentage, tip, total_bill, day, time)")
    

(max_tip_percentage, tip, total_bill, day, time)
(26.348039215686274, 4.3, 16.32, 'Fri', 'Dinner')
(25.93144560357675, 3.48, 13.42, 'Fri', 'Lunch')
(32.57328990228013, 1.0, 3.07, 'Sat', 'Dinner')
(71.03448275862068, 5.15, 7.25, 'Sun', 'Dinner')
(15.974440894568689, 3.0, 18.78, 'Thur', 'Dinner')
(26.63115845539281, 2.0, 7.51, 'Thur', 'Lunch')
-----------------
(min_tip_percentage, tip, total_bill, day, time)
(10.355540214014498, 3.0, 28.97, 'Fri', 'Dinner')
(11.773472429210134, 1.58, 13.42, 'Fri', 'Lunch')
(3.563813585135547, 1.17, 32.83, 'Sat', 'Dinner')
(5.9446733372572105, 1.01, 16.99, 'Sun', 'Dinner')
(15.974440894568689, 3.0, 18.78, 'Thur', 'Dinner')
(7.296137339055794, 1.36, 18.64, 'Thur', 'Lunch')


In [16]:
# 9. Total bill, tip and tip percentage for parties of 4 or more whose tip
# exceeds 15% of the bill and whose total bill is between $50 and $100.
query_9 = """
    SELECT total_bill, tip, (tip * 100 / total_bill), size 
    FROM tips 
    WHERE size >=4 
    AND (tip / total_bill) > 0.15 
    AND total_bill BETWEEN 50 AND 100
    """
# The header names all four selected columns so any returned row is
# labelled correctly (size is selected as well).
execute_query(query_9, "(total_bill, tip, tip_percentage, size)")

(total_bill, tip, tip_percentage)


In [17]:
# 10. Average tip percentage per (day, time, smoker) combination, keeping
# only the combinations backed by more than 5 records.
query_10 = """
    SELECT ROUND(SUM(tip) * 100 / SUM(total_bill), 3), day, time, smoker 
    FROM tips 
    GROUP BY day, time, smoker 
    HAVING COUNT(*) > 5
    """
execute_query(query=query_10, header="(avg_tip_percentage, day, time, smoker)")

(avg_tip_percentage, day, time, smoker)
(15.163, 'Fri', 'Dinner', 'Yes')
(18.501, 'Fri', 'Lunch', 'Yes')
(15.781, 'Sat', 'Dinner', 'No')
(13.515, 'Sat', 'Dinner', 'Yes')
(15.448, 'Sun', 'Dinner', 'No')
(14.581, 'Sun', 'Dinner', 'Yes')
(15.615, 'Thur', 'Lunch', 'No')
(15.789, 'Thur', 'Lunch', 'Yes')


#### READ: 5 additional queries

In [18]:
# 1. Average tip percentage (total tips over total bills) for each sex.
avg_tip_percentage_grouped_by_sex = """
    SELECT ROUND(SUM(tip) * 100 / SUM(total_bill),3), sex
    FROM tips
    GROUP BY sex
    """
execute_query(
    query=avg_tip_percentage_grouped_by_sex,
    header="(avg_tip_percent, sex)",
)

(avg_tip_percent, sex)
(15.692, 'Female')
(14.894, 'Male')


In [19]:
# 2. Largest single tip left by a party of each size.
highest_tip_among_each_group_size = """
    SELECT MAX(tip), size
    FROM tips
    GROUP BY size
    """
execute_query(
    query=highest_tip_among_each_group_size,
    header="(highest_tp, size)",
)

(highest_tp, size)
(1.92, 1)
(5.85, 2)
(10.0, 3)
(9.0, 4)
(5.14, 5)
(6.7, 6)


In [20]:
# 3. Find the groups who tipped more than $7.
# One row (tip, party size) is printed per group, so the number of printed
# rows is the number of such groups.
groups_tipping_more_than_7 = """
    SELECT tip, size
    FROM tips
    WHERE tip > 7
    """
execute_query(query=groups_tipping_more_than_7, header="(tip, size)")

(tip, size)
(7.58, 4)
(10.0, 3)
(9.0, 4)


In [21]:
# 4. Total tips collected per day, so weekend totals can be compared
# against weekday totals.
tips_grouped_by_day = """
    SELECT ROUND(SUM(tip), 3), day
    FROM tips
    GROUP BY day
    """
execute_query(query=tips_grouped_by_day, header="(total_tips, day)")

(total_tips, day)
(51.96, 'Fri')
(260.4, 'Sat')
(247.39, 'Sun')
(171.83, 'Thur')


In [22]:
# 5. For each party size, count the groups whose total bill exceeded $40.
count_groups = """
    SELECT COUNT(*), size
    FROM tips
    WHERE total_bill > 40
    GROUP BY size
    """
execute_query(query=count_groups, header="(count_of_groups, size)")

(count_of_groups, size)
(1, 2)
(3, 3)
(4, 4)
(1, 5)
(1, 6)


#### UPDATE: Update the record that corresponds to id=10 (SQLite `rowid` 10) and set smoker to Yes.

In [23]:
update_id_10 = """
    UPDATE tips
    SET smoker = 'Yes' 
    WHERE rowid = 10
    """

select_query_for_checking = "SELECT * FROM tips WHERE rowid = 10"

# Print the row before and after the update so the change is visible.
execute_query(select_query_for_checking, "(row with rowid = 10)")
try:
    # The inner `con` context commits on success and rolls back on error, so
    # no explicit cursor or commit() is needed; closing() then closes the
    # connection, which sqlite3's own context manager does not do.
    with closing(sqlite3.connect("data/tips.db")) as con, con:
        con.execute(update_id_10)
except sqlite3.Error as e:
    print("Exception:", e)
execute_query(select_query_for_checking, "(row with rowid = 10)")

(row with rowid = 10)
(14.78, 3.23, 'Male', 'No', 'Sun', 'Dinner', 2)
(row with rowid = 10)
(14.78, 3.23, 'Male', 'Yes', 'Sun', 'Dinner', 2)


#### DELETE: Delete records from the database that have a total bill that is less than $10.

In [24]:
total_bill_less_than_10 = """
    SELECT *
    FROM tips
    WHERE total_bill < 10
    """

delete_records_less_than_10 = """
    DELETE
    FROM tips
    WHERE total_bill < 10
    """

# Show the matching rows, delete them, then run the same SELECT again to
# confirm nothing with total_bill < 10 is left.
execute_query(total_bill_less_than_10, "(records_less_than_10)")

print("------")
try:
    # The inner `con` context commits when the block exits cleanly (and rolls
    # back on error); closing() then closes the connection, which sqlite3's
    # own context manager does not do.
    with closing(sqlite3.connect("data/tips.db")) as con, con:
        con.execute(delete_records_less_than_10)
        print("Deleting records")
except sqlite3.Error as e:
    print("Exception:", e)

print("------")
execute_query(total_bill_less_than_10, "(records_less_than_10)")

(records_less_than_10)
(8.77, 2.0, 'Male', 'No', 'Sun', 'Dinner', 2)
(9.55, 1.45, 'Male', 'No', 'Sat', 'Dinner', 2)
(9.68, 1.32, 'Male', 'No', 'Sun', 'Dinner', 2)
(9.94, 1.56, 'Male', 'No', 'Sun', 'Dinner', 2)
(3.07, 1.0, 'Female', 'Yes', 'Sat', 'Dinner', 1)
(5.75, 1.0, 'Female', 'Yes', 'Fri', 'Dinner', 2)
(7.25, 1.0, 'Female', 'No', 'Sat', 'Dinner', 1)
(8.52, 1.48, 'Male', 'No', 'Thur', 'Lunch', 2)
(8.51, 1.25, 'Female', 'No', 'Thur', 'Lunch', 2)
(8.35, 1.5, 'Female', 'No', 'Thur', 'Lunch', 2)
(9.78, 1.73, 'Male', 'No', 'Thur', 'Lunch', 2)
(7.51, 2.0, 'Male', 'No', 'Thur', 'Lunch', 2)
(7.25, 5.15, 'Male', 'Yes', 'Sun', 'Dinner', 2)
(9.6, 4.0, 'Female', 'Yes', 'Sun', 'Dinner', 2)
(7.56, 1.44, 'Male', 'No', 'Thur', 'Lunch', 2)
(7.74, 1.44, 'Male', 'Yes', 'Sat', 'Dinner', 2)
(8.58, 1.92, 'Male', 'Yes', 'Fri', 'Lunch', 1)
------
Deleting records
------
(records_less_than_10)


#### Delete the db file after all execution is completed

In [25]:
import os

# EAFP: attempt the delete and tolerate the file already being gone, rather
# than a check-then-delete that can race with another process.
try:
    os.remove("data/tips.db")
except FileNotFoundError:
    pass