# Assignemnt for TA2

Team: XKJX

Team Members:
  - Xiaoquan Kong
  - Jinglong Xiong

## Helper functions

In [1]:
import json
from typing import Any


def pretty_print(data: Any):
    """
    print data in a pretty way that can be viewed more clean.
    """

    print(json.dumps(data, ensure_ascii=False, indent=4))

## CREATE

### Create Database

In [2]:
# Just in case: remove old database file
!rm XKJX.db

In [3]:
import sqlite3

con = sqlite3.connect("XKJX.db")

### Create table

In [4]:
# table header of csv (for field name reference): total_bill,tip,sex,smoker,day,time,size
# data sample of csv (for value type reference): 16.99,1.01,Female,No,Sun,Dinner,2
sql = """
    CREATE TABLE tips (
        id INTEGER PRIMARY KEY,
        total_bill FLOAT,
        tip FLOAT,
        sex CHAR,
        smoker CHAR,
        day CHAR,
        time CHAR,
        size INT
    );
"""

try:
    with con:
        con.execute(sql)
except sqlite3.Error as e:
    print("couldn't create table: ", e)
else:
    print("Success: table created!")

Success: table created!


### Insert data

Insert data item using the data from the sample .csv file.

In [5]:
import csv

sql = """
    INSERT INTO
        tips
    VALUES
        (
            NULL,
            :total_bill,
            :tip,
            :sex,
            :smoker,
            :day,
            :time,
            :size
        )
    ;
"""

with open("data/tips.csv") as fd:
    reader = csv.DictReader(fd)
    for row in reader:
        # insert into table
        try:
            with con:
                con.execute(sql, row)
        except sqlite3.Error as e:
            print("couldn't insert data item: ", e)

print("Success: all data inserted!")

Success: all data inserted!


## READ

### 1. Retrieve the average tip percentage for each day of the week 

In [6]:
sql = """
    SELECT
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage,
        day
    FROM
        tips
    GROUP BY
        day
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for avg_tip_percentage, day in res.fetchall():
        data_point = {"day": day, "avg_tip_percentage": avg_tip_percentage}
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "avg_tip_percentage": 15.94
    },
    {
        "day": "Sat",
        "avg_tip_percentage": 14.64
    },
    {
        "day": "Sun",
        "avg_tip_percentage": 15.2
    },
    {
        "day": "Thur",
        "avg_tip_percentage": 15.67
    }
]


### 2. Find the maximum and minimum total bull amounts

In [7]:
sql = """
    SELECT
        MIN(total_bill) AS min_bill,
        MAX(total_bill) AS max_bill
    FROM
        tips
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    min_bill, max_bill = res.fetchone()
    data_point = {"min_bill": min_bill, "max_bill": max_bill}
    pretty_print(data_point)

{
    "min_bill": 3.07,
    "max_bill": 50.81
}


### 3. Count the number of parties for each size

In [8]:
sql = """
    SELECT
        size,
        COUNT(*) AS party_num
    FROM
        tips 
    GROUP BY
        size
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for size, num in res.fetchall():
        data_point = {"party_size": size, "num": num}
        data.append(data_point)

    pretty_print(data)

[
    {
        "party_size": 1,
        "num": 4
    },
    {
        "party_size": 2,
        "num": 156
    },
    {
        "party_size": 3,
        "num": 38
    },
    {
        "party_size": 4,
        "num": 37
    },
    {
        "party_size": 5,
        "num": 5
    },
    {
        "party_size": 6,
        "num": 4
    }
]


### 4. Retrieve the total bill and tip for parties of size 4 or more, where the tip percentage is greater than 15%

In [9]:
sql = """
    SELECT
        total_bill,
        tip
    FROM
        tips 
    WHERE
        size >=4
        AND (tip / total_bill) * 100 > 15
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for bill, tip in res.fetchall():
        data_point = {"total_bill": bill, "total_tip": tip}
        data.append(data_point)

    pretty_print(data)

[
    {
        "total_bill": 25.29,
        "total_tip": 4.71
    },
    {
        "total_bill": 18.43,
        "total_tip": 3.0
    },
    {
        "total_bill": 39.42,
        "total_tip": 7.58
    },
    {
        "total_bill": 30.4,
        "total_tip": 5.6
    },
    {
        "total_bill": 32.4,
        "total_tip": 6.0
    },
    {
        "total_bill": 25.56,
        "total_tip": 4.34
    },
    {
        "total_bill": 18.29,
        "total_tip": 3.76
    },
    {
        "total_bill": 29.93,
        "total_tip": 5.07
    },
    {
        "total_bill": 34.3,
        "total_tip": 6.7
    },
    {
        "total_bill": 27.05,
        "total_tip": 5.0
    },
    {
        "total_bill": 29.85,
        "total_tip": 5.14
    },
    {
        "total_bill": 21.5,
        "total_tip": 3.5
    },
    {
        "total_bill": 23.17,
        "total_tip": 6.5
    },
    {
        "total_bill": 20.69,
        "total_tip": 5.0
    },
    {
        "total_bill": 20.53,
        "total_tip": 4.

### 5. Retrieve the total bill, tip amount, and tip percentage for each combination of day and time, sorted by tip percentage in descending order

In [10]:
sql = """
    SELECT
        day,
        time,
        ROUND(SUM(total_bill), 2),
        ROUND(SUM(tip), 2),
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        day,
        time
    ORDER BY
        SUM(tip) / SUM(total_bill) DESC
    ;
"""


try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, time, bill, total_tip, tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "time": time,
            "total_bill": bill,
            "total_tip": total_tip,
            "tip_precentage": tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "time": "Lunch",
        "total_bill": 89.92,
        "total_tip": 16.68,
        "tip_precentage": 18.55
    },
    {
        "day": "Thur",
        "time": "Dinner",
        "total_bill": 18.78,
        "total_tip": 3.0,
        "tip_precentage": 15.97
    },
    {
        "day": "Thur",
        "time": "Lunch",
        "total_bill": 1077.55,
        "total_tip": 168.83,
        "tip_precentage": 15.67
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "total_bill": 1627.16,
        "total_tip": 247.39,
        "tip_precentage": 15.2
    },
    {
        "day": "Fri",
        "time": "Dinner",
        "total_bill": 235.96,
        "total_tip": 35.28,
        "tip_precentage": 14.95
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "total_bill": 1778.4,
        "total_tip": 260.4,
        "tip_precentage": 14.64
    }
]


### 6. Find the average tip percentage for each combination of day, time, and smoker status

In [11]:
sql = """
    SELECT
        day,
        time,
        smoker,
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        day,
        time,
        smoker
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, time, smoker, avg_tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "time": time,
            "smoker": smoker,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 14.3
    },
    {
        "day": "Fri",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 15.16
    },
    {
        "day": "Fri",
        "time": "Lunch",
        "smoker": "No",
        "avg_tip_percentage": 18.77
    },
    {
        "day": "Fri",
        "time": "Lunch",
        "smoker": "Yes",
        "avg_tip_percentage": 18.5
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 15.78
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 13.51
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 15.45
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 14.58
    },
    {
        "day": "Thur",
 

### 7. Retrieve the total bill, tip amount, and tip percentage for each sex, sorted by total bill in descending order, and limit the results to the top 5 records

In [12]:
sql = """
    SELECT 
        sex,
        ROUND(SUM(total_bill), 2),
        ROUND(SUM(tip), 2),
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS tip_percentage
    FROM 
        tips
    GROUP BY
        sex
    ORDER BY 
        SUM(total_bill) DESC
    LIMIT
        5
    ;
"""



try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for sex, total_bill, tip, tip_percentage in res.fetchall():
        data_point = {
            "sex": sex,
            "total_bill": total_bill,
            "tip": tip,
            "tip_percentage": tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "sex": "Male",
        "total_bill": 3256.82,
        "tip": 485.07,
        "tip_percentage": 14.89
    },
    {
        "sex": "Female",
        "total_bill": 1570.95,
        "tip": 246.51,
        "tip_percentage": 15.69
    }
]


### 8. Find the maximum and minimum tip percentage for each day and time combination, along with the corresponding total bill and tip amount

In [13]:
sql = """
    SELECT
        day,
        time,
        MIN(ROUND(tip / total_bill * 100, 2)) AS min_percentage,
        MAX(ROUND(tip / total_bill * 100, 2)) AS max_percentage,
        total_bill,
        tip
    FROM
        tips
    GROUP BY
        id
"""


try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, time, min_percentage, max_percentage, total_bill, tip in res.fetchall():
        data_point = {
            "day": day,
            "time": time,
            "min_tip_percentage": min_percentage,
            "max_tip_percentage": max_percentage,
            "total_bill": total_bill,
            "tip": tip,
        }

        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Sun",
        "time": "Dinner",
        "min_tip_percentage": 5.94,
        "max_tip_percentage": 5.94,
        "total_bill": 16.99,
        "tip": 1.01
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "min_tip_percentage": 16.05,
        "max_tip_percentage": 16.05,
        "total_bill": 10.34,
        "tip": 1.66
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "min_tip_percentage": 16.66,
        "max_tip_percentage": 16.66,
        "total_bill": 21.01,
        "tip": 3.5
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "min_tip_percentage": 13.98,
        "max_tip_percentage": 13.98,
        "total_bill": 23.68,
        "tip": 3.31
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "min_tip_percentage": 14.68,
        "max_tip_percentage": 14.68,
        "total_bill": 24.59,
        "tip": 3.61
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "min_tip_percentage

### 9. Retrieve the total bill, tip amount, and tip percentage for parties of size 4 or more, where the tip percentage is greater than 15%, and the total bill is between $50 and $100

In [14]:
sql = """
    SELECT
        total_bill,
        tip,
        ROUND(tip / total_bill * 100, 2) AS avg_tip_percentage
    FROM
        tips
    WHERE
        size >= 4
        AND total_bill BETWEEN 50 AND 100
        AND tip / total_bill * 100 > 15
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for bill, tip, tip_percentage in res.fetchall():
        data_point = {
            "bill": bill,
            "tip": tip,
            "tip_percentage": tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[]


### 10. Find the average tip percentage for each combination of day, time, and smoker status, but only include combinations with more than 5 records

In [15]:
sql = """
    SELECT
        day,
        time,
        smoker,
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        day,
        time,
        smoker
    HAVING
        COUNT(*) > 5
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, time, smoker, avg_tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "time": time,
            "smoker": smoker,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 15.16
    },
    {
        "day": "Fri",
        "time": "Lunch",
        "smoker": "Yes",
        "avg_tip_percentage": 18.5
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 15.78
    },
    {
        "day": "Sat",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 13.51
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "smoker": "No",
        "avg_tip_percentage": 15.45
    },
    {
        "day": "Sun",
        "time": "Dinner",
        "smoker": "Yes",
        "avg_tip_percentage": 14.58
    },
    {
        "day": "Thur",
        "time": "Lunch",
        "smoker": "No",
        "avg_tip_percentage": 15.62
    },
    {
        "day": "Thur",
        "time": "Lunch",
        "smoker": "Yes",
        "avg_tip_percentage": 15.79
    }
]


### 11. (additional queries) average daily bill and average tip percentage of each day

In [16]:
sql = """
    SELECT
        day,
        SUM(total_bill),
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        day
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for day, sum_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "day": day,
            "sum_total_bill": sum_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "day": "Fri",
        "sum_total_bill": 325.87999999999994,
        "avg_tip_percentage": 15.94
    },
    {
        "day": "Sat",
        "sum_total_bill": 1778.3999999999996,
        "avg_tip_percentage": 14.64
    },
    {
        "day": "Sun",
        "sum_total_bill": 1627.1600000000003,
        "avg_tip_percentage": 15.2
    },
    {
        "day": "Thur",
        "sum_total_bill": 1096.3299999999997,
        "avg_tip_percentage": 15.67
    }
]


### 12. (additional queries) average tip percentage of bills below (exclusive) $25 and above (inclusive) $25

In [17]:
sql = """
    SELECT
        total_bill >= 25 as bill_above_25,
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        total_bill >= 25
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for bill_above_25, avg_tip_percentage in res.fetchall():
        data_point = {
            "bill_above_25": bill_above_25,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "bill_above_25": 0,
        "avg_tip_percentage": 16.46
    },
    {
        "bill_above_25": 1,
        "avg_tip_percentage": 13.06
    }
]


### 13. (additional queries) average bill and average tip percentage of smoker and non-smoker

In [18]:
sql = """
    SELECT
        smoker,
        AVG(total_bill),
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        smoker
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for smoker, avg_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "smoker": smoker,
            "avg_total_bill": avg_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "smoker": "No",
        "avg_total_bill": 19.18827814569537,
        "avg_tip_percentage": 15.59
    },
    {
        "smoker": "Yes",
        "avg_total_bill": 20.756344086021507,
        "avg_tip_percentage": 14.5
    }
]


### 14. (additional queries) average bill and average tip percentage of male and female

In [19]:
sql = """
    SELECT
        sex,
        AVG(total_bill),
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        sex
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for sex, avg_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "sex": sex,
            "avg_total_bill": avg_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "sex": "Female",
        "avg_total_bill": 18.056896551724137,
        "avg_tip_percentage": 15.69
    },
    {
        "sex": "Male",
        "avg_total_bill": 20.744076433121034,
        "avg_tip_percentage": 14.89
    }
]


### 15. (additional queries) average bill and average tip percentage of different party size

In [20]:
sql = """
    SELECT
        size,
        AVG(total_bill),
        ROUND(SUM(tip) / SUM(total_bill) * 100, 2) AS avg_tip_percentage
    FROM
        tips
    GROUP BY
        size
    ;
"""

try:
    with con:
        res = con.execute(sql)
except sqlite3.Error as e:
    print("couldn't execute sql: ", e)
else:
    data = []

    for size, avg_total_bill, avg_tip_percentage in res.fetchall():
        data_point = {
            "party_size": size,
            "avg_total_bill": avg_total_bill,
            "avg_tip_percentage": avg_tip_percentage,
        }
        data.append(data_point)

    pretty_print(data)

[
    {
        "party_size": 1,
        "avg_total_bill": 7.2425,
        "avg_tip_percentage": 19.85
    },
    {
        "party_size": 2,
        "avg_total_bill": 16.448012820512833,
        "avg_tip_percentage": 15.7
    },
    {
        "party_size": 3,
        "avg_total_bill": 23.27763157894737,
        "avg_tip_percentage": 14.58
    },
    {
        "party_size": 4,
        "avg_total_bill": 28.61351351351351,
        "avg_tip_percentage": 14.45
    },
    {
        "party_size": 5,
        "avg_total_bill": 30.068,
        "avg_tip_percentage": 13.4
    },
    {
        "party_size": 6,
        "avg_total_bill": 34.83,
        "avg_tip_percentage": 15.0
    }
]


## UPDATE

It was determined that there was an error in the database. Please update the record that corresponds to id=10 and set smoker to Yes. 

In [21]:
sql = """
    UPDATE
        tips
    SET
        smoker='Yes'
    WHERE
        id=10
    ;
"""

try:
    with con:
        con.execute(sql)
except sqlite3.Error as e:
    print("couldn't update data: ", e)
else:
    print("Success: data updated!")

## DELETE

Delete records from the database that have a total bill that is less than $10. 

In [22]:
sql = """
    DELETE FROM
        tips
    WHERE
        total_bill < 10
    ;
"""

try:
    with con:
        con.execute(sql)
except sqlite3.Error as e:
    print("couldn't delete data: ", e)
else:
    print("Success: data deleted!")

## Close connection

In [23]:
# at the end, we need to close the connection
con.close()