# 3. Create a Single Table

In [1]:
import pandas as pd
import sqlite3

# Open a connection to the SQLite database that holds the shop tables
db_path = '../Data/online_shop.db'
conn = sqlite3.connect(db_path)

In [2]:
# Load every row of the transactions table and preview the first five
transactions_query = 'SELECT * FROM transactions'
transactions = pd.read_sql(transactions_query, conn)
transactions.head()

Unnamed: 0,customer,item_id,purchase_date
0,Ava,1011,4/1/23
1,Ava,1014,4/1/23
2,Ava,1015,4/15/23
3,Ava,1018,5/1/23
4,Ben,2345,4/15/23


In [3]:
# Load every row of the items table and preview the first five
items_query = 'SELECT * FROM items'
items = pd.read_sql(items_query, conn)
items.head()

Unnamed: 0,item_id,item_description,price,category,rating
0,1011,Paint,$15.99,Arts & Crafts,3.5
1,1012,Crayons,$2.87,Arts & Crafts,4.7
2,1013,Markers,$2.50,Arts & Crafts,4.8
3,1014,Brush,$1.99,Arts & Crafts,4.2
4,1015,Paper,$22.49,Arts & Crafts,4.5


In [4]:
# Merge the two tables: keep every transaction and attach its item attributes.
# validate='many_to_one' raises a MergeError if an item_id appears more than once
# in `items`, which would otherwise silently duplicate transaction rows and
# inflate the per-customer sums and counts computed later.
df = transactions.merge(items, how='left', on='item_id', validate='many_to_one')
df

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,4/1/23,Paint,$15.99,Arts & Crafts,3.5
1,Ava,1014,4/1/23,Brush,$1.99,Arts & Crafts,4.2
2,Ava,1015,4/15/23,Paper,$22.49,Arts & Crafts,4.5
3,Ava,1018,5/1/23,Scissors,$3.50,Arts & Crafts,4.6
4,Ben,2345,4/15/23,Dog Food,$29.99,Pet Supplies,4.9
...,...,...,...,...,...,...,...
105,Jenny,1011,4/20/23,Paint,$15.99,Arts & Crafts,3.5
106,Jenny,1014,4/20/23,Brush,$1.99,Arts & Crafts,4.2
107,Jenny,1018,4/20/23,Scissors,$3.50,Arts & Crafts,4.6
108,Jenny,1012,4/20/23,Crayons,$2.87,Arts & Crafts,4.7


# 4. Prepare Rows for Modeling

Goal: Predict which customers are most likely to buy dog food in June.  
Each row should contain data for one customer.  
y: June dog food purchases.  
x: aggregations based on April and May data.  

In [5]:
# Display the merged transactions + items frame (one row per purchase)
df

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,4/1/23,Paint,$15.99,Arts & Crafts,3.5
1,Ava,1014,4/1/23,Brush,$1.99,Arts & Crafts,4.2
2,Ava,1015,4/15/23,Paper,$22.49,Arts & Crafts,4.5
3,Ava,1018,5/1/23,Scissors,$3.50,Arts & Crafts,4.6
4,Ben,2345,4/15/23,Dog Food,$29.99,Pet Supplies,4.9
...,...,...,...,...,...,...,...
105,Jenny,1011,4/20/23,Paint,$15.99,Arts & Crafts,3.5
106,Jenny,1014,4/20/23,Brush,$1.99,Arts & Crafts,4.2
107,Jenny,1018,4/20/23,Scissors,$3.50,Arts & Crafts,4.6
108,Jenny,1012,4/20/23,Crayons,$2.87,Arts & Crafts,4.7


In [6]:
# Inspect the dtype of each column in the merged frame
df.dtypes

customer            object
item_id              int64
purchase_date       object
item_description    object
price               object
category            object
rating              object
dtype: object

In [7]:
# Convert the string columns to proper datetime / numeric dtypes.
# `assign` with bracket access builds the converted columns explicitly instead of
# setting them through attribute access on the frame.
df = df.assign(
    # Dates are stored as month/day/two-digit-year strings, e.g. '4/15/23'
    purchase_date=pd.to_datetime(df['purchase_date'], format='%m/%d/%y'),
    # regex=False: '$' is a regex end-of-string anchor, so strip it as a literal
    # character rather than relying on the version-dependent `regex` default
    price=pd.to_numeric(df['price'].str.replace('$', '', regex=False)),
    rating=pd.to_numeric(df['rating']),
)

In [8]:
# Re-inspect the column dtypes of the frame
df.dtypes

customer                    object
item_id                      int64
purchase_date       datetime64[ns]
item_description            object
price                      float64
category                    object
rating                     float64
dtype: object

In [9]:
# Create a subset of April and May data (the window the features are built from).
# Select the two months explicitly: `month < 6` would also admit any
# January–March rows, which are not part of the stated feature window.
df_april_may = df[df.purchase_date.dt.month.isin([4, 5])]
df_april_may.head()

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,2023-04-01,Paint,15.99,Arts & Crafts,3.5
1,Ava,1014,2023-04-01,Brush,1.99,Arts & Crafts,4.2
2,Ava,1015,2023-04-15,Paper,22.49,Arts & Crafts,4.5
3,Ava,1018,2023-05-01,Scissors,3.5,Arts & Crafts,4.6
4,Ben,2345,2023-04-15,Dog Food,29.99,Pet Supplies,4.9


In [10]:
# Keep only the rows whose purchase month is June
in_june = df['purchase_date'].dt.month == 6
df_june = df[in_june]
df_june.head()

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
6,Ben,2345,2023-06-15,Dog Food,29.99,Pet Supplies,4.9
12,Chloe,2345,2023-06-06,Dog Food,29.99,Pet Supplies,4.9
24,Blake,2545,2023-06-10,Cat Food,25.55,Pet Supplies,4.2
34,Maxwell,5674,2023-06-02,Video Game,30.99,Games & Toys,4.1
36,Isabel,3811,2023-06-15,Socks,7.5,Apparel,3.7


In [11]:
# Count each customer's June dog food purchases
is_dog_food = df_june['item_description'] == 'Dog Food'
dog_food_rows = df_june[is_dog_food]

# One count per customer, named so it becomes a readable column later
dog_food_counts = dog_food_rows.groupby('customer')['item_id'].count()
june_dog_food_purchases = dog_food_counts.rename('june dog food purchases')

june_dog_food_purchases

customer
Aiden     1
Ben       1
Calvin    1
Chloe     1
Lia       1
Lily      1
Olivia    1
Name: june dog food purchases, dtype: int64

In [12]:
# Sum each customer's spending over the April/May subset
spend_by_customer = df_april_may.groupby('customer')['price'].sum()
total_spend = spend_by_customer.rename('total_spend')
total_spend

customer
Aiden       222.16
Ava          43.97
Ben          44.19
Bennett      27.73
Blake        25.55
Calvin       29.99
Chloe        36.33
Daniel       17.46
Evelyn       66.19
Gavin        39.47
Henry       112.42
Isabel        2.79
Jenny        49.34
Kate         83.25
Lia          78.95
Lily         69.31
Madeline    122.63
Margaret      7.99
Maxwell      78.31
Nolan        67.51
Olivia       68.03
Sophie        2.57
Name: total_spend, dtype: float64

In [13]:
# Combine the two per-customer series side by side: one row per customer
customer_columns = pd.concat([june_dog_food_purchases, total_spend], axis=1)

# Customers missing from either series get 0, then the index becomes a column
model_df = (
    customer_columns
    .fillna(0)
    .reset_index()
    .rename(columns={'index': 'customer'})
)

model_df

Unnamed: 0,customer,june dog food purchases,total_spend
0,Aiden,1.0,222.16
1,Ben,1.0,44.19
2,Calvin,1.0,29.99
3,Chloe,1.0,36.33
4,Lia,1.0,78.95
5,Lily,1.0,69.31
6,Olivia,1.0,68.03
7,Ava,0.0,43.97
8,Bennett,0.0,27.73
9,Blake,0.0,25.55
