## Creating a single table

- We can use either appending or joining methods to create tables

#### Appending
- We use pd.concat([df1,df2]) where df1,df2 are dataframes
- Append stacks the datasets verticaly. Data must have identical columns

In [63]:
# Importing the necessary libraries
import pandas as pd

In [64]:
# Loading the needed datasets
canada = pd.read_csv('Data/happiness_data_ca.txt')
mexico = pd.read_csv('Data/happiness_data_mx.txt')
usa = pd.read_csv('Data/happiness_data_us.txt')
costa = pd.read_csv('Data/happiness_data_cr.txt')

print("Data loaded sucessfully")

Data loaded sucessfully


In [65]:
# Displaying the canada dataset
canada.head(5)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
0,Canada,2005,7.418048,0.961552,0.957306,70.5
1,Canada,2007,7.481753,,0.930341,70.620003
2,Canada,2008,7.485604,0.938707,0.926315,70.68
3,Canada,2009,7.487824,0.942845,0.915058,70.739998
4,Canada,2010,7.650346,0.953765,0.933949,70.800003


In [66]:
# Displaying the mexico dataset
mexico.head(5)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
0,Mexico,2005,6.580658,0.902808,0.813745,64.400002
1,Mexico,2007,6.525378,0.878806,0.67043,64.68
2,Mexico,2008,6.829036,0.876328,0.677477,64.82
3,Mexico,2009,6.962819,0.868221,0.682463,64.959999
4,Mexico,2010,6.802389,0.87639,0.778121,65.099998


In [67]:
# Displaying the usa dataset
usa.head(5)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices
0,United States,2006,7.181794,0.964572,0.911496
1,United States,2007,7.512688,,0.871904
2,United States,2008,7.280386,0.952587,0.877956
3,United States,2009,7.158032,0.911794,0.830684
4,United States,2010,7.163616,0.926159,0.828044


In [68]:
# Displaying the costa rica dataset
costa.head(5)

Unnamed: 0,nombre del país,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
0,Costa Rica,2006,7.082465,0.936938,0.88242,68.559998
1,Costa Rica,2007,7.432132,0.917678,0.922736,68.720001
2,Costa Rica,2008,6.85068,0.915759,0.912006,68.879997
3,Costa Rica,2009,7.614929,0.899782,0.886061,69.040001
4,Costa Rica,2010,7.271054,0.915141,0.88103,69.199997


In [69]:
# Appending the canada and the usa data
pd.concat([canada,usa]).head(5)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
0,Canada,2005,7.418048,0.961552,0.957306,70.5
1,Canada,2007,7.481753,,0.930341,70.620003
2,Canada,2008,7.485604,0.938707,0.926315,70.68
3,Canada,2009,7.487824,0.942845,0.915058,70.739998
4,Canada,2010,7.650346,0.953765,0.933949,70.800003


In [70]:
# Since the usa didn't have the healthy life expectancy at birth the column is appended as NaN
pd.concat([canada,usa]).tail(5)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
11,United States,2017,6.991759,0.921003,0.868497,
12,United States,2018,6.882685,0.903856,0.824607,
13,United States,2019,6.943701,0.916691,0.836139,
14,United States,2020,7.028088,0.93737,0.850447,
15,United States,2021,6.959088,0.920009,0.815735,


In [71]:
# Appending the mexico and costa rica datasets
# The nombre del pais is the country name so we can rename that column
pd.concat([mexico,costa]).head(5)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth,nombre del país
0,Mexico,2005,6.580658,0.902808,0.813745,64.400002,
1,Mexico,2007,6.525378,0.878806,0.67043,64.68,
2,Mexico,2008,6.829036,0.876328,0.677477,64.82,
3,Mexico,2009,6.962819,0.868221,0.682463,64.959999,
4,Mexico,2010,6.802389,0.87639,0.778121,65.099998,


In [72]:
# The country name for costa rica is a nan
pd.concat([mexico,costa]).tail(5)

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth,nombre del país
11,,2017,7.225182,0.921697,0.935618,70.0,Costa Rica
12,,2018,7.141075,0.875872,0.941888,70.0,Costa Rica
13,,2019,6.997619,0.906077,0.92683,70.0,Costa Rica
14,,2020,6.338472,0.8342,0.8894,70.0,Costa Rica
15,,2021,6.408448,0.876052,0.886652,70.0,Costa Rica


In [73]:
# Renaming the costa rica country name
df = pd.concat([mexico,canada,costa.rename(columns = {'nombre del país':'country_name'})])
df

Unnamed: 0,country_name,year,happiness_score,social_support,freedom_to_make_life_choices,healthy_life_expectancy_at_birth
0,Mexico,2005,6.580658,0.902808,0.813745,64.400002
1,Mexico,2007,6.525378,0.878806,0.67043,64.68
2,Mexico,2008,6.829036,0.876328,0.677477,64.82
3,Mexico,2009,6.962819,0.868221,0.682463,64.959999
4,Mexico,2010,6.802389,0.87639,0.778121,65.099998
5,Mexico,2011,6.909515,0.824064,0.831368,65.239998
6,Mexico,2012,7.320185,0.767279,0.787768,65.379997
7,Mexico,2013,7.442546,0.759138,0.738717,65.519997
8,Mexico,2014,6.679831,0.781965,0.779133,65.660004
9,Mexico,2015,6.236287,0.760614,0.719466,65.800003


### Merge
- Tables to be joined must have identical columns
- We use df1.merge(df2,how,left_on,right_on) where df1,df2 are the left and right datasets respectively. how specifies the type of join and left_on and right_on specifies the columns to be merged on.

In [74]:
# Loading the data
sales_may = pd.read_excel('Data/Sales Tables.xlsx', sheet_name=0)
sales_june = pd.read_excel('Data/Sales Tables.xlsx', sheet_name=1)
regions = pd.read_excel('Data/Sales Tables.xlsx', sheet_name=2)

print("Data is loaded successfuly")

Data is loaded successfuly


In [75]:
# left dataset
sales_may.head()

Unnamed: 0,date,store,sales
0,2022-05-01,1,341
1,2022-05-01,2,291
2,2022-05-01,3,493
3,2022-05-01,4,428
4,2022-05-01,5,152


In [76]:
# right dataset
sales_june.head()

Unnamed: 0,date,store,sales
0,2022-06-01,1,67
1,2022-06-01,2,144
2,2022-06-01,3,226
3,2022-06-01,4,397
4,2022-06-01,5,163


In [77]:
# Regions
regions

Unnamed: 0,store,region
0,2,North
1,3,East
2,4,West
3,8,South


In [78]:
# merging the dataset of may and regions
sales_may.merge(regions,how='left',left_on='store',right_on='store')

Unnamed: 0,date,store,sales,region
0,2022-05-01,1,341,
1,2022-05-01,2,291,North
2,2022-05-01,3,493,East
3,2022-05-01,4,428,West
4,2022-05-01,5,152,


In [79]:
# merging the datset of june and regions
# When we refuse to specify the columns we're using the default type of join is an inner join
sales_june.merge(regions)

Unnamed: 0,date,store,sales,region
0,2022-06-01,2,144,North
1,2022-06-01,3,226,East
2,2022-06-01,4,397,West


In [80]:
# When we know the columns we want to merge with we can use them instead of specifying them twice
sales_may.merge(regions,how='outer',on='store')

Unnamed: 0,date,store,sales,region
0,2022-05-01,1,341.0,
1,2022-05-01,2,291.0,North
2,2022-05-01,3,493.0,East
3,2022-05-01,4,428.0,West
4,2022-05-01,5,152.0,
5,NaT,8,,South


In [81]:
# Performing a right join
sales_june.merge(regions,how='right',on='store')

Unnamed: 0,date,store,sales,region
0,2022-06-01,2,144.0,North
1,2022-06-01,3,226.0,East
2,2022-06-01,4,397.0,West
3,NaT,8,,South


### Creating a single table and Join types

In [82]:
# Loading in the data
import sqlite3
conn = sqlite3.connect('Data/online_shop.db')

print("Connection created successfuly")

Connection created successfuly


In [83]:
# viewing the transactions table
transactions = pd.read_sql('select * from transactions',conn)
transactions.head()

Unnamed: 0,customer,item_id,purchase_date
0,Ava,1011,4/1/23
1,Ava,1014,4/1/23
2,Ava,1015,4/15/23
3,Ava,1018,5/1/23
4,Ben,2345,4/15/23


In [84]:
# viewing the items table
items = pd.read_sql('select * from items',conn)
items.head()

Unnamed: 0,item_id,item_description,price,category,rating
0,1011,Paint,$15.99,Arts & Crafts,3.5
1,1012,Crayons,$2.87,Arts & Crafts,4.7
2,1013,Markers,$2.50,Arts & Crafts,4.8
3,1014,Brush,$1.99,Arts & Crafts,4.2
4,1015,Paper,$22.49,Arts & Crafts,4.5


In [85]:
# Merging the tables to form a single table called items transactions
items_transactions = transactions.merge(items,how='left',on='item_id')
items_transactions.tail()

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
105,Jenny,1011,4/20/23,Paint,$15.99,Arts & Crafts,3.5
106,Jenny,1014,4/20/23,Brush,$1.99,Arts & Crafts,4.2
107,Jenny,1018,4/20/23,Scissors,$3.50,Arts & Crafts,4.6
108,Jenny,1012,4/20/23,Crayons,$2.87,Arts & Crafts,4.7
109,Jenny,1015,4/20/23,Paper,$22.49,Arts & Crafts,4.5


In [86]:
# Performing an inner merge
items_transactions = transactions.merge(items,how='inner',on='item_id')
items_transactions.head()

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,4/1/23,Paint,$15.99,Arts & Crafts,3.5
1,Bennett,1011,4/25/23,Paint,$15.99,Arts & Crafts,3.5
2,Gavin,1011,4/6/23,Paint,$15.99,Arts & Crafts,3.5
3,Jenny,1011,4/20/23,Paint,$15.99,Arts & Crafts,3.5
4,Ava,1014,4/1/23,Brush,$1.99,Arts & Crafts,4.2


In [87]:
# Performing a right merge
items_transactions = transactions.merge(items,how='right',on='item_id')
items_transactions.head()

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,4/1/23,Paint,$15.99,Arts & Crafts,3.5
1,Bennett,1011,4/25/23,Paint,$15.99,Arts & Crafts,3.5
2,Gavin,1011,4/6/23,Paint,$15.99,Arts & Crafts,3.5
3,Jenny,1011,4/20/23,Paint,$15.99,Arts & Crafts,3.5
4,Chloe,1012,5/2/23,Crayons,$2.87,Arts & Crafts,4.7


In [88]:
# Performing an outer merge
df = transactions.merge(items,how='outer',on='item_id')
df.head()

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,4/1/23,Paint,$15.99,Arts & Crafts,3.5
1,Bennett,1011,4/25/23,Paint,$15.99,Arts & Crafts,3.5
2,Gavin,1011,4/6/23,Paint,$15.99,Arts & Crafts,3.5
3,Jenny,1011,4/20/23,Paint,$15.99,Arts & Crafts,3.5
4,Ava,1014,4/1/23,Brush,$1.99,Arts & Crafts,4.2


### Preparing rows for modeling

### goal: predict which customers are most likely to buy dog food in june
### each row should contain data for one customer
### y: june dog food purchases
### x: aggregation based on april and may data

In [89]:
# Checking the data types
df.dtypes

customer            object
item_id              int64
purchase_date       object
item_description    object
price               object
category            object
rating              object
dtype: object

In [90]:
# converting the objects into the correct data types
df.purchase_date = pd.to_datetime(df.purchase_date) 
df.price = pd.to_numeric(df.price.str.replace('$',''))
df.rating = pd.to_numeric(df.rating)

print('Data converted successfully')

Data converted successfully


  df.price = pd.to_numeric(df.price.str.replace('$',''))


In [91]:
# confirming the data types
df.dtypes

customer                    object
item_id                      int64
purchase_date       datetime64[ns]
item_description            object
price                      float64
category                    object
rating                     float64
dtype: object

In [92]:
# creating a subset of april and may data
april_may = df[df.purchase_date.dt.month < 6].reset_index(drop=True)
april_may

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,2023-04-01,Paint,15.99,Arts & Crafts,3.5
1,Bennett,1011,2023-04-25,Paint,15.99,Arts & Crafts,3.5
2,Gavin,1011,2023-04-06,Paint,15.99,Arts & Crafts,3.5
3,Jenny,1011,2023-04-20,Paint,15.99,Arts & Crafts,3.5
4,Ava,1014,2023-04-01,Brush,1.99,Arts & Crafts,4.2
...,...,...,...,...,...,...,...
88,Lia,3875,2023-04-28,Dress,24.48,Apparel,4.1
89,Lia,3875,2023-05-20,Dress,24.48,Apparel,4.1
90,Lily,2342,2023-04-12,Dog Toy,9.99,Pet Supplies,3.4
91,Lily,2342,2023-05-25,Dog Toy,9.99,Pet Supplies,3.4


In [93]:
# creating a subset of june data
june = df[df.purchase_date.dt.month == 6].reset_index()
june

Unnamed: 0,index,customer,item_id,purchase_date,item_description,price,category,rating
0,19,Ben,2345,2023-06-15,Dog Food,29.99,Pet Supplies,4.9
1,20,Chloe,2345,2023-06-06,Dog Food,29.99,Pet Supplies,4.9
2,22,Olivia,2345,2023-06-01,Dog Food,29.99,Pet Supplies,4.9
3,27,Aiden,2345,2023-06-04,Dog Food,29.99,Pet Supplies,4.9
4,29,Lia,2345,2023-06-20,Dog Food,29.99,Pet Supplies,4.9
5,31,Calvin,2345,2023-06-16,Dog Food,29.99,Pet Supplies,4.9
6,33,Lily,2345,2023-06-18,Dog Food,29.99,Pet Supplies,4.9
7,38,Isabel,3811,2023-06-15,Socks,7.5,Apparel,3.7
8,40,Margaret,3811,2023-06-17,Socks,7.5,Apparel,3.7
9,43,Henry,3811,2023-06-17,Socks,7.5,Apparel,3.7


In [94]:
# Getting dog food item description and save it as june food
juneFood = june[june.item_description == 'Dog Food']
juneFood

Unnamed: 0,index,customer,item_id,purchase_date,item_description,price,category,rating
0,19,Ben,2345,2023-06-15,Dog Food,29.99,Pet Supplies,4.9
1,20,Chloe,2345,2023-06-06,Dog Food,29.99,Pet Supplies,4.9
2,22,Olivia,2345,2023-06-01,Dog Food,29.99,Pet Supplies,4.9
3,27,Aiden,2345,2023-06-04,Dog Food,29.99,Pet Supplies,4.9
4,29,Lia,2345,2023-06-20,Dog Food,29.99,Pet Supplies,4.9
5,31,Calvin,2345,2023-06-16,Dog Food,29.99,Pet Supplies,4.9
6,33,Lily,2345,2023-06-18,Dog Food,29.99,Pet Supplies,4.9


In [95]:
# creating a column of june dog food purchase
juneFoodPurchase = juneFood.groupby('customer')['item_id'].count()
juneFoodPurchase

customer
Aiden     1
Ben       1
Calvin    1
Chloe     1
Lia       1
Lily      1
Olivia    1
Name: item_id, dtype: int64

In [96]:
# Changing the column name to juneFoodPurchase from item_id
juneFoodPurchase = juneFood.groupby('customer')['item_id'].count().rename('juneFoodPurchase')
juneFoodPurchase

customer
Aiden     1
Ben       1
Calvin    1
Chloe     1
Lia       1
Lily      1
Olivia    1
Name: juneFoodPurchase, dtype: int64

In [97]:
april_may.head(5)

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,2023-04-01,Paint,15.99,Arts & Crafts,3.5
1,Bennett,1011,2023-04-25,Paint,15.99,Arts & Crafts,3.5
2,Gavin,1011,2023-04-06,Paint,15.99,Arts & Crafts,3.5
3,Jenny,1011,2023-04-20,Paint,15.99,Arts & Crafts,3.5
4,Ava,1014,2023-04-01,Brush,1.99,Arts & Crafts,4.2


In [98]:
# Checking how much each customer spent in april and may
total_spend = april_may.groupby('customer')['price'].sum()
total_spend

customer
Aiden       222.16
Ava          43.97
Ben          44.19
Bennett      27.73
Blake        25.55
Calvin       29.99
Chloe        36.33
Daniel       17.46
Evelyn       66.19
Gavin        39.47
Henry       112.42
Isabel        2.79
Jenny        49.34
Kate         83.25
Lia          78.95
Lily         69.31
Madeline    122.63
Margaret      7.99
Maxwell      78.31
Nolan        67.51
Olivia       68.03
Sophie        2.57
Name: price, dtype: float64

In [99]:
# Renaming the column to total spend from price
total_spend.rename('total_spend')

customer
Aiden       222.16
Ava          43.97
Ben          44.19
Bennett      27.73
Blake        25.55
Calvin       29.99
Chloe        36.33
Daniel       17.46
Evelyn       66.19
Gavin        39.47
Henry       112.42
Isabel        2.79
Jenny        49.34
Kate         83.25
Lia          78.95
Lily         69.31
Madeline    122.63
Margaret      7.99
Maxwell      78.31
Nolan        67.51
Olivia       68.03
Sophie        2.57
Name: total_spend, dtype: float64

In [100]:
# Each row represents a customer
model_df = pd.concat([juneFoodPurchase,total_spend],axis=1)
model_df

Unnamed: 0_level_0,juneFoodPurchase,price
customer,Unnamed: 1_level_1,Unnamed: 2_level_1
Aiden,1.0,222.16
Ben,1.0,44.19
Calvin,1.0,29.99
Chloe,1.0,36.33
Lia,1.0,78.95
Lily,1.0,69.31
Olivia,1.0,68.03
Ava,,43.97
Bennett,,27.73
Blake,,25.55


In [101]:
# Filling the missing values with zero
model_df.fillna(0).reset_index()

Unnamed: 0,customer,juneFoodPurchase,price
0,Aiden,1.0,222.16
1,Ben,1.0,44.19
2,Calvin,1.0,29.99
3,Chloe,1.0,36.33
4,Lia,1.0,78.95
5,Lily,1.0,69.31
6,Olivia,1.0,68.03
7,Ava,0.0,43.97
8,Bennett,0.0,27.73
9,Blake,0.0,25.55


## Preparing columns for modeling

In [102]:
model_df.head(5)

Unnamed: 0_level_0,juneFoodPurchase,price
customer,Unnamed: 1_level_1,Unnamed: 2_level_1
Aiden,1.0,222.16
Ben,1.0,44.19
Calvin,1.0,29.99
Chloe,1.0,36.33
Lia,1.0,78.95


In [103]:
# changing the category column into numeric
april_may.head()

Unnamed: 0,customer,item_id,purchase_date,item_description,price,category,rating
0,Ava,1011,2023-04-01,Paint,15.99,Arts & Crafts,3.5
1,Bennett,1011,2023-04-25,Paint,15.99,Arts & Crafts,3.5
2,Gavin,1011,2023-04-06,Paint,15.99,Arts & Crafts,3.5
3,Jenny,1011,2023-04-20,Paint,15.99,Arts & Crafts,3.5
4,Ava,1014,2023-04-01,Brush,1.99,Arts & Crafts,4.2


In [104]:
aprilMay_dummies = pd.get_dummies(april_may.category)
aprilMay_dummies

Unnamed: 0,Apparel,Arts & Crafts,Games & Toys,Personal Care,Pet Supplies
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
88,1,0,0,0,0
89,1,0,0,0,0
90,0,0,0,0,1
91,0,0,0,0,1


In [105]:
# Combinig the tables customer data
pd.concat([april_may.customer,aprilMay_dummies],axis=1)

Unnamed: 0,customer,Apparel,Arts & Crafts,Games & Toys,Personal Care,Pet Supplies
0,Ava,0,1,0,0,0
1,Bennett,0,1,0,0,0
2,Gavin,0,1,0,0,0
3,Jenny,0,1,0,0,0
4,Ava,0,1,0,0,0
...,...,...,...,...,...,...
88,Lia,1,0,0,0,0
89,Lia,1,0,0,0,0
90,Lily,0,0,0,0,1
91,Lily,0,0,0,0,1


In [106]:
# Grouping the combined table using customers
customers = pd.concat([april_may.customer,aprilMay_dummies],axis=1).groupby('customer').sum().reset_index()
customers.sample(5)

Unnamed: 0,customer,Apparel,Arts & Crafts,Games & Toys,Personal Care,Pet Supplies
8,Evelyn,6,0,0,0,0
6,Chloe,3,2,0,0,0
16,Madeline,6,1,0,0,0
18,Maxwell,1,0,3,0,0
12,Jenny,0,6,0,0,0


In [112]:
# last purchase in the month of april may
last_purchase = april_may.groupby('customer')['purchase_date'].max()
last_purchase

customer
Aiden      2023-05-19
Ava        2023-05-01
Ben        2023-04-20
Bennett    2023-04-25
Blake      2023-05-10
Calvin     2023-05-16
Chloe      2023-05-04
Daniel     2023-04-05
Evelyn     2023-05-15
Gavin      2023-05-04
Henry      2023-05-11
Isabel     2023-04-23
Jenny      2023-04-20
Kate       2023-05-07
Lia        2023-05-20
Lily       2023-05-25
Madeline   2023-05-24
Margaret   2023-04-03
Maxwell    2023-05-15
Nolan      2023-05-18
Olivia     2023-05-20
Sophie     2023-05-22
Name: purchase_date, dtype: datetime64[ns]

In [114]:
# Noting a days date
today = pd.Series(pd.to_datetime('2023-06-01'),index=last_purchase.index)
today

customer
Aiden      2023-06-01
Ava        2023-06-01
Ben        2023-06-01
Bennett    2023-06-01
Blake      2023-06-01
Calvin     2023-06-01
Chloe      2023-06-01
Daniel     2023-06-01
Evelyn     2023-06-01
Gavin      2023-06-01
Henry      2023-06-01
Isabel     2023-06-01
Jenny      2023-06-01
Kate       2023-06-01
Lia        2023-06-01
Lily       2023-06-01
Madeline   2023-06-01
Margaret   2023-06-01
Maxwell    2023-06-01
Nolan      2023-06-01
Olivia     2023-06-01
Sophie     2023-06-01
dtype: datetime64[ns]

In [115]:
# Finding the number of days between
days_btwn = (today - last_purchase).dt.days.rename('days between')
days_btwn

customer
Aiden       13
Ava         31
Ben         42
Bennett     37
Blake       22
Calvin      16
Chloe       28
Daniel      57
Evelyn      17
Gavin       28
Henry       21
Isabel      39
Jenny       42
Kate        25
Lia         12
Lily         7
Madeline     8
Margaret    59
Maxwell     17
Nolan       14
Olivia      12
Sophie      10
Name: days between, dtype: int64

In [116]:
# adding the column to the model_df
days_btwn.reset_index()

Unnamed: 0,customer,days between
0,Aiden,13
1,Ava,31
2,Ben,42
3,Bennett,37
4,Blake,22
5,Calvin,16
6,Chloe,28
7,Daniel,57
8,Evelyn,17
9,Gavin,28


In [119]:
model_df = model_df.merge(days_btwn,how='left',on='customer')
model_df.loc[:,~model_df.columns.duplicated()]

Unnamed: 0,customer,juneFoodPurchase,price,Apparel_x,Arts & Crafts_x,Games & Toys_x,Personal Care_x,Pet Supplies_x,Apparel_y,Arts & Crafts_y,Games & Toys_y,Personal Care_y,Pet Supplies_y,days between_x,days between_y,days between
0,Aiden,1.0,222.16,0,0,0,0,8,0,0,0,0,8,13,13,13
1,Ben,1.0,44.19,0,0,0,0,2,0,0,0,0,2,42,42,42
2,Calvin,1.0,29.99,0,0,0,0,1,0,0,0,0,1,16,16,16
3,Chloe,1.0,36.33,3,2,0,0,0,3,2,0,0,0,28,28,28
4,Lia,1.0,78.95,2,0,0,0,1,2,0,0,0,1,12,12,12
5,Lily,1.0,69.31,0,0,2,0,4,0,0,2,0,4,7,7,7
6,Olivia,1.0,68.03,1,1,0,0,2,1,1,0,0,2,12,12,12
7,Ava,,43.97,0,4,0,0,0,0,4,0,0,0,31,31,31
8,Bennett,,27.73,0,5,0,0,0,0,5,0,0,0,37,37,37
9,Blake,,25.55,0,0,0,0,1,0,0,0,0,1,22,22,22


In [123]:
model_df.shape

(22, 14)

In [127]:
# Removing duplicates
dups = ['Apparel_x','Arts & Crafts_x','Games & Toys_x','Personal Care_x','Pet Supplies_x']
model_df.drop(columns=dups,inplace=True)

In [129]:
# Merging with customers data
model_df = model_df.merge(customers, how='left',on ='customer')
model_df.head()

Unnamed: 0,customer,juneFoodPurchase,price,days between,Apparel,Arts & Crafts,Games & Toys,Personal Care,Pet Supplies
0,Aiden,1.0,222.16,13,0,0,0,0,8
1,Ben,1.0,44.19,42,0,0,0,0,2
2,Calvin,1.0,29.99,16,0,0,0,0,1
3,Chloe,1.0,36.33,28,3,2,0,0,0
4,Lia,1.0,78.95,12,2,0,0,0,1
