# Lego Ebay Project

In [22]:
import pandas as pd
import numpy as np
import sqlite3 as sql
import matplotlib.pyplot as plt
%matplotlib inline

# **********   INGEST   **********
Create a connection to the database and save SQL queries as variables

In [23]:
# Create connection to the lego.db SQLite database.
# NOTE(review): this is an absolute path on one machine; anyone else running the
# notebook has to edit it. Consider a path relative to the notebook or a config value.
database = "C:\\Users\\zubaz\\Documents\\Python\\EbayLegoWebscrape\\lego.db"
# The connection is passed to the read_sql_query calls below. No close() is visible
# in this notebook -- TODO confirm whether it should be closed once the data is loaded.
connection = sql.connect(database)

In [24]:
# 2 queries for 2 tables, one with price data, one with set metadata
query1 = '''SELECT set_num, date, price
            FROM ebay_prices'''
query2 = '''SELECT * FROM set_details'''

### Create dataframe for query1
This dataframe contains the ebay prices data

In [25]:
df = pd.read_sql_query(query1, connection)
df.head()

Unnamed: 0,set_num,date,price
0,10256,02-01-2022,20.0
1,70147,01-07-2022,158.99
2,70222,01-07-2022,119.99
3,70223,01-07-2022,184.98
4,41052,01-07-2022,185.98


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64258 entries, 0 to 64257
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   set_num  64258 non-null  object
 1   date     64258 non-null  object
 2   price    64258 non-null  object
dtypes: object(3)
memory usage: 1.5+ MB


In [27]:
# Count the distinct set numbers that appear in the price data
num_of_sets = df['set_num'].unique().size
print(f"There is data for {num_of_sets} Lego sets")

There is data for 1708 Lego sets


### Create a dataframe for query2
This dataframe contains the set dimension data

In [28]:
df_set = pd.read_sql_query(query2, connection)
# df_set.head()

In [29]:
# Convert the date column from text to datetime.
# The raw values are month-day-year strings: "01-07-2022" has to be 7 January,
# because the data only spans January to February 2022. Stating the format keeps
# the parse unambiguous and makes any row in a different layout fail loudly
# instead of being silently read day-first.
df['date'] = pd.to_datetime(df['date'], format='%m-%d-%Y')

In [30]:
# Work out the first and last dates in the price data and the span between them
oldest_date, recent_date = df['date'].min(), df['date'].max()
date_difference = recent_date - oldest_date
print(oldest_date, recent_date, date_difference, sep='\n')

2022-01-07 00:00:00
2022-02-17 00:00:00
41 days 00:00:00


# **********  EXPLORE & CLEAN  **********

## Price Dataframe
Examine the 'price' column and do some cleaning

In [31]:
# Check whether every value in the price column is a whole number.
# The column is object dtype and contains text entries (e.g. "3.29 to 4.98") next to
# numeric ones, so guard the .is_integer() call with a type check -- calling it on a
# str raises AttributeError as soon as a text value is reached.
all(isinstance(x, (int, float)) and float(x).is_integer() for x in df['price'])

False

In [32]:
print(df[pd.to_numeric(df['price'], errors='coerce').isnull()])

      set_num       date            price
29      75523 2022-01-19  25.79 to 135.81
41      10235 2022-02-06     3.29 to 4.98
45      10695 2022-01-21  16.37 to 122.89
104     21316 2022-01-11    9.89 to 13.89
118     70321 2022-02-02  20.25 to 282.98
...       ...        ...              ...
63490   75827 2022-01-07         1,168.39
63742   75192 2022-02-10         1,168.83
63787   10251 2022-01-22         1,286.32
63851   21137 2022-01-22         1,749.99
64017   10251 2022-02-06         1,081.17

[483 rows x 3 columns]


In [33]:
# Prices of 1,000 or more carry a thousands separator (e.g. "1,168.39"), which
# stops them from parsing as numbers. Remove the commas.
# regex=True makes Series.replace substitute inside each string rather than only
# matching cells that are exactly ","; non-string cells are left as they are.
df['price'] = df['price'].replace(',','', regex=True)

In [34]:
# check to see what the rest of the non numeric values look like
# print(df['price'] [pd.to_numeric(df['price'], errors='coerce').isnull()])

In [35]:
df.shape

(64258, 3)

In [36]:
# Collect the index labels of rows whose price is a range such as "3.29 to 4.98".
# A range is not a single sale price and these listings are probably not
# representative, so they are earmarked for removal.
# na=False counts cells that are not text as "no match".
remove_rows = df.index[df['price'].str.contains("to", na=False)]

print(remove_rows)

Int64Index([   29,    41,    45,   104,   118,  2787,  2838,  3108, 10432,
            10437, 10536, 10537, 10538, 10539, 10540, 10541, 13288, 13289,
            13290, 13291, 16082, 17027, 23569, 23580, 25771, 25878, 28784,
            32054, 32740, 34813, 37714, 40537, 44719, 46594, 46599, 48704,
            48744, 48751, 48761, 48772, 50257, 50564, 50619, 50622, 50624,
            50626, 50628, 50629, 50658, 50716, 50717, 50718, 50719, 50720,
            50721, 50722, 50723, 50728, 50729, 50732, 50733, 50734, 50735,
            50736, 50737, 50738, 50739, 50740, 50742, 50760, 50773, 52964,
            57801, 57803, 57804, 58099, 59667],
           dtype='int64')


In [37]:
# Remove the price-range rows collected in remove_rows.
# Rebinding df (instead of inplace=True) and ignoring labels that are already gone
# lets this cell be re-run without raising KeyError.
df = df.drop(index=remove_rows, errors='ignore')

In [38]:
df.shape

(64181, 3)

In [39]:
print(df[pd.to_numeric(df['price'], errors='coerce').isnull()])

Empty DataFrame
Columns: [set_num, date, price]
Index: []


In [40]:
# checks price column to make sure all rows are numeric
pd.to_numeric(df['price'], errors='coerce').notnull().all()

True

In [41]:
df.dtypes

set_num            object
date       datetime64[ns]
price              object
dtype: object

In [42]:
# now that all rows in price column are numeric, change column data type to numeric
# pandas will pick int64 if there are no decimals, float 64 if decimals are present, which there are
df['price'] = pd.to_numeric(df['price'])
df.dtypes

set_num            object
date       datetime64[ns]
price             float64
dtype: object

## Metadata Dataframe

In [43]:
# time to clean the set_details data
df_set.head()

Unnamed: 0,set_num,set_name,theme_group,theme,subtheme,year_released,launch_exit,pieces,minifigs,designer,msrp,age_range,packaging,availability,rating
0,60009-1,Helicopter Arrest,Modern day,City,Police,2013,,352.0,5,Henrik Andersen,£39.99 / $49.99,5 - 12,Box,Retail - limited,✭✭✭✭✩ 3.8 52 ratings
1,60012-1,Coast Guard 4x4 & Diving Boat,Modern day,City,Coast Guard,2013,01 Jun 2013 - 31 Jul 2015,128.0,2 (2 Unique to this set),,£9.99 / $19.99 / 12.99€,5 - 12,Box,Retail,✭✭✭✭✩ 3.8 145 ratings 4 Reviews Official Brick...
2,75975-1,Watchpoint: Gibraltar,Licensed,Overwatch,,2019,01 Jan 2019 - 31 Jul 2020,730.0,4 (3 Unique to this set),Mark Stafford,£79.99 / $89.99 / 87.72€,9+,Box,Retail,✭✭✭✭✩ 3.9 114 ratings 1 Review Official Bricks...
3,10155-1,Maersk Line Container Ship,Model making,Creator Expert,Maersk,2010,01 Aug 2010 - 31 Jul 2011,990.0,,,£102.99 / $119.99,8+,Box,LEGO exclusive,✭✭✭✭✩ 4.1 28 ratings 3 Reviews
4,10210-1,Imperial Flagship,Model making,Creator Expert,Miscellaneous,2010,01 Jan 2010 - 31 Dec 2011,1664.0,9 (6 Unique to this set),Raphael Pretesacque,£142.99 / $179.99,14+,Box,Retail - limited,✭✭✭✭✭ 4.6 171 ratings 16 Reviews


In [44]:
# Drop the variant suffix (e.g. "60009-1" -> "60009"): keep only the text
# before the first dash
set_num_parts = df_set['set_num'].str.split('-', n=1)
df_set['set_num'] = set_num_parts.str.get(0)

In [45]:
# Split launch_exit (e.g. "01 Jun 2013 - 31 Jul 2015") into a launch date and a
# retirement date.
# n=1 caps the split at two pieces, so a value containing an extra " - " cannot
# produce a third column and break the two-column assignment.
df_set[['launch_date', 'retirement_date']] = df_set['launch_exit'].str.split(' - ', n=1, expand=True)

In [46]:
df_set.drop('launch_exit', axis=1, inplace=True)

In [47]:
# Split the minifigs column (e.g. "2 (2 Unique to this set)") at the first space:
# minifigs_total receives the leading count, minifigs_unique the remaining text
df_set[['minifigs_total', 'minifigs_unique']] = df_set['minifigs'].str.split(' ', n=1, expand=True)

In [48]:
# Pull the unique-minifig count out of text such as "(2 Unique to this set)":
# keep the first space-separated token, then strip the opening parenthesis.
# regex=False because "(" on its own is not a valid regular expression; pandas 2.x
# no longer special-cases single characters and would raise re.error with regex=True.
df_set['minifigs_unique'] = df_set['minifigs_unique'].str.split(' ', n=1).str[0].str.replace('(', '', regex=False)

In [49]:
df_set.drop('minifigs', axis=1, inplace=True)

In [50]:
# parse rating column to get the text after the stars, this grabs non numeric values for rows with no rating
df_set['rating'] = df_set['rating'].str.split(' ', n=2).str[1].str.strip()

In [51]:
# a good way to replace non-numeric rows is to_numeric method, must use coerce to force NaN values for non-numerics
df_set['rating'] = pd.to_numeric(df_set['rating'], errors='coerce')

In [52]:
# Turn launch_date text (e.g. "01 Jun 2013") into datetimes.
# Spaces are stripped first so the text lines up with the compact %d%b%Y format;
# anything that still does not parse becomes NaT.
df_set['launch_date'] = pd.to_datetime(
    df_set['launch_date'].str.replace(' ', ''),
    format='%d%b%Y',
    errors='coerce',
)

In [53]:
# Turn retirement_date text into datetimes, using the same space-stripped
# %d%b%Y format as launch_date.
# Some rows hold "t.b.a" instead of a date; errors='coerce' turns those into NaT
# rather than raising.
df_set['retirement_date'] = pd.to_datetime(
    df_set['retirement_date'].str.replace(' ', ''),
    format='%d%b%Y',
    errors='coerce',
)

In [54]:
# Reduce msrp to its US-dollar figure (e.g. "£39.99 / $49.99" -> "49.99"),
# discarding any pound or euro amounts.
# The pattern captures the digits, decimal point and digits that follow the "$";
# rows with no dollar price come back as NaN. (regex101.com is handy for testing.)
df_set['msrp'] = df_set['msrp'].str.extract(r"\$(\d+\.\d+)", expand=False)

In [55]:
# Where launch_date is missing, fall back to 1 January of the release year.
# Build that date explicitly instead of handing fillna a bare year and relying on
# pandas to interpret it as a timestamp, which is version-dependent.
# Years that cannot be parsed are left as NaT.
year_start = pd.to_datetime(
    df_set['year_released'].astype(str).str.strip(),
    format='%Y',
    errors='coerce',
)
df_set['launch_date'] = df_set['launch_date'].fillna(year_start)

In [56]:
# Cast the cleaned columns to numeric types in a single call.
# The minifig counts stay float because they contain missing values.
df_set = df_set.astype({
    'year_released': int,
    'msrp': float,
    'minifigs_total': float,
    'minifigs_unique': float,
    'set_num': int,
})

In [57]:
df_set.dtypes

set_num                     int32
set_name                   object
theme_group                object
theme                      object
subtheme                   object
year_released               int32
pieces                    float64
designer                   object
msrp                      float64
age_range                  object
packaging                  object
availability               object
rating                    float64
launch_date        datetime64[ns]
retirement_date    datetime64[ns]
minifigs_total            float64
minifigs_unique           float64
dtype: object

In [58]:
# Preview df_set indexed by set_num, which should be unique.
# NOTE: the result is only displayed, not assigned -- df_set itself keeps set_num
# as an ordinary column, which the later cells rely on (df_set['set_num']).
df_set.set_index('set_num')

Unnamed: 0_level_0,set_name,theme_group,theme,subtheme,year_released,pieces,designer,msrp,age_range,packaging,availability,rating,launch_date,retirement_date,minifigs_total,minifigs_unique
set_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
60009,Helicopter Arrest,Modern day,City,Police,2013,352.0,Henrik Andersen,49.99,5 - 12,Box,Retail - limited,3.8,2013-01-01,NaT,5.0,
60012,Coast Guard 4x4 & Diving Boat,Modern day,City,Coast Guard,2013,128.0,,19.99,5 - 12,Box,Retail,3.8,2013-06-01,2015-07-31,2.0,2.0
75975,Watchpoint: Gibraltar,Licensed,Overwatch,,2019,730.0,Mark Stafford,89.99,9+,Box,Retail,3.9,2019-01-01,2020-07-31,4.0,3.0
10155,Maersk Line Container Ship,Model making,Creator Expert,Maersk,2010,990.0,,119.99,8+,Box,LEGO exclusive,4.1,2010-08-01,2011-07-31,,
10210,Imperial Flagship,Model making,Creator Expert,Miscellaneous,2010,1664.0,Raphael Pretesacque,179.99,14+,Box,Retail - limited,4.6,2010-01-01,2011-12-31,9.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,Palpatine's Arrest,Licensed,Star Wars,Episode III,2012,645.0,,89.99,9 - 14,Box,Retail,4.4,2012-08-01,2013-12-31,6.0,6.0
9585,WeDo Resource Set,Educational,Education,,2011,326.0,,49.95,7+,Tub,Educational,,2011-01-01,NaT,,
9594,Green City Challenge Set,Educational,Education,Mindstorms,2011,1365.0,,174.95,,,,,2011-01-01,NaT,1.0,1.0
9688,Renewable Energy Add-On Set,Educational,Education,,2010,12.0,,99.95,8+,,,,2010-08-01,2016-12-31,,


In [59]:
# Confirm that every set_num appears only once in the set metadata.
# df_set['set_num'] is already a Series, so is_unique can be read from it directly.
# The previous .squeeze() did nothing for a multi-row column and would return a
# bare scalar (which has no is_unique attribute) if df_set ever held a single row.
set_num_series = df_set['set_num']
set_num_series.is_unique

True

Loop through a list of sets, remove the price outliers for each set, then group by date and average the daily price.
This creates a new dataframe at daily granularity that is used for the rest of the analysis.

In [21]:
# create function to remove outliers using IQR method
def remove_outliers(dataframe, column='price', factor=1.5):
    """Return the rows of ``dataframe`` whose ``column`` value is not an IQR outlier.

    A value is treated as an outlier when it lies more than ``factor`` times the
    interquartile range below the first quartile or above the third quartile.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a numeric column named ``column``.
    column : str, default 'price'
        Name of the column the outlier test is applied to.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the lower and upper limits.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows that fall inside the limits, with their original index
        labels. Rows whose value is NaN are kept, because NaN compares as
        neither below nor above the limits.
    """
    values = dataframe[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_lim = q1 - factor * iqr
    upper_lim = q3 + factor * iqr
    is_outlier = (values < lower_lim) | (values > upper_lim)
    # .copy() hands the caller an independent frame, so later column assignments
    # on the result do not trigger SettingWithCopyWarning
    return dataframe[~is_outlier].copy()

In [None]:
# Test list of sets to loop through
test_list = [75827, 75192, 70222, 70223]
# df['set_num'] is an object column (see df.info()), so compare it as text;
# comparing it directly to an int would match nothing if the ids are stored as strings
set_num_text = df['set_num'].astype(str)
# Collect one daily-average frame per set and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every pass.
daily_frames = []
for set_num in test_list:
    # store the integer id so set_num is type-compatible with df_set['set_num']
    # when the two frames are merged later
    df_filtered = df[set_num_text == str(set_num)].assign(set_num=set_num)
    df_no_outliers = remove_outliers(df_filtered)
    # as_index=False keeps set_num and date as ordinary columns;
    # only price is averaged
    df_filtered_grouped = df_no_outliers.groupby(['set_num', 'date'], as_index=False)['price'].mean()
    daily_frames.append(df_filtered_grouped)
looped_df = pd.concat(daily_frames, ignore_index=True)


In [None]:
print(looped_df)

In [None]:
# Average daily price for a single set: one row per date with the mean sale price.
# Select the set explicitly instead of reusing df_no_outliers from the loop,
# which holds whichever set was processed last (70223).
# NOTE(review): 75827 is taken to be the Ghostbusters set named in the plot title
# below -- confirm.
ghostbusters_prices = remove_outliers(df[df['set_num'].astype(str) == '75827'])
# restrict the mean to price so the set_num column is not aggregated as well
dffilt_group = ghostbusters_prices.groupby('date')[['price']].mean()
dffilt_group

In [None]:
plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(dffilt_group.index, dffilt_group.price, label='Average Daily Price')

fig.autofmt_xdate()
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.set_title('Average Daily Price of Ghostbusters Set')
ax.grid(True);

# **********  MERGE  **********

In [None]:
# check data types before the merge
df_set.dtypes

In [None]:
looped_df.dtypes

In [None]:
# Keep every row of the daily price data and attach each set's msrp from the
# set metadata: a left join on set_num
left_join = looped_df.merge(
    df_set[['set_num', 'msrp']],
    how='left',
    on='set_num',
)
left_join.head()

In [None]:
# Dollar appreciation for every row: average daily price minus msrp, to the cent
left_join['appreciation'] = left_join['price'].sub(left_join['msrp']).round(2)

In [None]:
# Appreciation expressed as a percentage of msrp, rounded to a whole number
left_join['pct_appreciation'] = left_join['appreciation'].div(left_join['msrp']).mul(100).round()

In [None]:
# Rolling mean of pct_appreciation, computed separately for each set so that a
# window never mixes rows from two different sets.
# The window counts rows, so rows are expected to be in date order within each
# set -- TODO confirm.
# NOTE(review): the column is named SMA30 but the window is 15 rows -- confirm
# which one is intended.
left_join['SMA30_pct_app'] = (
    left_join.groupby('set_num')['pct_appreciation']
    .transform(lambda s: s.rolling(15).mean())
)

In [None]:
left_join