# Lego Ebay Project
## Data analysis and cleaning

In [28]:
import pandas as pd
import numpy as np
import sqlite3 as sql
import matplotlib.pyplot as plt
%matplotlib inline
from helper import removeOutliers

ImportError: cannot import name 'removeOutliers' from 'helper' (c:\Users\zubaz\Documents\Python\EbayLegoWebscrape\Data Analysis\helper.py)

# **********   Injest   **********

## Create a connection to the database and save SQL queries as variables

In [2]:
# Create connection to database
database = "C:\\Users\\zubaz\\Documents\\Python\\EbayLegoWebscrape\\lego.db"
connection = sql.connect(database)

In [3]:
query1 = '''SELECT set_num, date, price
            FROM ebay_prices'''
query2 = '''SELECT * FROM set_details'''

## Create dataframe for query1
### This dataframe contains the ebay prices data

In [4]:
df = pd.read_sql_query(query1, connection)
df.head()

Unnamed: 0,set_num,date,price
0,10256,02-01-2022,20.0
1,70147,01-07-2022,158.99
2,70222,01-07-2022,119.99
3,70223,01-07-2022,184.98
4,41052,01-07-2022,185.98


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57876 entries, 0 to 57875
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   set_num  57876 non-null  object
 1   date     57876 non-null  object
 2   price    57876 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


## Create a dataframe for query2
### This dataframe contains the set dimension data

In [6]:
df_set = pd.read_sql_query(query2, connection)
df_set.head()

Unnamed: 0,set_num,set_name,theme_group,theme,subtheme,year_released,launch_exit,pieces,minifigs,designer,msrp,age_range,packaging,availability,rating
0,60009-1,Helicopter Arrest,Modern day,City,Police,2013,,352.0,5,Henrik Andersen,£39.99 / $49.99,5 - 12,Box,Retail - limited,✭✭✭✭✩ 3.8 52 ratings
1,60012-1,Coast Guard 4x4 & Diving Boat,Modern day,City,Coast Guard,2013,01 Jun 2013 - 31 Jul 2015,128.0,2 (2 Unique to this set),,£9.99 / $19.99 / 12.99€,5 - 12,Box,Retail,✭✭✭✭✩ 3.8 145 ratings 4 Reviews Official Brick...
2,75975-1,Watchpoint: Gibraltar,Licensed,Overwatch,,2019,01 Jan 2019 - 31 Jul 2020,730.0,4 (3 Unique to this set),Mark Stafford,£79.99 / $89.99 / 87.72€,9+,Box,Retail,✭✭✭✭✩ 3.9 114 ratings 1 Review Official Bricks...
3,10155-1,Maersk Line Container Ship,Model making,Creator Expert,Maersk,2010,01 Aug 2010 - 31 Jul 2011,990.0,,,£102.99 / $119.99,8+,Box,LEGO exclusive,✭✭✭✭✩ 4.1 28 ratings 3 Reviews
4,10210-1,Imperial Flagship,Model making,Creator Expert,Miscellaneous,2010,01 Jan 2010 - 31 Dec 2011,1664.0,9 (6 Unique to this set),Raphael Pretesacque,£142.99 / $179.99,14+,Box,Retail - limited,✭✭✭✭✭ 4.6 171 ratings 16 Reviews


In [7]:
#change date column from text to datetime
df['date'] = pd.to_datetime(df['date'])

In [8]:
# find earliest and latest dates and how many days of data exists
oldest_date = df['date'].min()
recent_date = df['date'].max()
date_difference = recent_date - oldest_date
print(oldest_date)
print(recent_date)
print(date_difference)

2022-01-07 00:00:00
2022-02-10 00:00:00
34 days 00:00:00


## Examine the 'price' column and do some cleaning

In [9]:
# Check if all values in price column are intergers
all(x.is_integer() for x in df['price'])

False

In [10]:
print(df[pd.to_numeric(df['price'], errors='coerce').isnull()])

      set_num       date            price
26      75523 2022-01-19  25.79 to 135.81
38      10235 2022-02-06     3.29 to 4.98
41      10695 2022-01-21  16.37 to 122.89
96      21316 2022-01-11    9.89 to 13.89
109     70321 2022-02-02  20.25 to 282.98
...       ...        ...              ...
57209   75827 2022-01-07         1,168.39
57456   75192 2022-02-10         1,168.83
57501   10251 2022-01-22         1,286.32
57565   21137 2022-01-22         1,749.99
57726   10251 2022-02-06         1,081.17

[411 rows x 3 columns]


In [11]:
# looks like the commas are bad, let's replace commas with nothing
df['price'] = df['price'].replace(',','', regex=True)

In [12]:
# check to see what the rest of the non numeric values look like
# print(df['price'] [pd.to_numeric(df['price'], errors='coerce').isnull()])

In [13]:
df.shape

(57876, 3)

In [14]:
# find the index of rows with 'to' in them
# these values are too hard to deal with, probably not representative listings
remove_rows = df[df['price'].str.contains("to") == True].index

print(remove_rows)

Int64Index([   26,    38,    41,    96,   109,  2553,  2858,  9561, 12170,
            14668, 23364, 29125, 29793, 41994, 41999, 43914, 43925, 45377,
            45593, 45597, 45599, 45601, 45602, 45628, 45685, 45686, 45687,
            45688, 45689, 45690, 45691, 45692, 45696, 45697, 45700, 45701,
            45702, 45703, 45704, 45705, 45706, 45707, 45708, 45710, 45728,
            45740, 52384],
           dtype='int64')


In [15]:
# remove these rows
df.drop(remove_rows, inplace=True)

In [16]:
df.shape

(57829, 3)

In [17]:
print(df[pd.to_numeric(df['price'], errors='coerce').isnull()])

Empty DataFrame
Columns: [set_num, date, price]
Index: []


In [18]:
# checks price column to make sure all rows are numeric
pd.to_numeric(df['price'], errors='coerce').notnull().all()

True

In [19]:
df.dtypes

set_num            object
date       datetime64[ns]
price              object
dtype: object

In [20]:
# now that all rows in price column are numeric, change column data type to numeric
# pandas will pick int64 if there are no decimals, float 64 if decimals are present, which there are
df['price'] = pd.to_numeric(df['price'])
df.dtypes

set_num            object
date       datetime64[ns]
price             float64
dtype: object

In [None]:
def remove_outliers(dataframe):
    Q1 = dataframe['price'].quantile(0.25)
    Q3 = dataframe['price'].quantile(0.75)
    IQR = Q3 - Q1
    lower_lim = Q1 - 1.5*IQR
    upper_lim = Q3 + 1.5*IQR
    outliers_15_low = ( dataframe['price'] < lower_lim)
    outliers_15_high = ( dataframe['price'] > upper_lim)
    df_outliers_removed = dataframe[~(outliers_15_low | outliers_15_high)]
    return df_outliers_removed

In [24]:
# returns a boolean series of just set num 75827
filt = (df['set_num'] == 75827)
filt

0        False
1        False
2        False
3        False
4        False
         ...  
57871    False
57872    False
57873    False
57874    False
57875    False
Name: set_num, Length: 57829, dtype: bool

In [25]:
# returns a data frame that has had filt boolean mask applied
dffilt = df[filt].sort_values(by='price')
dffilt

Unnamed: 0,set_num,date,price
40132,75827,2022-01-14,2.75
11111,75827,2022-01-11,10.0
33415,75827,2022-01-14,274.68
21097,75827,2022-01-29,389.01
50466,75827,2022-01-07,789.99
55285,75827,2022-01-07,798.0
53095,75827,2022-01-18,799.0
19896,75827,2022-01-07,799.0
47334,75827,2022-01-26,800.0
49922,75827,2022-01-07,839.0


In [27]:
# apply remove_outliers function to dffilt dataframe
# assign to new df
df_no_outliers = helper.remove_outliers(dffilt)
df_no_outliers

AttributeError: module 'helper' has no attribute 'remove_outliers'

In [None]:
filt = (df['set_num'] == 75827)
dffilt = df[filt].sort_values(by='price')
remove_outliers(dffilt)

In [None]:
# lets take a look at just the ghostbusters set data
filt = (df['set_num'] == 75827)
dffilt = df[filt].sort_values(by='price')
dffilt['price'].describe()


In [None]:
dffilt.boxplot(column=['price'])
# looks like we've got some outlier prices

In [None]:
# This is Tukey's rule, known as the IQR rule
# IQR = Inter Quartile Range
Q1 = dffilt['price'].quantile(0.25)
Q3 = dffilt['price'].quantile(0.75)
IQR = Q3 - Q1
IQR

In [None]:
# We will use 1.5*IQR for our limits, you can also use 2.5 or 2.0
lower_lim = Q1 - 1.5*IQR
upper_lim = Q3 + 1.5*IQR
lower_lim

In [None]:
upper_lim

In [None]:
# how many rows below our low limit?
outliers_15_low = ( dffilt['price'] < lower_lim)
len(dffilt['price'][outliers_15_low])

In [None]:
# how many rows above high limit?
outliers_15_high = ( dffilt['price'] > upper_lim)
len(dffilt['price'][outliers_15_high])

In [None]:
# the tilda ~ reverses the filtered indexes
dffilt = dffilt[~(outliers_15_low | outliers_15_high)]


In [None]:
dffilt

In [None]:
dffilt.boxplot(column='price')

In [None]:
# Group the price data by date and calculate the mean
# i.e. create a new dataframe with one date per row and the average sales price
dffilt_group = dffilt.groupby(['date']).mean()
dffilt_group

In [None]:
plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(dffilt_group.index, dffilt_group.price, label='Average Daily Price')

fig.autofmt_xdate()
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.set_title('Average Daily Price of Ghostbusters Set')
ax.grid(True);

In [None]:
# time to clean the set_details data
df_set.head()

In [None]:
# remove -1 from set_num
df_set['set_num'] = df_set['set_num'].str.split('-', n=1).str[0]

In [None]:
# split launch_exit column into 2 columns
df_set[['launch_date', 'retirement_date']] = df_set['launch_exit'].str.split(' - ', expand=True)

In [None]:
df_set.drop('launch_exit', axis=1, inplace=True)

In [None]:
# need to plit the minifigs column into total and unique
df_set[['minifigs_total', 'minifigs_unique']] = df_set['minifigs'].str.split(' ', n=1, expand=True)

In [None]:
# parse the minifig value from minifigs_unique
df_set['minifigs_unique'] = df_set['minifigs_unique'].str.split(' ', n=1).str[0].str.replace('(', '', regex=True)

In [None]:
df_set.drop('minifigs', axis=1, inplace=True)

In [None]:
# parse rating column to get the text after the stars, this grabs non numeric values for rows with no rating
df_set['rating'] = df_set['rating'].str.split(' ', n=2).str[1].str.strip()

In [None]:
# a good way to replace non-numeric rows is to_numeric method, must use coerce to force NaN values for non-numerics
df_set['rating'] = pd.to_numeric(df_set['rating'], errors='coerce')

In [None]:
# change launch retirement date columns to date
# first need to remove spaces
df_set['launch_date'] = df_set['launch_date'].str.replace(' ', '')
df_set['launch_date'] = pd.to_datetime(df_set['launch_date'], format='%d%b%Y', errors='coerce')

In [None]:
# had some t.b.a text for some rows, they threw an error, added errors=coerce and seemed to fix it
df_set['retirement_date'] = df_set['retirement_date'].str.replace(' ', '')
df_set['retirement_date'] = pd.to_datetime(df_set['retirement_date'], format='%d%b%Y', errors='coerce')

In [None]:
# clean up msrp column and grab USD values only - drop pounds and euro values if present
# use regex to extract everything after the $, the dot, and the remaining digits
# REGEX sucks. But remember to use regex101.com, it's a life saver
df_set['msrp'] = df_set['msrp'].str.extract(r"\$(\d+\.\d+)")

In [None]:
# if launch_date is empty add the value from year released date column, 
df_set['launch_date'] = df_set['launch_date'].fillna(df_set['year_released'])
# very cool, it added jan 1 to the year automatically.

In [None]:
df_set['year_released'] = df_set['year_released'].astype(int)

In [None]:
df_set['msrp'] = df_set['msrp'].astype(float)

In [None]:
df_set['minifigs_total'] = df_set['minifigs_total'].astype(float)
df_set['minifigs_unique'] = df_set['minifigs_unique'].astype(float)

In [None]:
df_set.dtypes

In [None]:
df_set.sort_values(by=['msrp'])