In [None]:
import os
import sqlite3 as sql

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [None]:
# Create connection to database.
# The location can be overridden with the LEGO_DB_PATH environment variable so
# the notebook is not tied to one machine; the original path stays the default.
database = os.environ.get(
    "LEGO_DB_PATH",
    "C:\\Users\\zubaz\\Documents\\Python\\EbayLegoWebscrape\\lego.db",
)
# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would only show up later as a "no such table" error -- fail early instead.
if not os.path.isfile(database):
    raise FileNotFoundError(f"SQLite database not found: {database}")
connection = sql.connect(database)

In [None]:
# SQL text for the two reads below: selected columns of ebay_prices, and
# every column of set_details.
query1 = (
    "SELECT set_num, date, price\n"
    "            FROM ebay_prices"
)
query2 = "SELECT * FROM set_details"

In [None]:
# Load the price listings selected by query1 into a DataFrame and preview them.
df = pd.read_sql_query(sql=query1, con=connection)
df.head()

In [None]:
# (rows, columns) of the listings frame as loaded, before any cleaning.
df.shape

In [None]:
# Column dtypes and non-null counts of the listings frame as loaded.
df.info()

In [None]:
# Load the whole set_details table (query2) into its own DataFrame and preview it.
df_set = pd.read_sql_query(sql=query2, con=connection)
df_set.head()

In [None]:
# Convert the date column from text to datetime so dates can be compared,
# subtracted and grouped.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Earliest and latest listing dates, and the time span the data covers.
listing_dates = df['date']
oldest_date = listing_dates.min()
recent_date = listing_dates.max()
date_difference = recent_date - oldest_date
print(oldest_date, recent_date, date_difference, sep='\n')

In [None]:
# Check whether every value in the price column is a whole number.
# At this point the column still holds raw values -- the cleanup cells further
# down strip commas and drop 'to' entries, so some values are strings -- and
# str has no .is_integer(), which made the element-wise call raise
# AttributeError. Coerce to numbers first; anything that cannot be parsed
# counts as "not an integer".
price_as_number = pd.to_numeric(df['price'], errors='coerce')
bool((price_as_number.notna() & (price_as_number % 1 == 0)).all())

In [None]:
# Print the rows whose price cannot be parsed as a number.
unparseable = pd.to_numeric(df['price'], errors='coerce').isna()
print(df[unparseable])

In [None]:
# Frame size before the price text is cleaned.
df.shape

In [None]:
# Commas in the price text stop it from parsing as a number, so remove
# every comma from the column.
df['price'] = df['price'].replace(to_replace=',', value='', regex=True)


In [None]:
# Print only the price values that still fail numeric parsing.
still_unparseable = pd.to_numeric(df['price'], errors='coerce').isna()
print(df.loc[still_unparseable, 'price'])

In [None]:
# Frame size after removing commas (no rows have been dropped yet).
df.shape

In [None]:
# Collect the index labels of rows whose price text contains 'to'.
# These values are too awkward to parse and are probably not representative
# listings. Non-string entries count as "no match" (na=False).
has_to = df['price'].str.contains("to", na=False)
remove_rows = df.loc[has_to].index

print(remove_rows)


In [None]:
# Drop the rows collected in remove_rows, modifying df in place.
df.drop(index=remove_rows, inplace=True)

In [None]:
# Frame size after dropping the rows listed in remove_rows.
df.shape

In [None]:
# Print any rows whose price is still not parseable as a number.
not_numeric = pd.to_numeric(df['price'], errors='coerce').isna()
print(df[not_numeric])

In [None]:
# True when every value in the price column parses as a number.
parsed_price = pd.to_numeric(df['price'], errors='coerce')
parsed_price.notna().all()

In [None]:
# Column dtypes before the price column is converted to a numeric dtype.
df.dtypes

In [None]:
# Every price now parses, so store the column with a numeric dtype.
# to_numeric picks int64 when no value has a decimal part and float64 when
# decimals are present (which they are here).
numeric_price = pd.to_numeric(df['price'])
df['price'] = numeric_price
df.dtypes

In [None]:
# Look at a single set only: 75827 (the Ghostbusters set), ordered by price,
# then summarise its price distribution.
filt = df['set_num'].eq(75827)
dffilt = df.loc[filt].sort_values('price')
dffilt['price'].describe()


In [None]:
# Box plot of the price column for the selected set.
dffilt.boxplot(column=['price'])
# Looks like we have some outlier prices.

In [None]:
# Tukey's rule, also known as the IQR rule.
# IQR is the inter-quartile range: the distance between the 25th and 75th
# percentiles of the price.
price_quartiles = dffilt['price'].quantile([0.25, 0.75])
Q1 = price_quartiles.iloc[0]
Q3 = price_quartiles.iloc[1]
IQR = Q3 - Q1
IQR

In [None]:
# Outlier limits sit 1.5 * IQR beyond each quartile (2.0 or 2.5 are
# alternative multipliers).
iqr_margin = 1.5 * IQR
lower_lim = Q1 - iqr_margin
upper_lim = Q3 + iqr_margin
lower_lim

In [None]:
# Upper outlier limit (Q3 + 1.5 * IQR).
upper_lim

In [None]:
# Number of rows priced below the lower limit.
outliers_15_low = dffilt['price'].lt(lower_lim)
int(outliers_15_low.sum())

In [None]:
# Number of rows priced above the upper limit.
outliers_15_high = dffilt['price'].gt(upper_lim)
int(outliers_15_high.sum())

In [None]:
# Keep only rows that are neither low nor high outliers; the tilde (~)
# inverts a boolean mask.
within_limits = ~outliers_15_low & ~outliers_15_high
dffilt = dffilt.loc[within_limits]


In [None]:
# Display the selected set's listings with the outlier rows removed.
dffilt

In [None]:
# Box plot of the price column again, now that the outlier rows are removed.
dffilt.boxplot(column='price')

In [None]:
# Average price per date for the selected set.
# Only the price column is aggregated: taking the mean of the whole frame also
# averages the set_num identifier, and on pandas >= 2.0 (where numeric_only
# defaults to False) it raises TypeError if any remaining column is non-numeric.
# The result is still a DataFrame indexed by date with a 'price' column.
dffilt_group = dffilt.groupby('date')[['price']].mean()
dffilt_group

In [None]:
# Line chart of the per-date average price held in dffilt_group.
plt.rcParams['font.size'] = 12
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(dffilt_group.index, dffilt_group['price'], label='Average Daily Price')

ax.set(xlabel='date', ylabel='Price', title='Example')
ax.grid(True)
# Trailing semicolon suppresses the Legend repr in the cell output.
ax.legend(loc='upper left');

In [None]:
# Time to clean the set_details data; preview it first to see which columns
# need work.
df_set.head()

In [None]:
# Remove the "-1" style suffix from set_num by keeping only the text before
# the first hyphen.
set_num_parts = df_set['set_num'].str.split('-', n=1)
df_set['set_num'] = set_num_parts.str.get(0)

In [None]:
# Split launch_exit at the ' - ' separator into a launch column and a
# retirement column.
launch_exit_parts = df_set['launch_exit'].str.split(' - ', expand=True)
df_set[['launch_date', 'retirement_date']] = launch_exit_parts

In [None]:
# The combined launch_exit column has been split out, so drop it (in place).
df_set.drop(columns='launch_exit', inplace=True)

In [None]:
# Split the minifigs column at its first space: the leading token is the
# total, the remainder holds the unique-count text.
minifig_parts = df_set['minifigs'].str.split(' ', n=1, expand=True)
df_set[['minifigs_total', 'minifigs_unique']] = minifig_parts

In [None]:
# Parse the count out of minifigs_unique: keep the first space-separated token
# and strip its opening parenthesis.
# regex=False is required here: '(' on its own is not a valid regular
# expression, and pandas >= 2.0 no longer treats single-character patterns as
# literals when regex=True, so the pattern would fail to compile.
first_token = df_set['minifigs_unique'].str.split(' ', n=1).str[0]
df_set['minifigs_unique'] = first_token.str.replace('(', '', regex=False)

In [None]:
# The combined minifigs column has been split out, so drop it (in place).
df_set.drop(columns='minifigs', inplace=True)

In [None]:
# Take the second space-separated token of the rating text (the part after
# the stars) and trim it. For rows with no rating this token is non-numeric.
rating_tokens = df_set['rating'].str.split(' ', n=2)
df_set['rating'] = rating_tokens.str.get(1).str.strip()

In [None]:
# Convert rating to a number; errors='coerce' turns any non-numeric value
# into NaN instead of raising.
numeric_rating = pd.to_numeric(df_set['rating'], errors='coerce')
df_set['rating'] = numeric_rating

In [None]:
# Convert launch_date to datetime.
# Spaces are removed first so the text matches the compact '%d%b%Y' format;
# values that still do not match become NaT (errors='coerce').
launch_compact = df_set['launch_date'].str.replace(' ', '')
df_set['launch_date'] = pd.to_datetime(launch_compact, format='%d%b%Y', errors='coerce')

In [None]:
# Convert retirement_date to datetime the same way.
# Some rows hold "t.b.a" style text that cannot be parsed; errors='coerce'
# turns those into NaT instead of raising.
retirement_compact = df_set['retirement_date'].str.replace(' ', '')
df_set['retirement_date'] = pd.to_datetime(retirement_compact, format='%d%b%Y', errors='coerce')

In [None]:
# Clean up msrp and keep the USD value only.
# The regex captures the digits, dot and decimal digits that follow a '$';
# values with no such match (including amounts written without a decimal
# part) become NaN. regex101.com is handy for testing patterns like this.
usd_amount = df_set['msrp'].str.extract(r"\$(\d+\.\d+)", expand=False)
df_set['msrp'] = usd_amount

In [None]:
# If launch_date is missing, fall back to 1 January of year_released.
# The year is parsed explicitly (format='%Y') rather than passing the raw
# year values to fillna(): that relied on an implicit conversion that depends
# on the pandas version and on how the years are stored, and would not
# interpret integer years as calendar years.
year_start = pd.to_datetime(
    df_set['year_released'].astype(str).str.strip(),
    format='%Y',
    errors='coerce',
)
df_set['launch_date'] = df_set['launch_date'].fillna(year_start)

In [None]:
# Store year_released as an integer column.
df_set['year_released'] = df_set['year_released'].astype(dtype=int)

In [None]:
# Store msrp as a float column.
df_set['msrp'] = df_set['msrp'].astype(dtype=float)

In [None]:
# Store both minifig count columns as floats.
for count_col in ('minifigs_total', 'minifigs_unique'):
    df_set[count_col] = df_set[count_col].astype(float)

In [None]:
# Final column dtypes of the set details frame after the conversions above.
df_set.dtypes