# Lego Ebay Project

In [1]:
import pandas as pd
import numpy as np
import sqlite3 as sql
import matplotlib.pyplot as plt
import datetime
import scipy.stats as stats
%matplotlib inline

  from . import _distributor_init


ImportError: Unable to import required dependencies:
numpy: 

IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!

Importing the numpy C-extensions failed. This error can happen for
many reasons, often due to issues with your setup or how NumPy was
installed.

We have compiled some common reasons and troubleshooting tips at:

    https://numpy.org/devdocs/user/troubleshooting-importerror.html

Please note and check the following:

  * The Python version is: Python3.8 from "C:\Users\zubaz\anaconda3\envs\Lego Data Analysis\python.exe"
  * The NumPy version is: "1.21.2"

and make sure that they are the versions you expect.
Please carefully study the documentation linked above for further help.

Original error was: DLL load failed while importing _multiarray_umath: The specified module could not be found.


# **********   INGEST   **********
Create a connection to the database and save SQL queries as variables

In [None]:
# Create connection to database
database = "C:\\Users\\zubaz\\Documents\\Python\\EbayLegoWebscrape\\lego.db"
connection = sql.connect(database)

In [None]:
# 2 queries for 2 tables, one with price data, one with set metadata
query1 = '''SELECT item_num, set_num, date, price
            FROM ebay_prices'''
query2 = '''SELECT * FROM set_details'''

### Create dataframe for query1
This dataframe contains the ebay prices data

In [None]:
df = pd.read_sql_query(query1, connection)

In [None]:
df.head()

In [None]:
df.info()

### Create a dataframe for query2
This dataframe contains the set dimension data

In [None]:
df_set = pd.read_sql_query(query2, connection)
df_set.head()

In [None]:
df_set.info()

In [None]:
# close connection to database
connection.close()

Get some basic date and set data

In [None]:
#change date column from text to datetime
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Summarise the price data: the date range it spans and how many rows,
# distinct sets and distinct listings it contains.
oldest_date = df['date'].min()
recent_date = df['date'].max()
date_difference = recent_date - oldest_date
num_of_rows = df.shape[0]
# dropna=False counts a missing value as one distinct entry
num_of_sets = df['set_num'].nunique(dropna=False)
num_of_items = df['item_num'].nunique(dropna=False)

summary_lines = [
    f'Earliest price data: {oldest_date}',
    f'Latest price data: {recent_date}',
    f'Days of data: {date_difference}',
    f"Price data for {num_of_sets} Lego sets",
    f'Total rows: {num_of_rows}',
    f"Total unique listings: {num_of_items}",
]
for summary_line in summary_lines:
    print(summary_line)

# **********  EXPLORE & CLEAN  **********

## Price Dataframe
Examine the 'price' column and do some cleaning

In [None]:
# Check if all values in price column are whole numbers.
# Values that are not numeric objects (e.g. scraped text such as "1,299.00")
# make the check return False instead of raising AttributeError, because
# str has no is_integer() method.
all(
    isinstance(x, (int, float, np.integer, np.floating)) and float(x).is_integer()
    for x in df['price']
)

In [None]:
print(df[pd.to_numeric(df['price'], errors='coerce').isnull()])

In [None]:
# looks like the commas are bad, let's replace commas with nothing
df['price'] = df['price'].replace(',','', regex=True)

In [None]:
# check to see what the rest of the non numeric values look like
# print(df['price'] [pd.to_numeric(df['price'], errors='coerce').isnull()])

In [None]:
df.shape

In [None]:
# Listings whose price text contains 'to' are price ranges; they are too
# hard to deal with and probably not representative, so collect their
# row labels for removal.
has_price_range = df['price'].str.contains("to", na=False)
remove_rows = df[has_price_range].index

print(remove_rows)

In [None]:
# remove these rows
df.drop(remove_rows, inplace=True)
df.shape

In [None]:
print(df[pd.to_numeric(df['price'], errors='coerce').isnull()])

In [None]:
# there is a problem with my scraping, it's collecting GBP and EUR values
# I will need to check on this, for now just filter non-digit rows out
# .copy() makes df an independent frame, so the later in-place conversion of
# the price column does not trigger SettingWithCopyWarning on a slice
is_numeric_price = pd.to_numeric(df['price'], errors='coerce').notnull()
df = df[is_numeric_price].copy()
df.head()

In [None]:
# checks price column to make sure all rows are numeric
pd.to_numeric(df['price'], errors='coerce').notnull().all()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
# now that all rows in price column are numeric, change column data type to numeric
# pandas will pick int64 if there are no decimals, float 64 if decimals are present, which there are
df['price'] = pd.to_numeric(df['price'])


## Metadata Dataframe

In [None]:
# time to clean the set_details data
df_set.head()

In [None]:
df_set.shape

In [None]:
# remove -1 from set_num
df_set['set_num'] = df_set['set_num'].str.split('-', n=1).str[0]

In [None]:
# split launch_exit column into 2 columns
df_set[['launch_date', 'retirement_date']] = df_set['launch_exit'].str.split(' - ', expand=True)

In [None]:
df_set.drop('launch_exit', axis=1, inplace=True)

In [None]:
# need to split the minifigs column into total and unique
df_set[['minifigs_total', 'minifigs_unique']] = df_set['minifigs'].str.split(' ', n=1, expand=True)

In [None]:
# parse the minifig value from minifigs_unique: keep the first
# space-separated token and strip its opening parenthesis.
# regex=False because '(' is a literal character here; as a regular
# expression it is an unterminated group and raises re.error on pandas >= 2.0
df_set['minifigs_unique'] = (
    df_set['minifigs_unique']
    .str.split(' ', n=1)
    .str[0]
    .str.replace('(', '', regex=False)
)

In [None]:
df_set.drop('minifigs', axis=1, inplace=True)

In [None]:
# parse rating column to get the text after the stars, this grabs non numeric values for rows with no rating
df_set['rating'] = df_set['rating'].str.split(' ', n=2).str[1].str.strip()

In [None]:
# a good way to replace non-numeric rows is to_numeric method, 
# must use coerce to force NaN values for non-numerics
df_set['rating'] = pd.to_numeric(df_set['rating'], errors='coerce')

In [None]:
# change launch retirement date columns to date
# first need to remove spaces
df_set['launch_date'] = df_set['launch_date'].str.replace(' ', '')
df_set['launch_date'] = pd.to_datetime(df_set['launch_date'], format='%d%b%Y', errors='coerce')

In [None]:
# had some t.b.a text for some rows, they threw an error, added errors=coerce and seemed to fix it
df_set['retirement_date'] = df_set['retirement_date'].str.replace(' ', '')
df_set['retirement_date'] = pd.to_datetime(df_set['retirement_date'], format='%d%b%Y', errors='coerce')

In [None]:
# clean up msrp column and grab USD values only - drop pounds and euro values if present
# use regex to extract everything after the $, the dot, and the remaining digits
# REGEX sucks. But remember to use regex101.com, it's a life saver
df_set['msrp'] = df_set['msrp'].str.extract(r"\$(\d+\.\d+)")

In [None]:
# if launch_date is empty add the value from year released date column, 
df_set['launch_date'] = df_set['launch_date'].fillna(df_set['year_released'])
# very cool, it added jan 1 to the year automatically.

In [None]:
# Cast the cleaned metadata columns to their final dtypes, one column at a
# time and in this order.
target_dtypes = {
    'year_released': int,
    'msrp': float,
    'minifigs_total': float,
    'minifigs_unique': float,
    'set_num': int,
}
for column_name, column_dtype in target_dtypes.items():
    df_set[column_name] = df_set[column_name].astype(column_dtype)

In [None]:
df_set.dtypes

In [None]:
df_set.shape

In [None]:
# preview df_set indexed by set_num, which should be unique
# NOTE: the result is not assigned, so df_set itself is unchanged and
# set_num stays a regular column (later cells select and merge on it)
df_set.set_index('set_num')

In [None]:
# check for all unique rows in set_num
# selecting a single column already yields a Series, so is_unique can be
# used directly; squeeze() is dropped because it collapses a one-row column
# to a scalar, which has no is_unique attribute
set_num_series = df_set['set_num']
set_num_series.is_unique

In [None]:
# This is the final form of the set_details data
# Therefore we can create a table in db for this which will be the dimension table in pbi

database = "C:\\Users\\zubaz\\Documents\\Python\\EbayLegoWebscrape\\lego.db"
connection = sql.connect(database)

# try/finally so the connection is closed even if the write fails
try:
    df_set.to_sql('set_details_cleaned', connection, if_exists='replace', index=False)
    # if_exists=replace I don't know what has and hasn't changed, nulls are difficult to update
    # this is the easiest way to update the whole table.
    # But it does mean I will need to scrape ALL the set nums from brickset to capture all the changes
finally:
    connection.close()

Loop through a list of sets, remove the price outliers for each set, then group by set and date to get one average price per day.
The result is a new dataframe at daily granularity that we will use for the rest of the analysis.

In [None]:
# create function to remove outliers using IQR method
def remove_outliers(dataframe, column='price'):
    """
    Function to identify and remove outliers from a numeric column using a
    modified IQR method

    A row is treated as an outlier when its value is below Q1 or above
    Q3 + 1.0 * IQR. Rows whose value is missing are kept, and an empty
    dataframe is returned unchanged (empty).

    Parameters:
    ----------
    dataframe : a dataframe
    column : name of the numeric column to filter on, defaults to 'price'

    Returns:
    --------
    dataframe with outliers removed (the input dataframe is not modified)

    Raises:
    -------
    KeyError if `column` is not a column of the dataframe

    """
    # No blanket try/except here: swallowing the error used to leave the
    # result undefined and fail with an unrelated UnboundLocalError.
    values = dataframe[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_lim = Q1  # the 1.5*IQR wasn't catching enough low prices
    upper_lim = Q3 + 1.0*IQR  # 1.5*IQR was too high for most sets
    is_outlier = (values < lower_lim) | (values > upper_lim)

    return dataframe[~is_outlier]


In [None]:
# create list
# I can't figure out how to import search_sets.py from another folder.
# So I'll just recreate it here

# I copied this function from search_sets.py
# I should try this one day to help with importing functions from another folder
"""
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"\\nn_webserver")

from employee import motivation_to_work
"""


def create_search_list(database=None):
    """
    Build the list of Lego set numbers to search for.

    Reads the distinct SetNum values with NumPieces > 0 from the
    brickset_set_nums table, strips the variant suffix after the first '-'
    (e.g. '75192-1' -> '75192'), removes duplicates and sorts the result in
    descending string order.

    Parameters:
    ----------
    database : path to the SQLite database file; defaults to the project's
               lego.db when omitted

    Returns:
    --------
    list of set numbers as strings, sorted in reverse order

    """
    import pandas as pd
    import sqlite3

    if database is None:
        database = "C:\\Users\\zubaz\\Documents\\Python\\EbayLegoWebscrape\\lego.db"

    query1 = ''' SELECT DISTINCT SetNum
            FROM brickset_set_nums
            WHERE NumPieces > 0
    '''

    # use the locally imported sqlite3 module rather than a notebook-level
    # alias, and close the connection even if the query fails
    connection = sqlite3.connect(database)
    try:
        df = pd.read_sql_query(query1, connection)
    finally:
        connection.close()

    # remove -1 from set_num; rows with a missing SetNum are dropped because
    # they cannot be sorted together with strings
    set_nums = df['SetNum'].str.split('-', n=1).str[0].dropna()

    # different variants of one set collapse to the same number, so
    # de-duplicate before sorting
    search_terms = sorted(set_nums.unique(), reverse=True)

    return search_terms


In [None]:
# I had a major problem with the loop not working, turns out the search_terms
# is a list of strings and the df we are looping through has set_num as int
# So I cast int to all the value in the for loop below
# But this removes the COMCON and other string set numbers, I should change the set_num
# back to string and remove it from the index in the above cells.
list_of_sets2 = create_search_list()
print(len(list_of_sets2))

# change all list items to integer
new_list = []
for value in list_of_sets2:
    try:
        new_list.append(int(value))
    except (TypeError, ValueError):
        # only a failed conversion is skipped (non-numeric set numbers such
        # as COMCON); any other error is allowed to surface
        continue

print(len(new_list))


In [None]:
# Test list of sets to loop through
# test_list = [75827, 75192, 70222, 70223]

# Collect the per-set frames in lists and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
no_outlier_frames = []
daily_average_frames = []

# filter by set num and then perform remove_outliers
# (loop names avoid shadowing the builtins `set` and `filter`)
for set_num in new_list:
    df_filtered = df[df['set_num'] == set_num]
    df_no_outliers = remove_outliers(df_filtered)
    if df_no_outliers.empty:
        # a set with no price rows contributes nothing to either frame
        continue

    # keep every non-outlier price for Power BI
    # collected BEFORE the groupby so we can do daily analysis later
    no_outlier_frames.append(df_no_outliers)

    # calculate ONE average price per day for each set
    # reset index fills down the set_num column, creates a full dataframe
    df_filtered_grouped = df_no_outliers.groupby(['set_num', 'date']).mean().reset_index()
    daily_average_frames.append(df_filtered_grouped)

# pd.concat raises on an empty list, so fall back to an empty frame
df_no_outliers_pbi = pd.concat(no_outlier_frames) if no_outlier_frames else pd.DataFrame()
looped_df = pd.concat(daily_average_frames) if daily_average_frames else pd.DataFrame()

print(f'df shape: {df.shape}')
print(f'df_no_ouliers_pbi shape: {df_no_outliers_pbi.shape}')
print(f'looped_df shape: {looped_df.shape}')


In [None]:
# set item_num as index to ensure one price per listing
df_no_outliers_pbi = df_no_outliers_pbi.set_index('item_num')
# df_no_outliers_pbi.head()

In [None]:
# write df_no_outliers_pbi to sql for import into Power BI

database = "C:\\Users\\zubaz\\Documents\\Python\\EbayLegoWebscrape\\lego.db"
connection = sql.connect(database)

# try/finally so the connection is closed even if the write fails
try:
    df_no_outliers_pbi.to_sql('ebay_no_outliers_pbi', connection, if_exists='replace', index=True)
    # if_exists=replace this is the simplest method. With append it was fully writing a copy of
    # the db every time and doubling the rows.
    # index=True because I set the item_num as the index
finally:
    connection.close()

In [None]:
# change item_num to int64, for some reason int32 gives scientific notation
looped_df['item_num'] = looped_df['item_num'].astype('int64')
looped_df.head()

In [None]:
looped_df.shape

### Write clean price data to db
Write to database a clean dataset of sales data that is loaded into Power BI for analysis.
Much easier to do the clean up and outlier detection in python.
Also easier to import from SQLite db into Power BI than do the csv dance.

# **********  MERGE  **********

In [None]:
# check data types before the merge
df_set.dtypes

In [None]:
looped_df['set_num'] = looped_df['set_num'].astype(int)
looped_df.dtypes

In [None]:
# Left join: keep every row of the daily price data and attach the msrp
# of the matching set from the set metadata.
msrp_lookup = df_set[['set_num', 'msrp']]
joined_df = looped_df.merge(msrp_lookup, on='set_num', how='left')
joined_df.head()

In [None]:
# Dollar appreciation per row: price minus msrp, rounded to cents.
price_minus_msrp = joined_df['price'] - joined_df['msrp']
joined_df['appreciation'] = price_minus_msrp.round(2)

# Percent appreciation per row, relative to msrp and rounded to a whole number.
appreciation_ratio = joined_df['appreciation'] / joined_df['msrp']
joined_df['pct_appreciation'] = (appreciation_ratio * 100).round()

joined_df.head()

In [None]:
# Keep only the rows dated within the 30 days before the moment this cell runs.
today = datetime.datetime.now()
days30ago = today - pd.Timedelta(days=30)

is_recent = joined_df['date'] > days30ago
joined_df = joined_df[is_recent]

In [None]:
# Collapse to a single mean % appreciation per set_num.
mean_pct_by_set = joined_df.groupby('set_num')['pct_appreciation'].mean()
group_df = mean_pct_by_set.reset_index()
group_df.head()

In [None]:
# Left join the per-set % appreciation onto the set metadata, so every set
# is kept whether or not it has an appreciation value.
appreciation_lookup = group_df[['set_num', 'pct_appreciation']]
final_df = df_set.merge(appreciation_lookup, on='set_num', how='left')
# selected_rows = final_df[~final_df['pct_appreciation'].isnull()]
# selected_rows

In [None]:
final_df

In [None]:
# save as csv to use in another notebook for playing around with ML
final_df.to_csv('final_cleaned_df.csv')