In [220]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Select the Colab renderer for Plotly figures (the attribute is `default`)
pio.renderers.default = 'colab'

from itables import show

In [221]:
# Remote CSV that is read into the DataFrame used throughout the notebook
file_location = 'https://joannabieri.com/introdatascience/data/lego_sample.csv'
DF = pd.read_csv(filepath_or_buffer=file_location)

In [222]:
# Display the loaded DataFrame
DF

Unnamed: 0,item_number,set_name,theme,pieces,price,amazon_price,year,ages,pages,minifigures,packaging,weight,unique_pieces,size
0,10859,My First Ladybird,DUPLO®,6,4.99,16.00,2018,Ages_1½-3,9,,Box,,5,Large
1,10860,My First Race Car,DUPLO®,6,4.99,9.45,2018,Ages_1½-3,9,,Box,0.13Kg (0.29 lb),6,Large
2,10862,My First Celebration,DUPLO®,41,14.99,39.89,2018,Ages_1½-3,9,,Box,,18,Large
3,10864,Large Playground Brick Box,DUPLO®,71,49.99,56.69,2018,Ages_2-5,32,2.0,Plastic box,1.41Kg (3.11 lb),49,Large
4,10867,Farmers' Market,DUPLO®,26,19.99,36.99,2018,Ages_2-5,9,3.0,Box,,18,Large
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,60251,Monster Truck,City,55,9.99,8.99,2020,Ages_5+,32,1.0,Box,0.14Kg (0.31 lb),34,Small
71,60252,Construction Bulldozer,City,126,19.99,15.99,2020,Ages_4+,84,2.0,Box,0.35Kg (0.78 lb),81,Small
72,60258,Tuning Workshop,City,897,99.99,99.99,2020,Ages_6+,389,7.0,Box,1.63Kg (3.58 lb),411,Small
73,60266,Ocean Exploration Ship,City,745,149.99,149.99,2020,Ages_7+,229,8.0,Box,2.28Kg (5.03 lb),314,Small


In [223]:
#Question 1: How many observations and variables are in the dataset?
# DF.shape is (number of rows, number of columns)
obv, var = DF.shape

print(f"There are {obv} observations in the dataset.")
print(f"There are {var} variables in the dataset.")

There are 75 observations in the dataset.
There are 14 variables in the dataset.


In [224]:
#Question 2: What are the names of the variables?
var_names = DF.columns

# Hand-assigned type label for each column, listed in column order
data_types = [
    'Numerical', 'Categorical', 'Categorical', 'Numerical', 'Numerical',
    'Numerical', 'Categorical', 'Categorical', 'Numerical', 'Numerical',
    'Categorical', 'Numerical', 'Numerical', 'Categorical',
]

# One row per variable, with the row labels shifted to start at 1
var_name_DF = pd.DataFrame(var_names, columns=['Variable_Names'])
var_name_DF.index = var_name_DF.index + 1
var_name_DF['Data_Type'] = data_types

var_name_DF

Unnamed: 0,Variable_Names,Data_Type
1,item_number,Numerical
2,set_name,Categorical
3,theme,Categorical
4,pieces,Numerical
5,price,Numerical
6,amazon_price,Numerical
7,year,Categorical
8,ages,Categorical
9,pages,Numerical
10,minifigures,Numerical


**Item Number (Numerical): This is a unique integer assigned to each set for identification purposes.**

**Set Name (Categorical): This is the unique, recognizable name assigned to each LEGO set.**

**Theme (Categorical): Each set is classified under a single, predefined LEGO theme.**

**Pieces (Numerical): This represents the total count of individual pieces included in a set.**

**Price (Numerical): This is the manufacturer's suggested retail price for the set.**

**Amazon Price (Numerical): This is the selling price of the set listed on Amazon.**

**Year (Categorical): This indicates the release year, grouping sets by when they were introduced.**

**Ages (Categorical): This groups sets into categories based on the recommended age range (e.g., "6-12", "18+").**

**Pages (Numerical): This is the total page count of the instruction manual.**

**Minifigures (Numerical): This represents the total count of minifigures included in the set.**

**Packaging (Categorical): This describes the type of packaging used for the set, such as a box or bag.**

**Weight (Numerical): This is the physical weight of the complete, packaged set.**

**Unique Pieces (Numerical): This is the count of distinct element types within the set.**

**Size (Categorical): This groups sets into predefined size categories, such as "Small" or "Large."**

In [225]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
DF.describe()

Unnamed: 0,item_number,pieces,price,amazon_price,year,pages,minifigures,unique_pieces
count,75.0,75.0,75.0,75.0,75.0,75.0,65.0,75.0
mean,37498.493333,196.453333,32.136667,39.393733,2019.053333,73.213333,2.615385,96.666667
std,20461.267542,213.914121,27.747263,33.301259,0.820239,76.949842,2.133749,89.471018
min,10859.0,6.0,4.99,6.29,2018.0,1.0,1.0,5.0
25%,10916.5,37.0,9.99,17.98,2018.0,14.0,1.0,30.0
50%,41378.0,93.0,19.99,29.99,2019.0,44.0,2.0,61.0
75%,60193.0,323.5,39.99,53.935,2020.0,116.0,3.0,151.0
max,60267.0,897.0,149.99,184.99,2020.0,389.0,15.0,411.0


**The cheapest Lego set by regular price cost $4.99, while the lowest Amazon price was $6.29, a difference of $1.30.**

**The smallest lego set only had 6 pieces total.**

**The lego set with the most minifigures had a minifigure count of 15.**

**The Lego set with the longest instruction manual had 389 pages.**

In [226]:
# Keep only the columns needed to look at how price and piece count change by year:
# price, year, pieces, amazon price and the item number.
lego_mask = ['price', 'year', 'pieces', 'amazon_price', 'item_number']
DF_mask = DF[lego_mask]

# Display the reduced DataFrame
DF_mask

Unnamed: 0,price,year,pieces,amazon_price,item_number
0,4.99,2018,6,16.00,10859
1,4.99,2018,6,9.45,10860
2,14.99,2018,41,39.89,10862
3,49.99,2018,71,56.69,10864
4,19.99,2018,26,36.99,10867
...,...,...,...,...,...
70,9.99,2020,55,8.99,60251
71,19.99,2020,126,15.99,60252
72,99.99,2020,897,99.99,60258
73,149.99,2020,745,149.99,60266


In [227]:
# Frequency of each distinct item_number value
DF_mask['item_number'].value_counts()

item_number
10859    1
60202    1
60183    1
60182    1
60172    1
        ..
10929    1
10928    1
10927    1
10926    1
60267    1
Name: count, Length: 75, dtype: int64

In [228]:
# Frequency of each distinct price value
DF_mask['price'].value_counts()

price
19.99     13
9.99      13
29.99     10
39.99      8
59.99      4
49.99      4
14.99      4
7.99       4
69.99      3
99.99      2
4.99       2
6.99       2
24.99      2
34.99      1
119.99     1
79.99      1
149.99     1
Name: count, dtype: int64

In [229]:
# Frequency of each distinct amazon_price value
DF_mask['amazon_price'].value_counts()

amazon_price
8.99      5
17.99     4
21.99     3
14.99     3
16.00     3
99.99     2
59.99     2
19.99     1
39.95     1
79.95     1
69.95     1
15.32     1
49.95     1
30.89     1
16.49     1
64.75     1
65.99     1
17.97     1
29.95     1
69.99     1
48.53     1
31.46     1
23.83     1
56.00     1
53.99     1
54.80     1
31.99     1
35.98     1
52.74     1
23.98     1
15.99     1
18.93     1
31.27     1
7.99      1
26.99     1
39.89     1
56.69     1
36.99     1
9.99      1
128.95    1
74.50     1
98.99     1
19.55     1
21.11     1
23.27     1
6.29      1
29.99     1
31.67     1
40.40     1
45.95     1
53.88     1
184.99    1
34.00     1
18.64     1
18.95     1
35.99     1
62.99     1
40.00     1
9.45      1
149.99    1
Name: count, dtype: int64

In [230]:
# Frequency of each distinct pieces value
DF_mask['pieces'].value_counts()

pieces
37     3
6      2
363    2
83     2
88     2
      ..
85     1
134    1
337    1
715    1
168    1
Name: count, Length: 66, dtype: int64

In [231]:
# Number of sets recorded for each year
DF_mask['year'].value_counts()

year
2020    27
2019    25
2018    23
Name: count, dtype: int64

In [232]:
# TODO: write 1-2 sentences on what the value counts showed for each of the columns

In [233]:
# Scatter of piece count against price, coloured by theme, with the set name on hover
px.scatter(
    DF,
    x='price',
    y='pieces',
    color='theme',
    hover_data='set_name',
)

**More expensive sets do not necessarily have more pieces: some outliers are expensive despite a low piece count. These sets tend to be in the Lego DUPLO theme, while in Lego Friends and Lego City a higher piece count does in fact correlate with a higher price.**

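**The most expensive set is the Lego City Ocean Exploration Ship at a retail price of $149.99. The set with the most pieces is the Lego City Tuning Workshop (897 pieces) at a price of $99.99.**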

**Lego DUPLO tends to have a lower piece count in comparison to the other two themes, Lego Friends and Lego City**

In [244]:
# Build a new DataFrame from DF with two derived columns (DF itself is not modified):
#   price_difference - amazon_price minus price (negative when Amazon is cheaper)
#   amz_cheaper      - True where amazon_price is strictly below price
# The comparison already yields booleans, so no np.where wrapper is needed.
DF_price = DF.assign(
    price_difference=lambda d: d['amazon_price'] - d['price'],
    amz_cheaper=lambda d: d['amazon_price'] < d['price'],
)
DF_price

Unnamed: 0,item_number,set_name,theme,pieces,price,amazon_price,year,ages,pages,minifigures,packaging,weight,unique_pieces,size,price_difference,amz_cheaper
0,10859,My First Ladybird,DUPLO®,6,4.99,16.00,2018,Ages_1½-3,9,,Box,,5,Large,11.01,False
1,10860,My First Race Car,DUPLO®,6,4.99,9.45,2018,Ages_1½-3,9,,Box,0.13Kg (0.29 lb),6,Large,4.46,False
2,10862,My First Celebration,DUPLO®,41,14.99,39.89,2018,Ages_1½-3,9,,Box,,18,Large,24.90,False
3,10864,Large Playground Brick Box,DUPLO®,71,49.99,56.69,2018,Ages_2-5,32,2.0,Plastic box,1.41Kg (3.11 lb),49,Large,6.70,False
4,10867,Farmers' Market,DUPLO®,26,19.99,36.99,2018,Ages_2-5,9,3.0,Box,,18,Large,17.00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,60251,Monster Truck,City,55,9.99,8.99,2020,Ages_5+,32,1.0,Box,0.14Kg (0.31 lb),34,Small,-1.00,True
71,60252,Construction Bulldozer,City,126,19.99,15.99,2020,Ages_4+,84,2.0,Box,0.35Kg (0.78 lb),81,Small,-4.00,True
72,60258,Tuning Workshop,City,897,99.99,99.99,2020,Ages_6+,389,7.0,Box,1.63Kg (3.58 lb),411,Small,0.00,False
73,60266,Ocean Exploration Ship,City,745,149.99,149.99,2020,Ages_7+,229,8.0,Box,2.28Kg (5.03 lb),314,Small,0.00,False


In [243]:
# Histogram of the price_difference column, coloured by theme
fig = px.histogram(
    DF_price,
    x='price_difference',
    color='theme',
    nbins=120,
)

fig.update_layout(
    title=dict(
        text='Histogram of Price Differences between Amazon Prices and Regular Prices',
        x=0.5,  # horizontal title position
    ),
    xaxis=dict(title='Price Difference'),
    yaxis=dict(title='Count'),
    bargap=0.1,
)
fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of [0, 1] but received: price_difference

In [238]:
#DF_price = DF_price.groupby('theme')
#DF_price['pieces'].describe()

AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'

In [None]:
# Work on a copy of DF for the price lookups in the following cells
DF_exp = DF.copy() 
DF_exp

In [None]:
# Highest retail price in the dataset
max_retail = DF_exp['price'].max()
print(f"The max retail price is ${max_retail}")

# Show every set whose retail price equals that maximum
max_mask = (DF_exp['price'] >= max_retail)
DF_exp[max_mask]

In [None]:
# Highest Amazon price in the dataset
max_amazon = DF_exp['amazon_price'].max()
print(f"The max amazon price is ${max_amazon}")

# Rows whose Amazon price reaches that maximum
amazon_mask = DF_exp['amazon_price'].ge(max_amazon)
DF_exp.loc[amazon_mask]

In [None]:
# Sets priced above 100 on Amazon or at retail (either column is enough)
hunnid_mask = DF_exp[['amazon_price', 'price']].gt(100).any(axis=1)
DF_exp.loc[hunnid_mask]

In [None]:
# Scatter of price against year for every set in DF
price_fig = (
    px.scatter(
        DF,
        x='year',
        y='price',
        title='Does the Price increase over the 3 Years',
    )
    .update_layout(
        template='plotly_white',
        xaxis_title='Year',
        yaxis_title='Price',
    )
    .update_xaxes(dtick=1)  # tick spacing of 1 on the year axis
)
price_fig.show()

In [None]:
# Scatter of piece count against year for every set in DF
piece_fig = (
    px.scatter(
        DF,
        x='year',
        y='pieces',
        title='Does the Number of Pieces increase over the 3 Years',
    )
    .update_layout(
        template='plotly_white',
        xaxis_title='Year',
        yaxis_title='Pieces',
    )
    .update_xaxes(dtick=1)  # tick spacing of 1 on the year axis
)
piece_fig.show()

In [None]:
# Average price and piece count for the sets whose year is 2018
DF_years = DF.copy()
mask_2018 = DF_years['year'].eq(2018)
DF_2018 = DF_years.loc[mask_2018]
avg_2018 = DF_2018['price'].mean()
avg_piece_2018 = DF_2018['pieces'].mean()

print(
    f"The average price for a Lego set in 2018 was ${avg_2018}",
    f"\nThe average pieces in a Lego set in 2018 was {avg_piece_2018}",
    sep="\n",
)

In [None]:
# Average price and piece count for the sets whose year is 2019
mask_2019 = DF_years['year'].eq(2019)
DF_2019 = DF_years.loc[mask_2019]
avg_2019 = DF_2019['price'].mean()
avg_piece_2019 = DF_2019['pieces'].mean()

print(
    f"The average price for a Lego set in 2019 was ${avg_2019}",
    f"\nThe average pieces in a Lego set in 2019 was {avg_piece_2019}",
    sep="\n",
)

In [None]:
# Average price and piece count for the sets whose year is 2020
mask_2020 = DF_years['year'].eq(2020)
DF_2020 = DF_years.loc[mask_2020]
avg_2020 = DF_2020['price'].mean()
avg_piece_2020 = DF_2020['pieces'].mean()

print(
    f"The average price for a Lego set in 2020 was ${avg_2020}",
    f"\nThe average pieces in a Lego set in 2020 was {avg_piece_2020}",
    sep="\n",
)