In [22]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [23]:
print(os.listdir())

['.conda', 'colors.csv', 'demo.py', 'downloads_schema.png', 'inventories.csv', 'inventory_parts.csv', 'inventory_sets.csv', 'LEGO_Datasets.ipynb', 'main.py', 'morelearning.ipynb', 'parts.csv', 'part_categories.csv', 'sets.csv', 'themes.csv']


## Creating the Panda dataframes

In [24]:
colors = pd.read_csv("colors.csv")
inventories = pd.read_csv("inventories.csv")
inventory_parts = pd.read_csv("inventory_parts.csv")
inventory_sets = pd.read_csv("inventory_sets.csv")
part_categories = pd.read_csv("part_categories.csv")
parts = pd.read_csv("parts.csv")
sets = pd.read_csv("sets.csv")
themes = pd.read_csv("themes.csv")

## Creating the investigate function

In [25]:
def investigate(db):
    """explores the data"""

    db.name = "d"
    print(f"Exploring the data in the dataframe")
    print(db.index)
    print("")
    print("COLUMN NAMES")
    print(db.columns)
    print()
    nRow, nCol = db.shape
    print(f'There are {nRow} rows and {nCol} columns')
    print()
    print("DATA TYPES")
    print(db.dtypes)
    print()
    print("NUMBER OF UNIQUE VALUES IN EACH COLUMN")
    print(db.nunique())
    print()
    print("PREVIEWING THE DATASET")
    print(db.head())
    print()
    print(db.info())
    print()
    print("NUMBER OF NON-NULL VALUES IN EACH COLUMN")
    print(db.count())


## How the data is organized

The dataset inventory_sets has 3 columns: inventory_id, set_num and quantity.  The inventory_id is the piece/part, the quantity is the number of a specfic piece (inventory_id) included.  The majority of sets have a single piece- it is rare to have multple of the same piece in a set.  A set_num has multiple pieces (inventory_id).  A piece (inventory_id in inventory_sets = id in inventories) in some cases has had different versions (the versions of a piece are in the inventories dataset)

The color is available for each inventory_id, and each inventory_id is categorized (windows, doors, bricks).

Tracking how number of parts have changed = num_parts in sets- relatively simple.  Investiating how those parts have changed- changes in inventory_id colors, categories- more complicated. Changes in theme or parent_theme would be a simpler investigation given you do not have to drill down as much.

# Ideas for this data set
* Group colors into 10 groups (124 colors out of 135 is too many)
* Track popular themes and colors by year
    ** datasets= colors, sets, themes
* Only 12 pieces had more than version 1 released.  Where those the popular colors and themes?
    ** dataset = inventories
* How has the size of sets changed over time?
* What colors are associated with which themes? 
* Could you predict which theme a set is from just by the bricks it contains?
* What sets have the most-used pieces in them? What sets have the rarest pieces in them?
* Have the colors of LEGOs included in sets changed over time?



# Exploring each dataset

## colors dataset

In [26]:
investigate(colors)

Exploring the data in the dataframe
RangeIndex(start=0, stop=135, step=1)

COLUMN NAMES
Index(['id', 'name', 'rgb', 'is_trans'], dtype='object')

There are 135 rows and 4 columns

DATA TYPES
id           int64
name        object
rgb         object
is_trans    object
dtype: object

NUMBER OF UNIQUE VALUES IN EACH COLUMN
id          135
name          1
rgb         124
is_trans      2
dtype: int64

PREVIEWING THE DATASET
   id name     rgb is_trans
0  -1    d  0033B2        f
1   0    d  05131D        f
2   1    d  0055BF        f
3   2    d  237841        f
4   3    d  008F9B        f

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        135 non-null    int64 
 1   name      135 non-null    object
 2   rgb       135 non-null    object
 3   is_trans  135 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.3+ KB
None

NUMBER OF NON-NULL VA

### What's with the color names?

In [27]:
colors['name'].unique()

array(['d'], dtype=object)

### rgb values

In [28]:
uniquecolors = colors['rgb'].nunique()
nRow, nCol = colors.shape

print(f"There are {uniquecolors} unique colors in the dataframe colors out of {nRow} total colors")
print()
print("COUNT OF ALL UNIQUE COLORS")
colors['rgb'].value_counts()


There are 124 unique colors in the dataframe colors out of 135 total colors

COUNT OF ALL UNIQUE COLORS


rgb
FFFFFF    3
000000    3
D9E4A7    2
635F52    2
A5A5CB    2
         ..
FCFCFC    1
F5CD2F    1
C1DFF0    1
F8F184    1
8E5597    1
Name: count, Length: 124, dtype: int64

## inventory_sets dataset

The dataset inventory_sets has 3 columns: inventory_id, set_num and quauntity.  The inventory_id is the piece/part, the quantity is the number of a specfic piece (inventory_id) included.  A set_num has multiple pieces (inventory_id).  A piece (inventory_id in inventory_sets = id in inventories) in some cases has had different versions (the versions of a piece are in the inventories dataset)

The color is available for each inventory_id, and each inventory_id is categorized (windows, doors, bricks).

In [29]:
investigate(inventory_sets)

Exploring the data in the dataframe
RangeIndex(start=0, stop=2846, step=1)

COLUMN NAMES
Index(['inventory_id', 'set_num', 'quantity'], dtype='object')

There are 2846 rows and 3 columns

DATA TYPES
inventory_id     int64
set_num         object
quantity         int64
dtype: object

NUMBER OF UNIQUE VALUES IN EACH COLUMN
inventory_id     589
set_num         2306
quantity           7
dtype: int64

PREVIEWING THE DATASET
   inventory_id  set_num  quantity
0            35  75911-1         1
1            35  75912-1         1
2            39  75048-1         1
3            39  75053-1         1
4            50   4515-1         1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2846 entries, 0 to 2845
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   inventory_id  2846 non-null   int64 
 1   set_num       2846 non-null   object
 2   quantity      2846 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 66.8+ KB


#### Most sets have only 1 of any kind of piece in the set.  There are 2,846 total peices (count of inventory_id).  2,793 pieces only occur once in a set (see below).

To do: filter using Python to find what iventory_ids have quantities over 1.  What set_nums are they part of, what themes are they part of?

In [37]:
print("COUNT OF quantities...quantity and the number with that quantity")
inventory_sets['quantity'].value_counts()

COUNT OF quantities...quantity and the number with that quantity


quantity
1     2793
2       28
60      17
3        5
7        1
5        1
20       1
Name: count, dtype: int64

quantity= the number of that iventory_id were included.  Each set_num has meany inventory_ids.


What does it mean that quantity was "1" for most sets?  What does it mean if there was more than 1 of a set? the set "Basic Building Set" with set_num = 044-1 had what quantity?  This can't be the number sold, because the highest number is 60.

044-1,Basic Building Set,1968,366,225

set_num 4520-1 had quantity = 2
name = Curved Rails
year = 1991
theme_id =244
num_parts = 8

14 sets had theme 244

set_num = 4520-1, Curved Rails, 1991, theme = 244, 8 parts
set_num = 4531-1, Manual Points, 1991, theme= 244, 6 parts

set num 4530-1 had quantity 2 of iventory_id 50, quantity 5 of inventory_id 309, quantity 1 of inventory 6050, etc.


## inventories dataset

In [31]:
investigate(inventories)

Exploring the data in the dataframe
RangeIndex(start=0, stop=11681, step=1)

COLUMN NAMES
Index(['id', 'version', 'set_num'], dtype='object')

There are 11681 rows and 3 columns

DATA TYPES
id          int64
version     int64
set_num    object
dtype: object

NUMBER OF UNIQUE VALUES IN EACH COLUMN
id         11681
version        5
set_num    11670
dtype: int64

PREVIEWING THE DATASET
   id  version set_num
0   1        1  7922-1
1   3        1  3931-1
2   4        1  6942-1
3  15        1  5158-1
4  16        1   903-1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11681 entries, 0 to 11680
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       11681 non-null  int64 
 1   version  11681 non-null  int64 
 2   set_num  11681 non-null  object
dtypes: int64(2), object(1)
memory usage: 273.9+ KB
None

NUMBER OF NON-NULL VALUES IN EACH COLUMN
id         11681
version    11681
set_num    11681
dtype: int64


#### Very few sets had more than one version released

In [32]:
print("COUNT OF DIFFERENT VERSIONS")
inventories['version'].value_counts()

COUNT OF DIFFERENT VERSIONS


version
1    11669
2        9
3        1
4        1
5        1
Name: count, dtype: int64

## themes dataset

In [33]:
investigate(themes)

Exploring the data in the dataframe
RangeIndex(start=0, stop=614, step=1)

COLUMN NAMES
Index(['id', 'name', 'parent_id'], dtype='object')

There are 614 rows and 3 columns

DATA TYPES
id             int64
name          object
parent_id    float64
dtype: object

NUMBER OF UNIQUE VALUES IN EACH COLUMN
id           614
name           1
parent_id     78
dtype: int64

PREVIEWING THE DATASET
   id name  parent_id
0   1    d        NaN
1   2    d        1.0
2   3    d        1.0
3   4    d        1.0
4   5    d        1.0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         614 non-null    int64  
 1   name       614 non-null    object 
 2   parent_id  503 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ KB
None

NUMBER OF NON-NULL VALUES IN EACH COLUMN
id           614
name         614
parent_id    503
dtype: int64

### 78 total themes

In [34]:
nthemegroups = themes.parent_id.nunique()
print(f"There are {nthemegroups} total themes.")

There are 78 total themes.


## sets dataset

In [35]:
investigate(sets)

Exploring the data in the dataframe
RangeIndex(start=0, stop=11673, step=1)

COLUMN NAMES
Index(['set_num', 'name', 'year', 'theme_id', 'num_parts'], dtype='object')

There are 11673 rows and 5 columns

DATA TYPES
set_num      object
name         object
year          int64
theme_id      int64
num_parts     int64
dtype: object

NUMBER OF UNIQUE VALUES IN EACH COLUMN
set_num      11673
name             1
year            66
theme_id       575
num_parts     1092
dtype: int64

PREVIEWING THE DATASET
  set_num name  year  theme_id  num_parts
0    00-1    d  1970       414        471
1  0011-2    d  1978        84         12
2  0011-3    d  1987       199          2
3  0012-1    d  1979       143         12
4  0013-1    d  1979       143         12

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11673 entries, 0 to 11672
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   set_num    11673 non-null  object
 1   name       11673 

In [36]:
inventories.set_num.value_counts

<bound method IndexOpsMixin.value_counts of 0          7922-1
1          3931-1
2          6942-1
3          5158-1
4           903-1
           ...   
11676     31066-1
11677     71018-7
11678    71018-17
11679     60159-1
11680     75090-2
Name: set_num, Length: 11681, dtype: object>

## sets part_categories

In [38]:
investigate(part_categories)

Exploring the data in the dataframe
RangeIndex(start=0, stop=57, step=1)

COLUMN NAMES
Index(['id', 'name'], dtype='object')

There are 57 rows and 2 columns

DATA TYPES
id       int64
name    object
dtype: object

NUMBER OF UNIQUE VALUES IN EACH COLUMN
id      57
name     1
dtype: int64

PREVIEWING THE DATASET
   id name
0   1    d
1   2    d
2   3    d
3   4    d
4   5    d

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      57 non-null     int64 
 1   name    57 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.0+ KB
None

NUMBER OF NON-NULL VALUES IN EACH COLUMN
id      57
name    57
dtype: int64
