In [50]:
# Loading libraries
import pandas as pd # Reads, writes, shapes, manipulates data
import numpy as np # Basic stats and numerical operations

In [51]:
# Read the raw Brooks shoes data from BrooksShoes.csv into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
shoes_df = pd.read_csv("BrooksShoes.csv")

In [52]:
# Print a summary of the raw frame: column names, non-null counts and dtypes.
# The non-null counts show which columns contain missing values.
shoes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Name                   50 non-null     object 
 1   Type                   50 non-null     object 
 2   Price                  50 non-null     float64
 3   Support                50 non-null     object 
 4   Experience             50 non-null     object 
 5   Surface                50 non-null     object 
 6   Midsole Drop(mm)       48 non-null     float64
 7   Weight(g)              50 non-null     float64
 8   High Arch              39 non-null     object 
 9   Medium Arch            50 non-null     object 
 10  Flat Arch              16 non-null     object 
 11  Segmented Crash Pad    3 non-null      object 
 12  DNA LOFT               8 non-null      object 
 13  BioMoGo DNA            34 non-null     object 
 14  3D Fit Print           6 non-null      object 
 15  DNA AMP 

In [53]:
# Drop the thirteen feature columns that are not needed downstream.
# NOTE: shoes_df is rebound here, so running this cell a second time raises
# KeyError because the columns are already gone.
shoes_df = shoes_df.drop(
    columns=[
        'Segmented Crash Pad',
        'DNA LOFT',
        'BioMoGo DNA',
        '3D Fit Print',
        'DNA AMP',
        'GuideRails',
        'DNA Midsole',
        'Ballistic Rock Shield',
        'Gore-Tex',
        'DNA Flash',
        'Rapid Roll',
        'DNA ZERO',
        'Super DNA',
    ]
)

In [54]:
# Confirm the drop: the summary should no longer list the removed feature columns.
shoes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              50 non-null     object 
 1   Type              50 non-null     object 
 2   Price             50 non-null     float64
 3   Support           50 non-null     object 
 4   Experience        50 non-null     object 
 5   Surface           50 non-null     object 
 6   Midsole Drop(mm)  48 non-null     float64
 7   Weight(g)         50 non-null     float64
 8   High Arch         39 non-null     object 
 9   Medium Arch       50 non-null     object 
 10  Flat Arch         16 non-null     object 
dtypes: float64(3), object(8)
memory usage: 4.4+ KB


In [55]:
# Switch every column to a snake_case name (units kept as suffixes).
# Note that the raw "Type" column is renamed to "gender".
shoes_df = shoes_df.rename(
    columns={
        "Name": "name",
        "Type": "gender",
        "Price": "price",
        "Support": "support",
        "Experience": "experience",
        "Surface": "surface",
        "Midsole Drop(mm)": "midsole_drop_mm",
        "Weight(g)": "weight_g",
        "High Arch": "high_arch",
        "Medium Arch": "medium_arch",
        "Flat Arch": "flat_arch",
    }
)

In [56]:
# Confirm the rename: every column should now appear under its snake_case name.
shoes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             50 non-null     object 
 1   gender           50 non-null     object 
 2   price            50 non-null     float64
 3   support          50 non-null     object 
 4   experience       50 non-null     object 
 5   surface          50 non-null     object 
 6   midsole_drop_mm  48 non-null     float64
 7   weight_g         50 non-null     float64
 8   high_arch        39 non-null     object 
 9   medium_arch      50 non-null     object 
 10  flat_arch        16 non-null     object 
dtypes: float64(3), object(8)
memory usage: 4.4+ KB


In [57]:
# Count the missing (NA) values in each column; the resulting Series is the cell output.
shoes_df.isna().sum()

name                0
gender              0
price               0
support             0
experience          0
surface             0
midsole_drop_mm     2
weight_g            0
high_arch          11
medium_arch         0
flat_arch          34
dtype: int64

In [58]:
# Fill the two missing midsole drop values by hand; the numbers were looked up
# on the Brooks Running website.
# Rows are addressed by index label, so this relies on the CSV keeping its
# current row order.
midsole_drop_fixes = {9: 6, 39: 8}  # row label -> midsole drop in mm
for row_label, drop_mm in midsole_drop_fixes.items():
    shoes_df.loc[row_label, 'midsole_drop_mm'] = drop_mm

In [59]:
# Confirm the manual fill: midsole_drop_mm should now report 0 missing values.
shoes_df.isna().sum()

name                0
gender              0
price               0
support             0
experience          0
surface             0
midsole_drop_mm     0
weight_g            0
high_arch          11
medium_arch         0
flat_arch          34
dtype: int64

In [60]:
# The remaining NA's are arch cells that were left blank in the source data
# instead of being marked 'No'.
# The fill is limited to the arch columns on purpose: a frame-wide
# fillna('No') would also write the string 'No' into any numeric column
# (price, weight_g, ...) that gained a missing value in a future version of
# the CSV, silently turning it into an object column.
arch_fill_values = {'high_arch': 'No', 'medium_arch': 'No', 'flat_arch': 'No'}
shoes_df = shoes_df.fillna(value=arch_fill_values)

In [61]:
# Confirm the fill: every column should now report 0 missing values.
shoes_df.isna().sum()

name               0
gender             0
price              0
support            0
experience         0
surface            0
midsole_drop_mm    0
weight_g           0
high_arch          0
medium_arch        0
flat_arch          0
dtype: int64

In [63]:
# Convert the arch columns from 'Yes'/'No' strings to real booleans.
# An explicit map is used instead of replace(...).astype(bool): astype(bool)
# treats every non-empty string as True, so a stray label such as 'no' or
# ' No' would silently be stored as True. With map, any label other than
# 'Yes'/'No' becomes NaN and is reported as an error instead.
yes_no_to_bool = {'Yes': True, 'No': False}
columns = ['high_arch', 'medium_arch', 'flat_arch']
for col in columns:
    if pd.api.types.is_bool_dtype(shoes_df[col]):
        continue  # already converted, e.g. the cell was run a second time
    as_bool = shoes_df[col].map(yes_no_to_bool)
    if as_bool.isna().any():
        unexpected = shoes_df.loc[as_bool.isna(), col].unique().tolist()
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': {unexpected}"
        )
    shoes_df[col] = as_bool.astype(bool)

In [64]:
# Confirm the conversion: high_arch, medium_arch and flat_arch should now have dtype bool.
shoes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             50 non-null     object 
 1   gender           50 non-null     object 
 2   price            50 non-null     float64
 3   support          50 non-null     object 
 4   experience       50 non-null     object 
 5   surface          50 non-null     object 
 6   midsole_drop_mm  50 non-null     float64
 7   weight_g         50 non-null     float64
 8   high_arch        50 non-null     bool   
 9   medium_arch      50 non-null     bool   
 10  flat_arch        50 non-null     bool   
dtypes: bool(3), float64(3), object(5)
memory usage: 3.4+ KB


In [65]:
# Read the randomly generated customer data from BrooksCustomers.csv into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
customer_df = pd.read_csv("BrooksCustomers.csv")

In [66]:
# Print a summary of the raw customer frame: column names, non-null counts and dtypes.
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  500 non-null    int64 
 1   price_range  500 non-null    int64 
 2   support      500 non-null    object
 3   run_type     500 non-null    object
 4   arch_type    500 non-null    object
 5   type         500 non-null    object
dtypes: int64(2), object(4)
memory usage: 23.6+ KB


In [67]:
# Normalise the customer labels in run_type and arch_type:
#   'Highway' -> 'Road', 'Path' -> 'Trail', 'Low Arch' -> 'Flat Arch'
# (presumably so they line up with the wording used in the shoes data;
# confirm against the values in shoes_df before joining on them).
# The same mapping is applied to both columns; values not listed are left as is.
label_fixes = {'Highway': 'Road', 'Path': 'Trail', 'Low Arch': 'Flat Arch'}
columns = ['run_type', 'arch_type']
customer_df[columns] = customer_df[columns].replace(label_fixes)

In [68]:
# Rename the customer columns that would otherwise be ambiguous next to the
# shoes data.
# The raw CSV header for the last column is 'type ' with a trailing space.
# Header whitespace is stripped first so the rename keeps working whether or
# not that stray space is present; rename() silently ignores keys it cannot
# find, so a mismatch would otherwise go unnoticed.
customer_df = customer_df.rename(columns=str.strip).rename(
    columns={'support': 'customer_support', 'type': 'customer_gender'}
)

In [69]:
# Confirm the rename: the summary should list customer_support and customer_gender.
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customer_id       500 non-null    int64 
 1   price_range       500 non-null    int64 
 2   customer_support  500 non-null    object
 3   run_type          500 non-null    object
 4   arch_type         500 non-null    object
 5   customer_gender   500 non-null    object
dtypes: int64(2), object(4)
memory usage: 23.6+ KB


In [71]:
# Write both cleaned frames to new CSV files next to the notebook.
# index=False keeps the RangeIndex out of the files.
for frame, out_path in (
    (shoes_df, 'clean_BrooksShoes.csv'),
    (customer_df, 'clean_BrooksCustomers.csv'),
):
    frame.to_csv(out_path, index=False)

In [72]:
# Read the two cleaned CSV files back in as fresh DataFrames.
clean_shoes, clean_customer = (
    pd.read_csv(csv_path)
    for csv_path in ('clean_BrooksShoes.csv', 'clean_BrooksCustomers.csv')
)