# Data Prep

## Reshaping data

In [1]:
# Create a sample data frame
import pandas as pd

songs_dict = {
    'Customer':['Aria', 'Aria', 'Aria', 'Chord', 'Chord', 'Harmony', 'Harmony', 'Harmony', 'Melody', 'Reed'],
    'Genre': ['Pop', 'Indie', 'Rock', 'Pop', 'Indie', 'Pop', 'Indie', 'Rock', 'Rock', 'Rock'],
    '# Songs': [50, 48, 1, 15, 36, 10, 5, 3, 2, 5]
}

df = pd.DataFrame(songs_dict)
df

Unnamed: 0,Customer,Genre,# Songs
0,Aria,Pop,50
1,Aria,Indie,48
2,Aria,Rock,1
3,Chord,Pop,15
4,Chord,Indie,36
5,Harmony,Pop,10
6,Harmony,Indie,5
7,Harmony,Rock,3
8,Melody,Rock,2
9,Reed,Rock,5


In [2]:
# Group by
customer_songs = (df.groupby('Customer')['# Songs']
    .sum()
    .reset_index()
)
customer_songs

Unnamed: 0,Customer,# Songs
0,Aria,99
1,Chord,51
2,Harmony,18
3,Melody,2
4,Reed,5


In [3]:
# Pivot
df

Unnamed: 0,Customer,Genre,# Songs
0,Aria,Pop,50
1,Aria,Indie,48
2,Aria,Rock,1
3,Chord,Pop,15
4,Chord,Indie,36
5,Harmony,Pop,10
6,Harmony,Indie,5
7,Harmony,Rock,3
8,Melody,Rock,2
9,Reed,Rock,5


In [4]:
# Pivot
(df.pivot(
    index='Customer', # Rows
    columns='Genre',  # Columns
    values='# Songs') # Columns values
)

Genre,Indie,Pop,Rock
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aria,48.0,50.0,1.0
Chord,36.0,15.0,
Harmony,5.0,10.0,3.0
Melody,,,2.0
Reed,,,5.0


In [5]:
(df.pivot(
    index='Customer', # Rows
    columns='Genre',  # Columns
    values='# Songs') # Columns values
).fillna(0)

Genre,Indie,Pop,Rock
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aria,48.0,50.0,1.0
Chord,36.0,15.0,0.0
Harmony,5.0,10.0,3.0
Melody,0.0,0.0,2.0
Reed,0.0,0.0,5.0


In [6]:
customers_genres = (df.pivot(
    index='Customer', # Rows
    columns='Genre',  # Columns
    values='# Songs') # Columns values
).fillna(0).reset_index()

customers_genres

Genre,Customer,Indie,Pop,Rock
0,Aria,48.0,50.0,1.0
1,Chord,36.0,15.0,0.0
2,Harmony,5.0,10.0,3.0
3,Melody,0.0,0.0,2.0
4,Reed,0.0,0.0,5.0


## Missing Data

In [7]:
customers_raw = pd.read_csv('Data/customers.csv')
customers_raw.head()

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Harmony,26.0,,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,,"$450,000",5/5/23,No,College
4,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School


In [8]:
customers = customers_raw.copy()
customers.head()

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Harmony,26.0,,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,,"$450,000",5/5/23,No,College
4,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School


In [9]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8 non-null      object 
 1   Age              6 non-null      float64
 2   Followers        5 non-null      float64
 3   Income           8 non-null      object 
 4   Sign Up Date     8 non-null      object 
 5   Discount         8 non-null      object 
 6   Education Level  8 non-null      object 
dtypes: float64(2), object(5)
memory usage: 580.0+ bytes


In [10]:
customers[customers.isna().any(axis=1)]

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
2,Harmony,26.0,,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,,"$450,000",5/5/23,No,College
5,Selena,,1.0,"$62,000",8/26/23,No,College
6,Stefani,,,"$81,000",9/24/23,No,College


In [11]:
# Drop NaN values
customers_dropped = customers.dropna().reset_index(drop=True)
customers_dropped

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School
3,Taylor,33.0,52.0,"$60,000",9/8/23,No,High School


In [12]:
# Deal with NaN values
customers['Age'] = customers.Age.fillna(round(customers.Age.median()))
customers

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Harmony,26.0,,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,,"$450,000",5/5/23,No,College
4,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School
5,Selena,30.0,1.0,"$62,000",8/26/23,No,College
6,Stefani,30.0,,"$81,000",9/24/23,No,College
7,Taylor,33.0,52.0,"$60,000",9/8/23,No,High School


In [13]:
customers['Followers'] = customers.Followers.fillna(0)
customers

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,"$45,000",5/18/23,Yes,College
1,Chord,19.0,12.0,"$28,000",8/23/23,Yes,High School
2,Harmony,26.0,0.0,"$120,000",4/25/23,No,Graduate School
3,Melody,47.0,0.0,"$450,000",5/5/23,No,College
4,Reed,52.0,0.0,"$75,000",6/14/23,Yes,High School
5,Selena,30.0,1.0,"$62,000",8/26/23,No,College
6,Stefani,30.0,0.0,"$81,000",9/24/23,No,College
7,Taylor,33.0,52.0,"$60,000",9/8/23,No,High School


## Converting to Numeric 

In [14]:
customers.dtypes
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8 non-null      object 
 1   Age              8 non-null      float64
 2   Followers        8 non-null      float64
 3   Income           8 non-null      object 
 4   Sign Up Date     8 non-null      object 
 5   Discount         8 non-null      object 
 6   Education Level  8 non-null      object 
dtypes: float64(2), object(5)
memory usage: 580.0+ bytes


In [15]:
customers.Income = customers.Income.str.replace('$','').str.replace(',','')
customers.Income = pd.to_numeric(customers.Income)

In [16]:
print(customers.info())
customers.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8 non-null      object 
 1   Age              8 non-null      float64
 2   Followers        8 non-null      float64
 3   Income           8 non-null      int64  
 4   Sign Up Date     8 non-null      object 
 5   Discount         8 non-null      object 
 6   Education Level  8 non-null      object 
dtypes: float64(2), int64(1), object(4)
memory usage: 580.0+ bytes
None


Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25.0,0.0,45000,5/18/23,Yes,College
1,Chord,19.0,12.0,28000,8/23/23,Yes,High School
2,Harmony,26.0,0.0,120000,4/25/23,No,Graduate School
3,Melody,47.0,0.0,450000,5/5/23,No,College
4,Reed,52.0,0.0,75000,6/14/23,Yes,High School


In [17]:
# Convert float to int (Floats take more memory than ints)
customers.Age = customers.Age.astype(int)
customers.Followers = customers.Followers.astype(int)

customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Name             8 non-null      object
 1   Age              8 non-null      int64 
 2   Followers        8 non-null      int64 
 3   Income           8 non-null      int64 
 4   Sign Up Date     8 non-null      object
 5   Discount         8 non-null      object
 6   Education Level  8 non-null      object
dtypes: int64(3), object(4)
memory usage: 580.0+ bytes


## Converting to DateTime

In [18]:
customers['Sign Up Date'] = pd.to_datetime(customers['Sign Up Date'], format='%m/%d/%y')
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Name             8 non-null      object        
 1   Age              8 non-null      int64         
 2   Followers        8 non-null      int64         
 3   Income           8 non-null      int64         
 4   Sign Up Date     8 non-null      datetime64[ns]
 5   Discount         8 non-null      object        
 6   Education Level  8 non-null      object        
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 580.0+ bytes


In [19]:
customers

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level
0,Aria,25,0,45000,2023-05-18,Yes,College
1,Chord,19,12,28000,2023-08-23,Yes,High School
2,Harmony,26,0,120000,2023-04-25,No,Graduate School
3,Melody,47,0,450000,2023-05-05,No,College
4,Reed,52,0,75000,2023-06-14,Yes,High School
5,Selena,30,1,62000,2023-08-26,No,College
6,Stefani,30,0,81000,2023-09-24,No,College
7,Taylor,33,52,60000,2023-09-08,No,High School


In [20]:
# Extract date into own columns
customers['Sign Up Month'] = customers['Sign Up Date'].dt.month
customers.head()

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level,Sign Up Month
0,Aria,25,0,45000,2023-05-18,Yes,College,5
1,Chord,19,12,28000,2023-08-23,Yes,High School,8
2,Harmony,26,0,120000,2023-04-25,No,Graduate School,4
3,Melody,47,0,450000,2023-05-05,No,College,5
4,Reed,52,0,75000,2023-06-14,Yes,High School,6


In [21]:
customers['Sign Up DOW'] = customers['Sign Up Date'].dt.dayofweek
customers.head()

Unnamed: 0,Name,Age,Followers,Income,Sign Up Date,Discount,Education Level,Sign Up Month,Sign Up DOW
0,Aria,25,0,45000,2023-05-18,Yes,College,5,3
1,Chord,19,12,28000,2023-08-23,Yes,High School,8,2
2,Harmony,26,0,120000,2023-04-25,No,Graduate School,4,1
3,Melody,47,0,450000,2023-05-05,No,College,5,4
4,Reed,52,0,75000,2023-06-14,Yes,High School,6,2


In [22]:
# Since we extracted DOW and month from Sign Up Date, we no longer need that date so we drop it.
customers = customers.drop(columns=['Sign Up Date'])
customers

Unnamed: 0,Name,Age,Followers,Income,Discount,Education Level,Sign Up Month,Sign Up DOW
0,Aria,25,0,45000,Yes,College,5,3
1,Chord,19,12,28000,Yes,High School,8,2
2,Harmony,26,0,120000,No,Graduate School,4,1
3,Melody,47,0,450000,No,College,5,4
4,Reed,52,0,75000,Yes,High School,6,2
5,Selena,30,1,62000,No,College,8,5
6,Stefani,30,0,81000,No,College,9,6
7,Taylor,33,52,60000,No,High School,9,4


## Conditional Logic

In [23]:
import numpy as np

customers['Discount'] = np.where(customers['Discount'] == 'Yes', 1, 0)
customers.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,Education Level,Sign Up Month,Sign Up DOW
0,Aria,25,0,45000,1,College,5,3
1,Chord,19,12,28000,1,High School,8,2
2,Harmony,26,0,120000,0,Graduate School,4,1
3,Melody,47,0,450000,0,College,5,4
4,Reed,52,0,75000,1,High School,6,2


## Dummy variables

In [24]:
dummies_edu = pd.get_dummies(customers['Education Level']).astype(int)
dummies_edu.head()

Unnamed: 0,College,Graduate School,High School
0,1,0,0
1,0,0,1
2,0,1,0
3,1,0,0
4,0,0,1


In [25]:
# Combine dummies_edu and customers tables.
customers = pd.concat([customers, dummies_edu], axis = 1)
customers.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,Education Level,Sign Up Month,Sign Up DOW,College,Graduate School,High School
0,Aria,25,0,45000,1,College,5,3,1,0,0
1,Chord,19,12,28000,1,High School,8,2,0,0,1
2,Harmony,26,0,120000,0,Graduate School,4,1,0,1,0
3,Melody,47,0,450000,0,College,5,4,1,0,0
4,Reed,52,0,75000,1,High School,6,2,0,0,1


In [26]:
customers = customers.drop(columns=['Education Level'])
customers.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up DOW,College,Graduate School,High School
0,Aria,25,0,45000,1,5,3,1,0,0
1,Chord,19,12,28000,1,8,2,0,0,1
2,Harmony,26,0,120000,0,4,1,0,1,0
3,Melody,47,0,450000,0,5,4,1,0,0
4,Reed,52,0,75000,1,6,2,0,0,1


## Feature Engineer

In [27]:
# the data from setting the row granularity, with a few more customers
songs_genres_dict = {'Customer': ['Aria', 'Chord', 'Harmony', 'Melody', 'Reed', 'Selena', 'Stefani', 'Taylor'],
                     '# Songs': [99, 51, 18, 2, 5, 60, 15, 121],
                     'Indie': [48, 36, 5, 0, 0, 20, 2, 19],
                     'Pop': [50, 15, 10, 0, 0, 20, 5, 89],
                     'Rock': [1, 0, 3, 2, 5, 20, 8, 13]}

songs_genres = pd.DataFrame(songs_genres_dict)
songs_genres.head()

Unnamed: 0,Customer,# Songs,Indie,Pop,Rock
0,Aria,99,48,50,1
1,Chord,51,36,15,0
2,Harmony,18,5,10,3
3,Melody,2,0,0,2
4,Reed,5,0,0,5


In [28]:
customers.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up DOW,College,Graduate School,High School
0,Aria,25,0,45000,1,5,3,1,0,0
1,Chord,19,12,28000,1,8,2,0,0,1
2,Harmony,26,0,120000,0,4,1,0,1,0
3,Melody,47,0,450000,0,5,4,1,0,0
4,Reed,52,0,75000,1,6,2,0,0,1


In [29]:
model_df = pd.concat([customers, songs_genres], axis=1).drop(columns=['Customer'])
model_df.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up DOW,College,Graduate School,High School,# Songs,Indie,Pop,Rock
0,Aria,25,0,45000,1,5,3,1,0,0,99,48,50,1
1,Chord,19,12,28000,1,8,2,0,0,1,51,36,15,0
2,Harmony,26,0,120000,0,4,1,0,1,0,18,5,10,3
3,Melody,47,0,450000,0,5,4,1,0,0,2,0,0,2
4,Reed,52,0,75000,1,6,2,0,0,1,5,0,0,5


In [30]:
# Create a new column for percent pop music.
model_df['Pct_Pop'] = model_df.Pop / model_df['# Songs']
model_df.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up DOW,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Pct_Pop
0,Aria,25,0,45000,1,5,3,1,0,0,99,48,50,1,0.505051
1,Chord,19,12,28000,1,8,2,0,0,1,51,36,15,0,0.294118
2,Harmony,26,0,120000,0,4,1,0,1,0,18,5,10,3,0.555556
3,Melody,47,0,450000,0,5,4,1,0,0,2,0,0,2,0.0
4,Reed,52,0,75000,1,6,2,0,0,1,5,0,0,5,0.0


In [31]:
model_df['Weekend'] = np.where(customers['Sign Up DOW'].isin([5,6]), 1, 0)
model_df

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,Sign Up DOW,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Pct_Pop,Weekend
0,Aria,25,0,45000,1,5,3,1,0,0,99,48,50,1,0.505051,0
1,Chord,19,12,28000,1,8,2,0,0,1,51,36,15,0,0.294118,0
2,Harmony,26,0,120000,0,4,1,0,1,0,18,5,10,3,0.555556,0
3,Melody,47,0,450000,0,5,4,1,0,0,2,0,0,2,0.0,0
4,Reed,52,0,75000,1,6,2,0,0,1,5,0,0,5,0.0,0
5,Selena,30,1,62000,0,8,5,1,0,0,60,20,20,20,0.333333,1
6,Stefani,30,0,81000,0,9,6,1,0,0,15,2,5,8,0.333333,1
7,Taylor,33,52,60000,0,9,4,0,0,1,121,19,89,13,0.735537,0


In [32]:
model_df = model_df.drop(columns=['Sign Up DOW'])
model_df

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Pct_Pop,Weekend
0,Aria,25,0,45000,1,5,1,0,0,99,48,50,1,0.505051,0
1,Chord,19,12,28000,1,8,0,0,1,51,36,15,0,0.294118,0
2,Harmony,26,0,120000,0,4,0,1,0,18,5,10,3,0.555556,0
3,Melody,47,0,450000,0,5,1,0,0,2,0,0,2,0.0,0
4,Reed,52,0,75000,1,6,0,0,1,5,0,0,5,0.0,0
5,Selena,30,1,62000,0,8,1,0,0,60,20,20,20,0.333333,1
6,Stefani,30,0,81000,0,9,1,0,0,15,2,5,8,0.333333,1
7,Taylor,33,52,60000,0,9,0,0,1,121,19,89,13,0.735537,0


In [33]:
# Identifying Proxy Variables
# external data with the average temperature in F each month in Chicago
avg_temp_dict = {'Month': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                 'Avg_Temp': [32, 36, 45, 56, 66, 77, 82, 81, 74, 62, 50, 37]}

avg_temp = pd.DataFrame(avg_temp_dict)
avg_temp

Unnamed: 0,Month,Avg_Temp
0,1,32
1,2,36
2,3,45
3,4,56
4,5,66
5,6,77
6,7,82
7,8,81
8,9,74
9,10,62


In [34]:
model_df = pd.merge(model_df, avg_temp, left_on='Sign Up Month', right_on='Month')
model_df.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,Sign Up Month,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Pct_Pop,Weekend,Month,Avg_Temp
0,Aria,25,0,45000,1,5,1,0,0,99,48,50,1,0.505051,0,5,66
1,Chord,19,12,28000,1,8,0,0,1,51,36,15,0,0.294118,0,8,81
2,Harmony,26,0,120000,0,4,0,1,0,18,5,10,3,0.555556,0,4,56
3,Melody,47,0,450000,0,5,1,0,0,2,0,0,2,0.0,0,5,66
4,Reed,52,0,75000,1,6,0,0,1,5,0,0,5,0.0,0,6,77


In [35]:
model_df = model_df.drop(columns=['Sign Up Month'])
model_df.head()

Unnamed: 0,Name,Age,Followers,Income,Discount,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Pct_Pop,Weekend,Month,Avg_Temp
0,Aria,25,0,45000,1,1,0,0,99,48,50,1,0.505051,0,5,66
1,Chord,19,12,28000,1,0,0,1,51,36,15,0,0.294118,0,8,81
2,Harmony,26,0,120000,0,0,1,0,18,5,10,3,0.555556,0,4,56
3,Melody,47,0,450000,0,1,0,0,2,0,0,2,0.0,0,5,66
4,Reed,52,0,75000,1,0,0,1,5,0,0,5,0.0,0,6,77


## Feature Selection

In [36]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8 non-null      object 
 1   Age              8 non-null      int64  
 2   Followers        8 non-null      int64  
 3   Income           8 non-null      int64  
 4   Discount         8 non-null      int64  
 5   College          8 non-null      int64  
 6   Graduate School  8 non-null      int64  
 7   High School      8 non-null      int64  
 8   # Songs          8 non-null      int64  
 9   Indie            8 non-null      int64  
 10  Pop              8 non-null      int64  
 11  Rock             8 non-null      int64  
 12  Pct_Pop          8 non-null      float64
 13  Weekend          8 non-null      int64  
 14  Month            8 non-null      int64  
 15  Avg_Temp         8 non-null      int64  
dtypes: float64(1), int64(14), object(1)
memory usage: 1.1+ KB


In [37]:
names = model_df.Name
names.head()

0       Aria
1      Chord
2    Harmony
3     Melody
4       Reed
Name: Name, dtype: object

In [38]:
model_df = model_df.drop(columns=['Name'])
model_df.head()

Unnamed: 0,Age,Followers,Income,Discount,College,Graduate School,High School,# Songs,Indie,Pop,Rock,Pct_Pop,Weekend,Month,Avg_Temp
0,25,0,45000,1,1,0,0,99,48,50,1,0.505051,0,5,66
1,19,12,28000,1,0,0,1,51,36,15,0,0.294118,0,8,81
2,26,0,120000,0,0,1,0,18,5,10,3,0.555556,0,4,56
3,47,0,450000,0,1,0,0,2,0,0,2,0.0,0,5,66
4,52,0,75000,1,0,0,1,5,0,0,5,0.0,0,6,77


In [39]:
model_subset = model_df[['Age', '# Songs', 'Pct_Pop']]
model_subset.head()

Unnamed: 0,Age,# Songs,Pct_Pop
0,25,99,0.505051
1,19,51,0.294118
2,26,18,0.555556
3,47,2,0.0
4,52,5,0.0


## Normalization

Steps to normalize data:

In [40]:
# 1) Import min-max scalar
from sklearn.preprocessing import MinMaxScaler

# Instanciate object
mm_scalar = MinMaxScaler()

In [41]:
model_subset.head()

Unnamed: 0,Age,# Songs,Pct_Pop
0,25,99,0.505051
1,19,51,0.294118
2,26,18,0.555556
3,47,2,0.0
4,52,5,0.0


In [43]:
mm_scalar.fit(model_subset) # This looked at the model_subset data frame and calculated min and max of all columns

In [44]:
mm_scalar.data_min_ # Gives the min value of each column

array([19.,  2.,  0.])

In [45]:
mm_scalar.data_max_

array([ 52.        , 121.        ,   0.73553719])

In [46]:
# Apply transformation. In the background it uses the min and max found for each column
mm_scalar.transform(model_subset)

array([[0.18181818, 0.81512605, 0.6866417 ],
       [0.        , 0.41176471, 0.39986781],
       [0.21212121, 0.13445378, 0.75530587],
       [0.84848485, 0.        , 0.        ],
       [1.        , 0.02521008, 0.        ],
       [0.33333333, 0.48739496, 0.45318352],
       [0.33333333, 0.1092437 , 0.45318352],
       [0.42424242, 1.        , 1.        ]])

In [50]:
# Do the steps above more efficiently by calling it in one line of code
normalized = mm_scalar.fit_transform(model_subset) # Finds min-max and transform data
normalized

array([[0.18181818, 0.81512605, 0.6866417 ],
       [0.        , 0.41176471, 0.39986781],
       [0.21212121, 0.13445378, 0.75530587],
       [0.84848485, 0.        , 0.        ],
       [1.        , 0.02521008, 0.        ],
       [0.33333333, 0.48739496, 0.45318352],
       [0.33333333, 0.1092437 , 0.45318352],
       [0.42424242, 1.        , 1.        ]])

In [51]:
pd.DataFrame(normalized,columns=model_subset.columns)

Unnamed: 0,Age,# Songs,Pct_Pop
0,0.181818,0.815126,0.686642
1,0.0,0.411765,0.399868
2,0.212121,0.134454,0.755306
3,0.848485,0.0,0.0
4,1.0,0.02521,0.0
5,0.333333,0.487395,0.453184
6,0.333333,0.109244,0.453184
7,0.424242,1.0,1.0


In [52]:
pd.DataFrame(normalized,columns=model_subset.columns).describe()

Unnamed: 0,Age,# Songs,Pct_Pop
count,8.0,8.0,8.0
mean,0.416667,0.372899,0.468523
std,0.34044,0.376023,0.349796
min,0.0,0.0,0.0
25%,0.204545,0.088235,0.299901
50%,0.333333,0.273109,0.453184
75%,0.530303,0.569328,0.703808
max,1.0,1.0,1.0


In [None]:
# Put it all into one cell so its easier to copy and paste
'''
from sklearn.preprocessing import MinMaxScaler

# Instanciate object
mm_scalar = MinMaxScaler()
normalized = mm_scalar.fit_transform(model_subset) # Finds min-max and transform data
pd.DataFrame(normalized,columns=model_subset.columns) # Convert numpy array into dataframe

'''

## Standardization

In [53]:
from sklearn.preprocessing import StandardScaler

In [54]:
std_scaler = StandardScaler()

In [55]:
std_scaler.fit(model_subset) # Calculate mean and standard deviation

In [56]:
std_scaler.mean_

array([32.75      , 46.375     ,  0.34461595])

In [57]:
std_scaler.var_

array([1.10437500e+02, 1.75198438e+03, 5.79223949e-02])

In [58]:
std_scaler.transform(model_subset)

array([[-0.73746841,  1.25726549,  0.66661437],
       [-1.30841169,  0.11049602, -0.20982319],
       [-0.64231119, -0.677908  ,  0.87646562],
       [ 1.35599029, -1.06016449, -1.4318981 ],
       [ 1.83177636, -0.98849139, -1.4318981 ],
       [-0.26168234,  0.32551529, -0.04687987],
       [-0.26168234, -0.74958109, -0.04687987],
       [ 0.0237893 ,  1.78286816,  1.62429915]])

In [59]:
std_scaler.fit_transform(model_subset)

array([[-0.73746841,  1.25726549,  0.66661437],
       [-1.30841169,  0.11049602, -0.20982319],
       [-0.64231119, -0.677908  ,  0.87646562],
       [ 1.35599029, -1.06016449, -1.4318981 ],
       [ 1.83177636, -0.98849139, -1.4318981 ],
       [-0.26168234,  0.32551529, -0.04687987],
       [-0.26168234, -0.74958109, -0.04687987],
       [ 0.0237893 ,  1.78286816,  1.62429915]])

In [61]:
standardized = std_scaler.fit_transform(model_subset)
standardized_df = pd.DataFrame(standardized,columns=model_subset.columns)
standardized_df.head()

Unnamed: 0,Age,# Songs,Pct_Pop
0,-0.737468,1.257265,0.666614
1,-1.308412,0.110496,-0.209823
2,-0.642311,-0.677908,0.876466
3,1.35599,-1.060164,-1.431898
4,1.831776,-0.988491,-1.431898


In [62]:
standardized_df.describe()

Unnamed: 0,Age,# Songs,Pct_Pop
count,8.0,8.0,8.0
mean,0.0,1.387779e-17,-6.938894e-18
std,1.069045,1.069045,1.069045
min,-1.308412,-1.060164,-1.431898
25%,-0.6661,-0.8093087,-0.5153419
50%,-0.261682,-0.283706,-0.04687987
75%,0.35684,0.5584528,0.7190772
max,1.831776,1.782868,1.624299


In [None]:
'''
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
standardized = std_scaler.fit_transform(model_subset)
pd.DataFrame(standardized,columns=model_subset.columns)
'''