In [3]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/module_5_feature_engineering_demo.csv')

In [7]:
df.head()
df.drop('Unnamed: 0', axis=1, inplace=True)

In [51]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features
scaler = StandardScaler()
df_std = scaler.fit_transform(df)

# Perform PCA
pca = PCA()
df_pca = pca.fit_transform(df_std)

# The transformed data is an array, convert it back into a dataframe
df_pca = pd.DataFrame(df_pca, columns=[f'PC{i+1}' for i in range(len(df.columns))])

# Print the explained variance ratio
print('Explained variance ratio:', pca.explained_variance_ratio_)

# Print the cumulative explained variance ratio
cumsum_variance = np.cumsum(pca.explained_variance_ratio_)
print('Cumulative explained variance ratio:', cumsum_variance)

# Show the first few rows of transformed dataframe
df_pca.head()

Explained variance ratio: [0.28857346 0.18072185 0.14675488 0.14381463 0.13558216 0.10353415
 0.00101888]
Cumulative explained variance ratio: [0.28857346 0.46929531 0.61605019 0.75986482 0.89544697 0.99898112
 1.        ]


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
0,-0.395268,-1.182948,1.791319,2.336967,0.037477,0.366259,-0.018202
1,0.819082,-0.205962,2.084828,0.028116,-0.058311,0.446441,-0.061576
2,-1.171866,-0.469183,-0.324207,1.639464,-0.240676,-0.935364,0.040073
3,-1.106793,0.798769,2.060352,0.535101,1.421538,0.537776,0.082936
4,-0.7725,1.500282,0.797973,0.753864,1.115887,-0.730499,0.036682


In [66]:
# Let's do the same, but now let's reduce to 2 components
# Perform PCA
pca = PCA(n_components = 0.5)
df_pca = pca.fit_transform(df_std)

# The transformed data is an array, convert it back into a dataframe
df_pca = pd.DataFrame(df_pca)

# Print the explained variance ratio
print('Explained variance ratio:', pca.explained_variance_ratio_)

# Print the cumulative explained variance ratio
cumsum_variance = np.cumsum(pca.explained_variance_ratio_)
print('Cumulative explained variance ratio:', cumsum_variance)

# Show the first few rows of transformed dataframe
df_pca.head()

Explained variance ratio: [0.28857346 0.18072185 0.14675488]
Cumulative explained variance ratio: [0.28857346 0.46929531 0.61605019]


Unnamed: 0,0,1,2
0,-0.395268,-1.182948,1.791319
1,0.819082,-0.205962,2.084828
2,-1.171866,-0.469183,-0.324207
3,-1.106793,0.798769,2.060352
4,-0.7725,1.500282,0.797973


In [37]:
bikes = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/bikes_sharing.csv')

In [38]:
# Transform datetime to a datetime data format
bikes['datetime'] = pd.to_datetime(bikes['datetime'])
# Extract hour, month, day into new columns
bikes['hour'] = bikes['datetime'].dt.hour
bikes['month'] = bikes['datetime'].dt.month
bikes['day'] = bikes['datetime'].dt.day
bikes

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,day
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,0,1,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2,1,1
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,3,1,1
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,19,12,19
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,20,12,19
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,21,12,19
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,22,12,19


In [39]:
# Create new column with rental count range using total_cat() function created above
# Create function to classify amounts
def total_cat(x):
    if x >= 0 and x < 10:
        return '0-10'
    elif x >= 10 and x < 50:
        return '10-50'
    elif x >= 50 and x < 100:
        return '50-100'
    else:
        return '100+'
bikes['rental_total_group'] = bikes['count'].apply(total_cat)
bikes.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,day,rental_total_group
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,0,1,1,10-50
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,1,10-50
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2,1,1,10-50
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,3,1,1,10-50
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,4,1,1,0-10


In [40]:
# Create function with 2 inputs - temp and humidity - to classify good/bad days
def good_bad(temp,hum):
    if temp > 25 and hum > 70:
        return 'too hot'
    elif temp <=25 and hum >= 50 and hum <= 70:
        return 'so so day'
    else:
        return 'good day'
# apply function
bikes['day_type'] = bikes.apply(lambda x: good_bad(x['temp'], x['humidity']), axis=1)

bikes.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,day,rental_total_group,day_type
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,0,1,1,10-50,good day
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,1,10-50,good day
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2,1,1,10-50,good day
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,3,1,1,10-50,good day
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,4,1,1,0-10,good day


In [41]:
# Dummy variables - convert season to dummies; first - rename season

season_mapping = {1:'winter', 2:'spring', 3:'summer', 4:'fall'}
bikes['season'] = bikes['season'].map(season_mapping)

bikes.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month,day,rental_total_group,day_type
0,2011-01-01 00:00:00,winter,0,0,1,9.84,14.395,81,0.0,3,13,16,0,1,1,10-50,good day
1,2011-01-01 01:00:00,winter,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,1,10-50,good day
2,2011-01-01 02:00:00,winter,0,0,1,9.02,13.635,80,0.0,5,27,32,2,1,1,10-50,good day
3,2011-01-01 03:00:00,winter,0,0,1,9.84,14.395,75,0.0,3,10,13,3,1,1,10-50,good day
4,2011-01-01 04:00:00,winter,0,0,1,9.84,14.395,75,0.0,0,1,1,4,1,1,0-10,good day


In [48]:
# Create season dummies
season_dummies = pd.get_dummies(bikes['season'], dtype=int)
season_dummies.head()

Unnamed: 0,fall,spring,summer,winter
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [43]:
bikes = pd.concat([bikes,season_dummies], axis=1)
bikes.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,count,hour,month,day,rental_total_group,day_type,fall,spring,summer,winter
0,2011-01-01 00:00:00,winter,0,0,1,9.84,14.395,81,0.0,3,...,16,0,1,1,10-50,good day,False,False,False,True
1,2011-01-01 01:00:00,winter,0,0,1,9.02,13.635,80,0.0,8,...,40,1,1,1,10-50,good day,False,False,False,True
2,2011-01-01 02:00:00,winter,0,0,1,9.02,13.635,80,0.0,5,...,32,2,1,1,10-50,good day,False,False,False,True
3,2011-01-01 03:00:00,winter,0,0,1,9.84,14.395,75,0.0,3,...,13,3,1,1,10-50,good day,False,False,False,True
4,2011-01-01 04:00:00,winter,0,0,1,9.84,14.395,75,0.0,0,...,1,4,1,1,0-10,good day,False,False,False,True


In [52]:
df1 = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/module_5_feature_engineering_inclass.csv')

In [55]:
df1.drop('user_id', axis=1, inplace=True)


KeyError: "['user_id'] not found in axis"

In [57]:
df1.head()

Unnamed: 0,age,signup_date,last_purchase_date,total_purchases,total_spent,favorite_product_category,location,gender,email_domain
0,33,1978-11-01 17:03,2020-05-11 4:13,10,840.954993,books,Lake Gerald,female,gmail.com
1,32,2016-04-08 14:19,1959-01-07 5:45,2,1147.412095,books,North Justinburgh,male,yahoo.com
2,19,2021-03-08 7:01,2006-10-29 16:56,6,1301.284835,books,West Alec,female,yahoo.com
3,48,2006-01-11 11:04,1966-07-01 20:22,8,1404.261405,books,Annaton,male,hotmail.com
4,75,1985-06-16 1:41,2013-05-14 12:44,3,1968.752964,books,East Markburgh,female,yahoo.com
