In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Read the crowdfunding dataset directly from its hosted CSV file
df_crowdfunding = pd.read_csv(
    'https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_1/datasets/crowdfunding-data.csv'
)
# Bare last expression so the notebook renders the frame
df_crowdfunding

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active,outcome
0,100,0,0,3,0,0,0,17,0
1,1400,14560,158,0,0,1,1,27,1
2,108400,142523,1425,4,0,0,2,20,1
3,4200,2477,24,0,0,0,1,40,0
4,7600,5265,53,0,0,0,3,4,0
...,...,...,...,...,...,...,...,...,...
1124,17130,15894,847,2,0,0,5,6,0
1125,97329,80937,862,6,0,0,3,29,0
1126,53597,40388,58,0,0,0,9,46,0
1127,71588,18102,274,0,0,0,2,43,0


In [3]:
# Define features set
# Build X from every column except the target. `DataFrame.drop` returns a new
# frame, so `df_crowdfunding` is left untouched and this cell can be re-run
# safely without an explicit copy or an in-place mutation.
X = df_crowdfunding.drop(columns="outcome")
X.head()

Unnamed: 0,goal,pledged,backers_count,country,staff_pick,spotlight,category,days_active
0,100,0,0,3,0,0,0,17
1,1400,14560,158,0,0,1,1,27
2,108400,142523,1425,4,0,0,2,20
3,4200,2477,24,0,0,0,1,40
4,7600,5265,53,0,0,0,3,4


In [4]:
# Target vector: the "outcome" column as a single-column 2-D array
y = df_crowdfunding["outcome"].to_numpy().reshape(-1, 1)
# Peek at the first five targets
y[:5]

array([[0],
       [1],
       [1],
       [0],
       [0]])

In [5]:
# Partition features and target into train/test subsets;
# the fixed random_state makes the split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [6]:
# Standardize the features: fit StandardScaler on the training split only
# and scale that same split in one step; `scaler` stays fitted for later reuse
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[-0.84657927, -0.67072939, -0.65649726, ...,  1.81505268,
         0.25646397, -0.46832735],
       [ 1.00836593,  0.93549949,  0.15641618, ...,  1.81505268,
        -0.44343448,  1.30235274],
       [-0.07571731,  1.8773598 ,  0.70230761, ..., -0.5509482 ,
         0.6064132 ,  0.26436786],
       ...,
       [ 0.83393731,  2.43663063,  4.96098984, ..., -0.5509482 ,
         1.48128628, -0.71255908],
       [-0.83345023, -0.64161745, -0.53073263, ...,  1.81505268,
         0.78138782,  0.08119406],
       [-0.83720139, -0.62016151, -0.56536347, ..., -0.5509482 ,
        -0.96835833,  1.36341068]])

In [7]:
# Apply the scaler fitted on the training split to the test split
# (no refitting on the test data)
X_test_scaled = scaler.transform(X=X_test)
X_test_scaled

array([[-0.68152853, -0.55763112, -0.52344192, ..., -0.5509482 ,
        -0.6184091 ,  0.02013613],
       [-0.7959387 , -0.49320588, -0.52708727, ..., -0.5509482 ,
        -1.14333294,  0.93600514],
       [-0.7640539 , -0.60299292, -0.60363966, ..., -0.5509482 ,
        -0.26845987,  0.26436786],
       ...,
       [ 2.82955074,  2.28760658,  0.70139628, ...,  1.81505268,
        -0.6184091 ,  0.8749472 ],
       [ 2.76953229,  0.11303451, -0.17257681, ..., -0.5509482 ,
        -0.79338371, -1.56737016],
       [-0.7884364 , -0.52151394, -0.5589841 , ..., -0.5509482 ,
        -0.6184091 ,  1.36341068]])

In [8]:
# Report the overall extremes of the scaled training and testing arrays
print("Scaled data min/max (StandardScaler):")
for split_name, scaled in (("Training", X_train_scaled), ("Testing", X_test_scaled)):
    print(f"{split_name} data min:", scaled.min())
    print(f"{split_name} data max:", scaled.max())

Scaled data min/max (StandardScaler):
Training data min: -1.6894860276041326
Training data max: 5.191558336924065
Testing data min: -1.6284280935038677
Testing data max: 5.9479688321568975


In [9]:
# Alternative approach: rescale with MinMaxScaler, again fitting on the
# training split only. This rebinds `scaler` and `X_train_scaled`, replacing
# the StandardScaler versions assigned earlier.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[0.00252016, 0.02695997, 0.00742459, ..., 1.        , 0.34782609,
        0.33333333],
       [0.50100806, 0.44843554, 0.1453983 , ..., 1.        , 0.17391304,
        0.81666667],
       [0.20967742, 0.69558033, 0.23805104, ..., 0.        , 0.43478261,
        0.53333333],
       ...,
       [0.45413306, 0.84233338, 0.9608662 , ..., 0.        , 0.65217391,
        0.26666667],
       [0.00604839, 0.03459897, 0.0287703 , ..., 1.        , 0.47826087,
        0.48333333],
       [0.00504032, 0.04022902, 0.0228925 , ..., 0.        , 0.04347826,
        0.83333333]])

In [10]:
# Rescale the test split with the scaler fitted on the training split
# (no refitting on the test data)
X_test_scaled = scaler.transform(X=X_test)
X_test_scaled

array([[0.046875  , 0.05663703, 0.03000773, ..., 0.        , 0.13043478,
        0.46666667],
       [0.01612903, 0.07354226, 0.02938902, ..., 0.        , 0.        ,
        0.71666667],
       [0.02469758, 0.04473407, 0.01639598, ..., 0.        , 0.2173913 ,
        0.53333333],
       ...,
       [0.99042339, 0.80322937, 0.23789637, ..., 1.        , 0.13043478,
        0.7       ],
       [0.97429435, 0.23262016, 0.08955916, ..., 0.        , 0.08695652,
        0.03333333],
       [0.01814516, 0.06611421, 0.02397525, ..., 0.        , 0.13043478,
        0.83333333]])

In [11]:
# Report the overall extremes of the scaled training and testing arrays.
# The scaler was fitted on the training split only, so test values are not
# guaranteed to stay inside the training range and may exceed 1.
print("Scaled data min/max (MinMaxScaler):")
for split_name, scaled in (("Training", X_train_scaled), ("Testing", X_test_scaled)):
    print(f"{split_name} data min:", scaled.min())
    print(f"{split_name} data max:", scaled.max())

Scaled data min/max (MinMaxScaler):
Training data min: 0.0
Training data max: 1.0
Testing data min: 0.0
Testing data max: 1.128383604021655
