In [3]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# for the yeo-johnson transformation
import scipy.stats as stats

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to save the trained scaler class
import joblib

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [4]:
# load dataset
data = pd.read_csv('seoul.csv')

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(960113, 26)


Unnamed: 0.1,Unnamed: 0,Duration,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,Pmin,PDweek,Dmonth,Dday,Dhour,Dmin,DDweek,Temp,Precip,Wind,Humid,Solar,Snow,GroundTemp,Dust
0,14,11,1800,37.52565,126.887817,37.535961,126.8983,1.472768,1,1,0,5,0,1,1,0,17,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
1,43,14,2280,37.554859,126.936157,37.549904,126.955147,1.762402,1,1,0,17,0,1,1,0,31,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
2,49,13,2160,37.53186,127.067192,37.539654,127.052589,1.552109,1,1,0,17,0,1,1,0,32,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
3,61,20,5250,37.484661,126.9039,37.507332,126.87973,3.301778,1,1,0,21,0,1,1,0,42,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
4,65,3,630,37.479916,126.90284,37.483192,126.902031,0.371204,1,1,0,22,0,1,1,0,26,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0


In [10]:
#tmp = data.drop(['Unnamed: 0'],axis=1,inplace=True)
data.to_csv('train.csv',index=False)

In [5]:
data.isnull().sum()

Unnamed: 0    0
Duration      0
Distance      0
PLong         0
PLatd         0
DLong         0
DLatd         0
Haversine     0
Pmonth        0
Pday          0
Phour         0
Pmin          0
PDweek        0
Dmonth        0
Dday          0
Dhour         0
Dmin          0
DDweek        0
Temp          0
Precip        0
Wind          0
Humid         0
Solar         0
Snow          0
GroundTemp    0
Dust          0
dtype: int64

In [6]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Unnamed: 0', 'Duration'], axis=1), # predictive variables
    data['Duration'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((864101, 24), (96012, 24))

In [7]:
X_train.to_csv('train.csv', index=False)
X_test.to_csv('test.csv', index=False)


# Target

In [14]:
y_train = np.log(y_train)
y_test = np.log(y_test)

# Missing Values 
Since we don't have any missing values as if now. We will not do any transformations.
## But as of convention, perfom:
Categorical variables
Replace missing values with the string "missing" in those variables with a lot of missing data.

Alternatively, we will replace missing data with the most frequent category in those variables that contain fewer observations without values.

This is common practice.

In [15]:
X_train.isnull().sum()

Distance      0
PLong         0
PLatd         0
DLong         0
DLatd         0
Haversine     0
Pmonth        0
Pday          0
Phour         0
Pmin          0
PDweek        0
Dmonth        0
Dday          0
Dhour         0
Dmin          0
DDweek        0
Temp          0
Precip        0
Wind          0
Humid         0
Solar         0
Snow          0
GroundTemp    0
Dust          0
dtype: int64

# Feature Scaling
For use in linear models, features need to be either scaled. We will scale features to the minimum and maximum values:

In [39]:
# create scaler
scaler = MinMaxScaler()

#  fit  the scaler to the train set
scaler.fit(X_train) 

# transform the train and test set

# sklearn returns numpy arrays, so we wrap the
# array with a pandas dataframe

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns
)

In [40]:
X_train.head()

Unnamed: 0,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,Pmin,PDweek,Dmonth,Dday,Dhour,Dmin,DDweek,Temp,Precip,Wind,Humid,Solar,Snow,GroundTemp,Dust
0,0.123735,0.439752,0.690362,0.564855,0.743494,0.137613,0.909091,0.633333,0.782609,0.135593,0.166667,0.909091,0.633333,0.782609,0.440678,0.166667,0.47028,0.0,0.27027,0.238636,0.0,0.0,0.257256,0.151316
1,0.27063,0.536782,0.072524,0.390582,0.245192,0.248401,0.181818,0.5,0.043478,0.322034,0.666667,0.181818,0.5,0.086957,0.186441,0.666667,0.437063,0.0,0.283784,0.829545,0.0,0.0,0.282322,0.046053
2,0.041725,0.513858,0.536762,0.509323,0.570664,0.040074,0.727273,0.933333,0.565217,0.830508,0.833333,0.727273,0.933333,0.608696,0.016949,0.833333,0.746503,0.0,0.189189,0.340909,0.78125,0.0,0.693931,0.046053
3,0.072366,0.544863,0.312979,0.476247,0.32805,0.069557,0.818182,0.4,1.0,0.372881,0.833333,0.818182,0.4,1.0,0.559322,0.833333,0.503497,0.0,0.040541,0.590909,0.0,0.0,0.283641,0.059211
4,0.042026,0.138733,0.353218,0.163031,0.349159,0.024294,0.818182,1.0,0.347826,0.915254,0.333333,0.818182,1.0,0.391304,0.0,0.333333,0.375874,0.0,0.243243,0.647727,0.042614,0.0,0.208443,0.049342


In [41]:
X_train.to_csv('xtrain.csv', index=False)
X_test.to_csv('xtest.csv', index=False)

y_train.to_csv('ytrain.csv', index=False)
y_test.to_csv('ytest.csv', index=False)

  after removing the cwd from sys.path.
  """


In [42]:
# now let's save the scaler

joblib.dump(scaler, 'minmax_scaler.joblib') 

['minmax_scaler.joblib']

That concludes the feature engineering section.