# **Machine Learning with Python**

## Exercise 1.2: Ethics and Direction of Machine Learning Programs

### **Contents**

##### 1) Importing libraries and data set
##### 2) Scaling
##### 2.1) Split first
##### 2.2) Date column: remove it from both sets
##### 2.3) Missing values
##### 2.4) Scaling

#### **1) Importing libraries and data set**

In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [35]:
# Setting up the path

path = r'C:\Users\andd0\Documents\Machine Learning with Python'

In [36]:
# Importing the data set

df = pd.read_csv(os.path.join(path, 'Main folder', '02 Data sets', 'Original data set', 'Dataset-weather-prediction-dataset-processed.csv'), index_col = False)

In [37]:
df.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22950 entries, 0 to 22949
Columns: 170 entries, DATE to VALENTIA_temp_max
dtypes: float64(145), int64(25)
memory usage: 29.8 MB


#### **2) Scaling**

#### 2.1 Split first

In [39]:
X_train, X_test = train_test_split(df, test_size=0.25, random_state=42)

#### 2.2 Date column: remove it from both sets

In [40]:
# Dropping the 'DATE' and 'MONTH' columns

X_train_nums = X_train.drop(columns=['DATE', 'MONTH'])
X_test_nums = X_test.drop(columns=['DATE', 'MONTH'])

#### 2.3 Missing values

In [41]:
X_train_nums.isnull().sum()

BASEL_cloud_cover         0
BASEL_wind_speed          0
BASEL_humidity            0
BASEL_pressure            0
BASEL_global_radiation    0
                         ..
VALENTIA_snow_depth       0
VALENTIA_sunshine         0
VALENTIA_temp_mean        0
VALENTIA_temp_min         0
VALENTIA_temp_max         0
Length: 168, dtype: int64

In [42]:
X_test_nums.isnull().sum()

BASEL_cloud_cover         0
BASEL_wind_speed          0
BASEL_humidity            0
BASEL_pressure            0
BASEL_global_radiation    0
                         ..
VALENTIA_snow_depth       0
VALENTIA_sunshine         0
VALENTIA_temp_mean        0
VALENTIA_temp_min         0
VALENTIA_temp_max         0
Length: 168, dtype: int64

#### 2.4 Scaling

In [43]:
# Create scaler object using StandardScaler from sklearn.preprocessing
# StandardScaler assumes data is normally distributed and scales with a distribution around 0 and standard deviation of 1 (task requirements). 
# Scaling happens independently with each variable.
# Fit scaler on training numeric part and transform both

scaler = StandardScaler()

In [44]:
# Create a new df with scaled data for both the train and test sets

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_nums),
                              columns=X_train_nums.columns, index=X_train_nums.index)

X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test_nums),
                              columns=X_test_nums.columns, index=X_test_nums.index)

In [45]:
# Reattaching the date column
# This code allows me to place the 'DATE' column just after the index

date_train = X_train.pop('DATE')
X_train_scaled.insert(0, 'DATE', date_train)

date_test = X_test.pop('DATE')
X_test_scaled.insert(0, 'DATE', date_test)

In [46]:
# Train set

X_train_scaled.head()

Unnamed: 0,DATE,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
8196,19820610,-1.411077,-0.03095,-0.616603,-0.746131,1.181615,-0.321907,-0.178396,0.530475,1.352627,...,0.169208,0.059797,-0.56689,-0.008427,-0.14688,-0.024111,-0.411551,1.45475,1.575817,1.165121
20472,20160119,1.081862,-0.587893,0.557788,-0.214102,-0.82724,-0.262136,-0.178396,-1.069706,-1.652545,...,-0.443416,-0.080424,-0.007306,-1.066874,-0.003704,-0.024111,-0.023323,-0.013908,-0.007755,-0.009817
18269,20100107,0.666372,-0.866365,0.738464,-1.262958,-1.009863,-0.301983,0.821457,-0.93056,-1.652545,...,-0.443416,1.041345,0.403055,-0.831663,-0.422219,-0.024111,-0.08305,-2.921253,-3.229505,-2.187749
7713,19810212,-1.411077,-0.03095,-0.164914,0.409131,-0.590904,-0.441448,-0.178396,-0.25802,-1.598397,...,1.394457,1.181566,-1.033209,-1.113916,1.284882,-0.024111,-1.038689,-1.182841,-0.663026,-1.12744
21958,20200213,0.666372,3.171473,-0.616603,-0.685328,-0.82724,1.909538,-0.178396,-0.953751,-0.583137,...,-0.443416,-0.080424,-0.007306,-0.008427,-0.003704,-0.024111,-0.023323,-0.013908,-0.007755,-0.009817


In [47]:
# Test set

X_test_scaled.head()

Unnamed: 0,DATE,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
19955,20140820,0.226971,-0.794596,-0.717731,-0.190473,-0.204771,-0.458106,-0.181705,-0.100121,0.651945,...,-0.444564,-0.086364,0.006067,1.070282,-0.014069,-0.026412,-0.002957,0.012763,0.021267,0.011557
7729,19810228,1.05915,-0.019585,-0.085363,-0.781079,-1.159948,1.133485,-0.181705,-1.052227,-1.25446,...,-0.444564,0.197217,-2.787018,-0.517016,2.39553,-0.026412,-0.706192,-1.470068,-1.239606,-1.471135
10815,19890811,0.226971,-0.019585,-0.26604,-0.905417,0.653815,-0.458106,-0.181705,-0.193009,1.351417,...,0.786867,0.197217,-0.943582,1.176892,-0.563911,-0.026412,0.5474,1.798212,1.994807,1.348887
3125,19680722,-0.605208,-0.019585,-0.898407,-0.003965,1.362149,-0.458106,-0.181705,0.828763,0.69309,...,0.786867,0.62259,1.235024,1.093973,-0.677114,-0.026412,-0.033533,1.283761,1.117678,1.290742
18985,20111224,-0.189118,2.047111,0.366328,1.814479,-1.181413,-0.437701,-0.181705,-1.029004,-0.82929,...,-0.444564,-0.086364,0.006067,-0.990836,-0.014069,-0.026412,-0.002957,0.012763,0.021267,0.011557


In [48]:
# Exporting in CSV format

X_train_scaled.to_csv(os.path.join(path, 'Main folder', '02 Data sets', 'Prepared data set', 'X_train_scaled.csv'))