<a href="https://colab.research.google.com/github/DonAkolab/85DaysOfAIprojects/blob/master/85DaysOfAIprojects_Day2_Cleaning_and_Splitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning and Splitting

$ python -c "import sklearn; print sklearn.__version__"
0.18.1

In [0]:
## importing neccessary library

import numpy as np                                            #provides support for more efficient numerical computation
import pandas as pd                                           #a convenient library that supports dataframes

from sklearn import preprocessing                             #preoprocessing module
from sklearn.model_selection import train_test_split          #sampling helper



#A "family" of models are broad types of models, such as random forests, SVM's, linear regression models


from sklearn.ensemble import RandomForestRegressor             #import the random forest family


from sklearn.pipeline import make_pipeline                      # Import cross-validation pipelinePython
from sklearn.model_selection import GridSearchCV  


from sklearn.metrics import mean_squared_error, r2_score         #Import evaluation metricsPython
from sklearn.externals import joblib                             #Import module for saving scikit-learn modelsPython


In [0]:
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [4]:
print (data.head())

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [5]:
# Read CSV with semicolon separatorPython
data = pd.read_csv(dataset_url, sep=';')
 
print (data.head())

   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.4              0.70         0.00  ...       0.56      9.4        5
1            7.8              0.88         0.00  ...       0.68      9.8        5
2            7.8              0.76         0.04  ...       0.65      9.8        5
3           11.2              0.28         0.56  ...       0.58      9.8        6
4            7.4              0.70         0.00  ...       0.56      9.4        5

[5 rows x 12 columns]


In [6]:
print (data.shape)

(1599, 12)


In [7]:
print (data.describe())

       fixed acidity  volatile acidity  ...      alcohol      quality
count    1599.000000       1599.000000  ...  1599.000000  1599.000000
mean        8.319637          0.527821  ...    10.422983     5.636023
std         1.741096          0.179060  ...     1.065668     0.807569
min         4.600000          0.120000  ...     8.400000     3.000000
25%         7.100000          0.390000  ...     9.500000     5.000000
50%         7.900000          0.520000  ...    10.200000     6.000000
75%         9.200000          0.640000  ...    11.100000     6.000000
max        15.900000          1.580000  ...    14.900000     8.000000

[8 rows x 12 columns]


In [0]:
# Separate target from training features Python
y = data.quality
X = data.drop('quality', axis=1)

In [9]:
print (y.head())

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64


In [0]:
print (X.head())

   fixed acidity  volatile acidity  citric acid  ...    pH  sulphates  alcohol
0            7.4              0.70         0.00  ...  3.51       0.56      9.4
1            7.8              0.88         0.00  ...  3.20       0.68      9.8
2            7.8              0.76         0.04  ...  3.26       0.65      9.8
3           11.2              0.28         0.56  ...  3.16       0.58      9.8
4            7.4              0.70         0.00  ...  3.51       0.56      9.4

[5 rows x 11 columns]


In [0]:
# Split data into train and test setsPython
# random_state : However, if you use a particular value for random_state(random_state = 1 or any other value) everytime the result will be same,i.e, same values in train and test datasets
# stratify : array-like or None (default=None) If not None, data is split in a stratified fashion, using this as the class labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123, stratify=y)  

random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.



In [0]:
# Standardization is the process of subtracting the means from each feature and then dividing by the feature standard deviations.
# Standardization is a common requirement for machine learning tasks. 
# Many algorithms assume that all features are centered around zero and have approximately the same variance.

# Lazy way of scaling dataPython
X_train_scaled = preprocessing.scale(X_train)
print (X_trained_scaled)

print (X_train_scaled.mean(axis=0))

 
print (X_train_scaled.std(axis=0))

# but why did we say that we won't use this code?
# The reason is that we won't be able to perform the exact same transformation on the test set.
# Sure, we can still scale the test set separately, but we won't be using the same means and standard deviations as we used to transform the training set.
# In other words, that means it wouldn't be a fair representation of how the model pipeline, include the preprocessing steps, would perform on brand new data.
# Now, here's the preprocessing code we will use...
# So instead of directly invoking the scale function, we'll be using a feature in Scikit-Learn called the Transformer API. The Transformer API allows you to "fit" a preprocessing step using the training data the same way you'd fit a model...
# ...and then use the same transformation on future data sets!
# Here's what that process looks like:
# Fit the transformer on the training set (saving the means and standard deviations)
# Apply the transformer to the training set (scaling the training data)
# Apply the transformer to the test set (using the same means and standard deviations)
# This makes your final estimate of model performance more realistic, and it allows to insert your preprocessing steps into a cross-validation pipeline

In [0]:
# Here's how you do it: Fitting the Transformer APIPython
scaler = preprocessing.StandardScaler().fit(X_train)

# Now, the scaler object has the saved means and standard deviations for each feature in the training set.


In [0]:
#Let's confirm that worked:

# Applying transformer to training dataPython
X_train_scaled = scaler.transform(X_train)

In [15]:
print (X_train_scaled.mean(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [16]:
print (X_train_scaled.std(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [0]:
#Note how we're taking the scaler object and using it to transform the training set. 
#Later, we can transform the test set using the exact same means and standard deviations used to transform the training set:
#Applying transformer to test dataPython

X_test_scaled = scaler.transform(X_test)

In [18]:
print (X_test_scaled.mean(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]


In [19]:
print (X_test_scaled.std(axis=0))

[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


Notice how the scaled features in the test set are not perfectly centered at zero with unit variance! This is exactly what we'd expect, as we're transforming the test set using the means from the training set, not from the test set itself.
In practice, when we set up the cross-validation pipeline, we won't even need to manually fit the Transformer API. 
Instead, we'll simply declare the class object, like so:

In [0]:
#Pipeline with preprocessing and modelPython
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

#This is exactly what it looks like: a modeling pipeline that first transforms the data using StandardScaler() 
#and then fits a model using a random forest regressor.

There are two types of parameters we need to worry about: model parameters and hyperparameters. 
Models parameters can be learned directly from the data (i.e. regression coefficients), while hyperparameters cannot
Hyperparameters express "higher-level" structural information about the model, and they are typically set before training the model.
Within each decision tree, the computer can empirically decide where to create branches based on either mean-squared-error (MSE) or mean-absolute-error (MAE). Therefore, the actual branch locations are model parameters.

However, the algorithm does not know which of the two criteria, MSE or MAE, that it should use. The algorithm also cannot decide how many trees to include in the forest. These are examples of hyperparameters that the user must set.

We can list the tunable hyperparameters like so:


In [21]:
#List tunable hyperparametersPython
print (pipeline.get_params())


# ...
# 'randomforestregressor__criterion': 'mse',
# 'randomforestregressor__max_depth': None,
# 'randomforestregressor__max_features': 'auto',
# 'randomforestregressor__max_leaf_nodes': None,

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
           

In [0]:
#let's declare the hyperparameters we want to tune through cross-validation.

#Declare hyperparameters to tunePython
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                    'randomforestregressor__max_depth': [None, 5, 3, 1]}

Cross-validation is a process for reliably estimating the performance of a method for building a model by training and evaluating your model multiple times using the same method.These are the steps for CV:

Split your data into k equal parts, or "folds" (typically k=10).
Train your model on k-1 folds (e.g. the first 9 folds).
Evaluate it on the remaining "hold-out" fold (e.g. the 10th fold).
Perform steps (2) and (3) k times, each time holding out a different fold.
Aggregate the performance across all k folds. This is your performance metric

Here's how the CV pipeline looks after including preprocessing steps:

Split your data into k equal parts, or "folds" (typically k=10).
Preprocess k-1 training folds.
Train your model on the same k-1 folds.
Preprocess the hold-out fold using the same transformations from step (2).
Evaluate your model on the same hold-out fold.
Perform steps (2) - (5) k times, each time holding out a different fold.
Aggregate the performance across all k folds. This is your performance metric.
Fortunately, Scikit-Learn makes it stupidly simple to set this up:

In [0]:
#Sklearn cross-validation with pipelinePython
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

In [24]:
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

GridSearchCV essentially performs cross-validation across the entire "grid" (all possible permutations) of hyperparameters.
It takes in your model (in this case, we're using a model pipeline), the hyperparameters you want to tune, and the number of folds to create
you can see the best set of parameters found using CV:

In [25]:
print clf.best_params_
# {'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}

SyntaxError: ignored

After you've tuned your hyperparameters appropriately using cross-validation, you can generally get a small performance improvement by refitting the model on the entire training set.
Conveniently, GridSearchCV from sklearn will automatically refit the model with the best set of hyperparameters using the entire training set.

This functionality is ON by default, but you can confirm it:

In [0]:
#Confirm model will be retrainedPython
print clf.refit
# True

In [0]:
#This step is really straightforward once you understand that the  clf object you used to tune the hyperparameters can also be used directly like a model object.

#Here's how to predict a new set of data:
#Predict a new set of dataPython
y_pred = clf.predict(X_test)

In [27]:
print r2_score(y_test, y_pred)
# 0.45044082571584243
 
print mean_squared_error(y_test, y_pred)
# 0.35461593750000003

SyntaxError: ignored

In [28]:
#let's save your hard work so you can use the model in the future. It's really easy to do so:

#Save model to a .pkl filePython
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [29]:

#When you want to load the model again, simply use this function:

Load model from .pkl filePython
clf2 = joblib.load('rf_regressor.pkl')

SyntaxError: ignored

In [0]:
# Predict data set using loaded model
clf2.predict(X_test)

## The complete code, from start to finish.
Here's all the code in one place, in a single script.


### 2. Import libraries and modules
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
 
### 3. Load red wine data.
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')
 
### 4. Split data into training and test sets
y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)
 
### 5. Declare data preprocessing steps
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))
 
### 6. Declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}
 
### 7. Tune model using cross-validation pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
clf.fit(X_train, y_train)
 
### 8. Refit on the entire training set
 No additional code needed if clf.refit == True (default is True)
 
### 9. Evaluate model pipeline on test data
pred = clf.predict(X_test)
print r2_score(y_test, pred)
print mean_squared_error(y_test, pred)
 
### 10. Save model for future use
joblib.dump(clf, 'rf_regressor.pkl')

### To load: 
clf2 = joblib.load('rf_regressor.pkl')

#References

https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
