<a href="https://colab.research.google.com/github/Anoop-mishra63/ML/blob/master/Lab/Exp_5_J063.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Regression and Classification

Aim: Regression and Classification using Linear models


Objectives:

Perform regression on fuel consumption dataset

Perform classification on iris dataset

Select linear model for high accuracy on cross validation and test set for iris dataset

Select linear model for high R2_score [~1] on cross validation and test set for fuel consumption dataset

Perform grid search and random search


In [1]:
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.optimizers import Adam
 


In [2]:
iris = load_iris()

In [3]:
iris['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [4]:
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

# Step 1- Splitting the dataset

In [5]:
x = iris.data
y = iris.target

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2)

In [8]:
x_train.shape

(120, 4)

In [9]:
y_train

array([2, 1, 1, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 0, 0, 2,
       0, 0, 2, 2, 2, 0, 1, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 2, 1, 2, 2, 1,
       1, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 2, 1, 2, 1, 1, 0, 1, 1, 2, 1, 2,
       0, 0, 2, 1, 0, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 0, 0, 2, 0, 0, 1,
       1, 0, 2, 2, 1, 2, 0, 1, 1, 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       2, 1, 1, 2, 0, 1, 1, 0, 0, 0])

# Step 3- Standardizing the dataset

In [10]:
from sklearn import preprocessing

In [11]:
x_train = preprocessing.scale(x_train)
x_test = preprocessing.scale(x_test)

# Step 2- Label Encoding

In [12]:
enc = preprocessing.LabelEncoder()

In [13]:
enc.fit(y_test)

LabelEncoder()

In [14]:
enc.fit_transform(y_train)

array([2, 1, 1, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 0, 0, 2,
       0, 0, 2, 2, 2, 0, 1, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 2, 1, 2, 2, 1,
       1, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 2, 1, 2, 1, 1, 0, 1, 1, 2, 1, 2,
       0, 0, 2, 1, 0, 0, 2, 1, 2, 2, 0, 2, 1, 2, 0, 0, 0, 0, 2, 0, 0, 1,
       1, 0, 2, 2, 1, 2, 0, 1, 1, 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       2, 1, 1, 2, 0, 1, 1, 0, 0, 0])

# Step 4- Choosing the classification algo

In [15]:
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import numpy as np

In [16]:
# define models
logistic = linear_model.LogisticRegression(solver='liblinear')
sgd = linear_model.SGDClassifier()

In [17]:
model = [logistic, sgd]

In [18]:
def get_cv_scores(model):
    scores = cross_val_score(model, x_train, y_train, cv=3, scoring='accuracy')
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

In [19]:
for m in model:
    print(m)
    get_cv_scores(m)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
CV Mean:  0.8833333333333333
STD:  0.05137011669140813


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
CV Mean:  0.8833333333333333
STD:  0.07168604389202186




In [20]:
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01]
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}]
solver = ['liblinear', 'saga']

param_grid = dict(penalty=penalty,
                  C=C,
                  class_weight=class_weight,
                  solver=solver)

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, scoring='accuracy', verbose=1, n_jobs=-1)
grid_result = grid.fit(x_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Best Score:  0.7333333333333333
Best Params:  {'C': 0.0001, 'class_weight': {1: 0.5, 0: 0.5}, 'penalty': 'l2', 'solver': 'liblinear'}


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    1.2s finished


In [21]:
logistic = linear_model.LogisticRegression(C=0.0001, class_weight={1:0.5, 0:0.5}, penalty='l2', solver='liblinear')
get_cv_scores(logistic)

CV Mean:  0.7333333333333334
STD:  0.023570226039551605




In [22]:
from sklearn.metrics import accuracy_score

logistic.fit(x_train, y_train)

y_train_pred = logistic.predict(x_train)
accuracy_train = accuracy_score(y_train, y_train_pred)
print("Accuracy: %.2f%%" % (accuracy_train))


y_test_pred = logistic.predict(x_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Accuracy: %.2f%%" % (accuracy_test))

Accuracy: 0.73%
Accuracy: 0.83%


SGD

In [23]:
penalty = ['l1','l2']
tol = []