# Auto MPG Data Set 

In [60]:
############################## Load Libraries #####################################
import numpy
from pandas import read_csv
from numpy import array
from numpy import arange
try:
    # Current home of scatter_matrix.
    from pandas.plotting import scatter_matrix
except ImportError:
    # Fallback for old pandas releases that only provide pandas.tools.plotting.
    from pandas.tools.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error


############################## Import Dataset #####################################

# Read the Auto MPG table from a local CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this script anywhere else.
dataset = read_csv(filepath_or_buffer='C:/Users/Satish/python_files/auto.csv')
# names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model', 'year', 'origin','car name']
############################## Analyse the Data #####################################
# print(dataset.head(10))
# print(dataset.describe())
# print(dataset.shape)
# print(dataset.isnull().sum())
# print((dataset[['mpg', 'cylinders', 'displacement', 'horsepower', 'weight','acceleration', 'origin']]==0))
# print(dataset.columns)
# print(dataset.dtypes)

############################## Data Visualize #####################################

# dataset.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
# dataset.plot(kind='box', layout=(4,4), subplots=True, sharex=False, fontsize=1)
# dataset.plot(kind='density', layout=(4,4), subplots=True, sharex=False, fontsize=1)
# scatter_matrix(dataset)
# pyplot.show()   
### Attributes mpg, acceleration and weight may have a binomial distribution

################### correlation matrix ######################
# fig = pyplot.figure()
# ax = fig.add_subplot(111)
# cax = ax.matshow(dataset.corr(), vmin=-1, vmax=1, interpolation='none')
# fig.colorbar(cax)
# ticks = numpy.arange(0,14,1)
# ax.set_xticks(ticks)
# ax.set_yticks(ticks)
# ax.set_xticklabels(names)
# ax.set_yticklabels(names)
# pyplot.show()

### Dark yellow regions show positive correlation and dark blue regions show negative correlation.
### Looking at the matrix, we can say that there is heavy positive and negative correlation.
    
############################## Create train and test Dataset #####################################    
# Experiment settings shared by the sections below.
test_size = 0.3                      # fraction of rows held out for the final test
num_folds = 10                       # number of cross-validation folds
seed = 27                            # seed for the train/test split and seeded estimators
scoring = 'neg_mean_squared_error'   # CV metric: negated MSE, closer to 0 is better

# Work on the raw array: column 0 is the target, columns 1..7 are the features.
# NOTE(review): column 0 is presumably 'mpg' - confirm against the CSV header.
data = dataset.values
x, y = data[:, 1:8], data[:, 0]
# print(data[:10, :]); print(x[:10, :]); print(y[:10])

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=seed
)

############################## Build the model ###################################################
# Candidate regressors, each with the library's default hyper-parameters.
models = [
    ('LR', LinearRegression()),
    ('LS', Lasso()),
    ('EN', ElasticNet()),
    ('CART', DecisionTreeRegressor()),
    ('KNN', KNeighborsRegressor()),
    ('SVM', SVR()),
]

############################## Execute the model ###################################################
# Score every candidate with k-fold cross-validation on the training split.
# random_state is intentionally not passed to KFold: the splitter does not
# shuffle by default, so a seed never influenced the folds, and newer
# scikit-learn releases raise ValueError when a seed is given without
# shuffle=True. The splitter is loop-invariant, so it is built once.
kfold = KFold(n_splits=num_folds)
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
#     print(name, cv_results.mean(), cv_results.std())
### Recorded run: LR (-12.62) and LS (-12.87) have lower MSE than the others
### (scores are negated MSE, so values closer to 0 are better).

############################## Compare the model ###################################################

# fig = pyplot.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(names)
# pyplot.show()
# ### From above plot we can say that LR and LS show lower MSE than others

############################## Standardize the dataset and Build the model #############################
# One pipeline per candidate: standardize the features, then fit the regressor.
# Each entry is named 'Scaled<label>' and its final step is named '<label>'.
pipelines = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in [
        ('LR', LinearRegression()),
        ('LS', Lasso()),
        ('EN', ElasticNet()),
        ('CART', DecisionTreeRegressor()),
        ('KNN', KNeighborsRegressor()),
        ('SVM', SVR()),
    ]
]

# Start fresh result containers for this comparison.
results, names = [], []
# Uncomment to cross-validate the scaled pipelines:
# kfold = KFold(n_splits=num_folds)
# for name, model in pipelines:
#     cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     print(name, cv_results.mean(), cv_results.std())
### Recorded run: scaling has shown its effect on KNN (-10.92) by bringing down the error

############################## Compare the Scaled model ####################################### 

# fig = pyplot.figure()
# fig.suptitle('Scaled Algorothm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(names)
# pyplot.show()
### KNN  has lowest MSE than others

############################## Improve result with Tuning #######################################
# Grid-search the number of neighbours for KNN on standardized training data.
# NOTE(review): the scaler is fitted on all of x_train before cross-validation,
# so each validation fold has influenced the scaling statistics.
scaler = StandardScaler().fit(x_train)
rescaledx = scaler.transform(x_train)
# Candidate neighbour counts. This must be defined: param_grid reads it on the
# next line, and leaving it commented out raised NameError.
k_values = numpy.array([1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21])
param_grid = dict(n_neighbors=k_values)
model = KNeighborsRegressor()
# No random_state: KFold does not shuffle by default, so a seed has no effect
# on the folds, and newer scikit-learn raises ValueError for that combination.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_results = grid.fit(rescaledx, y_train)
# print("Best Score: ", grid_results.best_score_, "Best param: ", grid_results.best_params_)
# Per-candidate cross-validation summary, in the order of 'params'.
mean = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
# Loop variables are named so that uncommenting does not rebind 'mean':
# for mean_score, std_score, param in zip(mean, stds, params):
#     print(mean_score, std_score, param)
#### Recorded run: tuning improved the KNN score slightly (-10.32)

############################## Improve the performance using ensemble methods #############################
# Ensemble regressors wrapped in the same standardize-then-fit pipeline shape.
# Each entry is named 'Scaled<label>' and its final step is named '<label>'.
ensembles = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in [
        ('AB', AdaBoostRegressor()),
        ('GBM', GradientBoostingRegressor()),
        ('RM', RandomForestRegressor()),
        ('ET', ExtraTreesRegressor()),
    ]
]

# Start fresh result containers for this comparison.
results, names = [], []
# Uncomment to cross-validate the ensemble pipelines:
# kfold = KFold(n_splits=num_folds)
# for name, model in ensembles:
#     cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     print(name, cv_results.mean(), cv_results.std())
### Recorded run: Gradient Boosting improved the performance (-9.68)


# fig = pyplot.figure()
# fig.suptitle('Scaled Ensemble Algorithm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(names)
# pyplot.show() 
### It looks like Gradient Boosting has a better mean score.

############################## Tune Ensemble Methods #######################################

# Grid-search the number of boosting stages for Gradient Boosting on
# standardized training data.
scaler = StandardScaler().fit(x_train)
rescaledx = scaler.transform(x_train)
param_grid = dict(n_estimators=numpy.array([50, 100, 150, 200, 250, 300, 350, 400]))
model = GradientBoostingRegressor(random_state=seed)
# No random_state: KFold does not shuffle by default, so a seed has no effect
# on the folds, and newer scikit-learn raises ValueError for that combination.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(rescaledx, y_train)
# print("Best Score: ", grid_result.best_score_, "Using param: ", grid_result.best_params_)
# Per-candidate cross-validation summary, in the order of 'params'.
mean = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# Loop variables are named so that uncommenting does not rebind 'mean':
# for mean_score, std_score, param in zip(mean, stds, params):
#     print(mean_score, std_score, param)
#### Recorded run: tuning slightly improved the GB score (-9.53) using n_estimators = 100

############################## Finalize the model #######################################

# Train the chosen model on the standardized training split.
scaler = StandardScaler()
rescaledx = scaler.fit_transform(x_train)
# n_estimators=100 is the value the recorded tuning run selected.
model = GradientBoostingRegressor(n_estimators=100, random_state=seed).fit(rescaledx, y_train)

# Scale the held-out rows with the training-set statistics, then report the
# mean squared error of the predictions on them.
resclaledtest_x = scaler.transform(x_test)
prediction = model.predict(resclaledtest_x)
print(mean_squared_error(y_test, prediction))

######### Recorded test-set mean_squared_error: 5.93



5.936605343993929


