# 8 Steps Of Machine Learning
* Data Gathering
* Data pre-processing
* Feature Engineering
* Choosing Model
* Training Model
* Testing the Model / Model Evaluation
* Parameter Tuning
* Prediction


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Linear Regression

In [None]:
# Data Gathering
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn import model_selection
import seaborn as sns

df = pd.read_csv("FuelConsumption.csv")
# Only the final expression of a cell is rendered automatically, so the
# previews are shown explicitly (`display` is injected by the IPython kernel).
display(df.head())
display(df.describe())

# Data Pre-Processing
df.info()
# Keep the numeric columns used in the analysis plus the target column.
data = df[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_CITY",
           "FUELCONSUMPTION_HWY", "FUELCONSUMPTION_COMB", "CO2EMISSIONS"]]
data.info()

## Feature Engineering Analysis
# Pairwise correlations as a heatmap, then scatter plots of every column pair.
corr = data.corr()
sns.heatmap(corr)
sns.pairplot(data)

## Split data into training and test set
X = data[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_COMB"]]
# Double brackets keep the target as a one-column DataFrame.
Y = data[["CO2EMISSIONS"]]
# 80/20 hold-out split; the fixed random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=1)

## Choosing Model

# Init
model = linear_model.LinearRegression()

## Train Model
model.fit(xtrain, ytrain)
# LinearRegression.score returns R^2, here measured on the training data.
print(model.score(xtrain, ytrain))

## Predict/Test
yhat = model.predict(xtest)

## Metrics Evaluation
# sklearn metrics take (y_true, y_pred). MSE is symmetric, but R^2 is not:
# passing the predictions first reports a different (wrong) score.
print("MSE: ", mean_squared_error(ytest, yhat))
print("R2 Score: ", r2_score(ytest, yhat))
print(xtrain.shape, ytrain.shape)


# Regression Analysis
# Evenly spaced sample grid, one column vector per feature slot.
# NOTE(review): 853 presumably mirrors the number of training rows and
# 1..8.4 the ENGINESIZE range -- confirm against the data.
GRID_START, GRID_STOP, GRID_POINTS = 1, 8.4, 853
line1 = np.linspace(GRID_START, GRID_STOP, GRID_POINTS).reshape(-1, 1)
line2 = np.linspace(GRID_START, GRID_STOP, GRID_POINTS).reshape(-1, 1)
line3 = np.linspace(GRID_START, GRID_STOP, GRID_POINTS).reshape(-1, 1)
lines = pd.DataFrame(
    {
        "E": line1[:, 0],
        "C": line2[:, 0],
        "F": line3[:, 0]
    }
)

# Disabled plot, kept for reference.
# NOTE(review): `lines` uses columns E/C/F while the model was fitted on
# ENGINESIZE/CYLINDERS/FUELCONSUMPTION_COMB; recent scikit-learn versions are
# expected to warn about or reject that mismatch -- rename the columns before
# re-enabling.
# fig, axes = plt.subplots(1, 1, figsize=(10, 6))
# axes.scatter(xtrain[["ENGINESIZE"]], ytrain, label="Actual Data", c="hotpink")
# axes.plot(lines[["E"]].values, model.predict(lines), label="Regression Line", c="darkorange")
# axes.legend()
# axes.set_xlabel("EngineSize")
# axes.set_ylabel("CO2Emission")
# plt.show()


In [None]:
## KFold Cross Validation
from sklearn.model_selection import KFold

# Manual 5-fold cross-validation of a linear regression on X / Y.
# Each fold trains a fresh model on four fifths of the rows and scores it on
# the remaining fifth; the per-fold R^2 values are collected in `accuracies`.
folds = KFold(n_splits=5)
accuracies = []

# No figure is created here: the per-fold plotting was disabled, and creating
# the axes anyway only rendered five empty panels.
for train_idx, test_idx in folds.split(X):
    # Fold-local names, so the hold-out split and model from the earlier
    # cell are not silently overwritten.
    fold_xtrain, fold_ytrain = X.iloc[train_idx], Y.iloc[train_idx]
    fold_xtest, fold_ytest = X.iloc[test_idx], Y.iloc[test_idx]

    ## Model: a new estimator per fold, so no fitted state is shared.
    fold_model = linear_model.LinearRegression()

    ## Train Model
    fold_model.fit(fold_xtrain, fold_ytrain)

    ## Test Prediction
    ypred = fold_model.predict(fold_xtest)

    ## Evaluation Metrics -- sklearn metrics take (y_true, y_pred).
    fold_r2 = r2_score(fold_ytest, ypred)
    print("================== Fold ================")
    print("MSE: ", mean_squared_error(fold_ytest, ypred))
    print("R2 Score: ", fold_r2)

    accuracies.append(fold_r2)

# Mean R^2 across the folds (displayed as the cell's output).
sum(accuracies)/len(accuracies)


In [None]:
## Cross val score
# Same 5-fold R^2 evaluation as the manual loop, delegated to scikit-learn.
from sklearn import model_selection

# The list of valid `scoring` names can be inspected via sklearn.metrics,
# e.g. metrics.SCORERS.keys()
model = linear_model.LinearRegression()

# verbose=4 makes scikit-learn log progress while the folds are evaluated.
cv = model_selection.cross_val_score(
    estimator=model,
    X=X,
    y=Y,
    scoring='r2',
    cv=5,
    verbose=4,
)

# Average of the per-fold scores (displayed as the cell's output).
cv.mean()


In [None]:
# Scratch check of zip(): pairs items by position. This value is built but
# not shown, because only a cell's last expression is rendered.
list(zip((1, 2, 3), ("a", "b", "c")))
# Column 0 of `line1` as a 1-D array (this is the cell's displayed output).
line1[:, 0]
