# DATA SCIENCE WORKSHOP WITH PYTHON, DPAM, PIEAS, JAN 11-13,2022
#### AIBUTT@UALBERTA.CA
#### We use scikit-learn to implement simple Linear Regression
#### We create a model, train it, test it and use the model

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
#%matplotlib inline

In [None]:
# Load the fuel-consumption CSV into a pandas DataFrame.
# The relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="FuelConsumptionCo2.csv")

In [None]:
# Preview the first five rows as a quick sanity check of the load
df.head(n=5)

In [None]:
# Summarize the data: describe() returns summary statistics for the
# columns of df; as the last expression of the cell, the result is displayed
df.describe()

In [None]:
# Narrow the frame to the three candidate features plus the CO2 target,
# then display the first nine rows of the subset
cdf = df.loc[:, ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB', 'CO2EMISSIONS']]
cdf.head(n=9)

In [None]:
# Draw one histogram per selected column to inspect its distribution
viz = cdf.loc[:, ['CYLINDERS', 'ENGINESIZE', 'CO2EMISSIONS', 'FUELCONSUMPTION_COMB']]
viz.hist()
plt.show()

In [None]:
# Scatter combined fuel consumption against CO2 emissions (full subset)
# to judge by eye how linear the relationship is
plt.scatter(cdf["FUELCONSUMPTION_COMB"], cdf["CO2EMISSIONS"], color="blue")
plt.xlabel(xlabel="FUELCONSUMPTION_COMB")
plt.ylabel(ylabel="Emission")
plt.show()

In [None]:
# Scatter engine size against CO2 emissions for the full subset
# (the train/test split has not been made yet at this point)
plt.scatter(cdf["ENGINESIZE"], cdf["CO2EMISSIONS"], color="blue")
plt.xlabel(xlabel="Engine size")
plt.ylabel(ylabel="Emission")
plt.show()

In [None]:
# TRAIN-TEST SPLIT SCHEME
# Holding out unseen rows gives a better estimate of out-of-sample accuracy.
# Draw one uniform number in [0, 1) per row of cdf: rows whose draw is below
# 0.8 go to the training set (roughly 80%), the rest to the test set.
# The generator is seeded so the split, and every metric computed from it,
# is reproducible from run to run.
rng = np.random.default_rng(42)
msk = rng.random(len(cdf)) < 0.8
train = cdf[msk]
test = cdf[~msk]

In [None]:
# SIMPLE REGRESSION MODEL: ordinary least squares, i.e. fit the line that
# minimizes the residual sum of squares between observed and predicted CO2
from sklearn import linear_model
regr = linear_model.LinearRegression()
# Double brackets select a one-column DataFrame, so both arrays are 2-D
train_x = np.asanyarray(train[['ENGINESIZE']])
train_y = np.asanyarray(train[['CO2EMISSIONS']])
regr.fit(train_x, train_y)
# With a 2-D target, coef_ and intercept_ are size-1 arrays rather than scalars.
# .item() extracts the scalar explicitly so %f does not rely on NumPy's
# deprecated implicit array-to-scalar conversion.
print("COEFFICIENT = %10f AND INTERCEPT = %10f" % (regr.coef_.item(), regr.intercept_.item()))

In [None]:
# Overlay the fitted regression line (red) on the training data (blue)
plt.scatter(train.ENGINESIZE, train.CO2EMISSIONS, color='blue')
# predict() evaluates the fitted line, i.e. coef_ * x + intercept_
plt.plot(train_x, regr.predict(train_x), '-r')
plt.xlabel("Engine size")
plt.ylabel("Emission")
# Explicit show(), matching the other plotting cells, so the figure also
# appears when the inline backend is not active
plt.show()

In [None]:
# MODEL EVALUATION METRICS (computed on the held-out test rows)
# Mean Absolute Error (MAE)
# Mean Squared Error (MSE)
# Root Mean Squared Error (RMSE)
# R-Squared (not an error but a popular metric to measure the performance of a regression model)

test_x = np.asanyarray(test[['ENGINESIZE']])
test_y = np.asanyarray(test[['CO2EMISSIONS']])
test_y_ = regr.predict(test_x)

# Prediction error per test row; computed once and reused by every metric
residuals = test_y_ - test_y
mse = np.mean(residuals ** 2)

print("Mean Absolute Error: %.2f" % np.mean(np.absolute(residuals)))
# The mean of the squared residuals is the MSE, not the residual *sum* of squares
print("Mean Squared Error (MSE): %.2f" % mse)
print("Root Mean Squared Error (RMSE): %.2f" % np.sqrt(mse))

In [None]:
# Coefficient of determination (R^2) of the predictions against the test targets
from sklearn.metrics import r2_score
print("R2-Score: %.2f" % r2_score(y_true=test_y, y_pred=test_y_))

In [None]:
# DRILL THE WHOLE EXERCISE WITH FUELCONSUMPTION_COMB
# Same fit-and-evaluate pipeline as the engine-size model, with combined
# fuel consumption as the single feature.
# NOTE: this rebinds train_x, test_x and regr, so the engine-size model is no
# longer reachable under those names once this cell has run.
train_x = np.asanyarray(train[["FUELCONSUMPTION_COMB"]])
test_x = np.asanyarray(test[["FUELCONSUMPTION_COMB"]])
# Rebuild the targets from the current split so they always match train/test,
# instead of reusing whatever train_y/test_y an earlier cell left behind
train_y = np.asanyarray(train[["CO2EMISSIONS"]])
test_y = np.asanyarray(test[["CO2EMISSIONS"]])

regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)

predictions = regr.predict(test_x)

# Prediction error per test row; computed once and reused by every metric
residuals = predictions - test_y
mse = np.mean(residuals ** 2)

print("Mean Absolute Error: %.2f" % np.mean(np.absolute(residuals)))
# The mean of the squared residuals is the MSE, not the residual *sum* of squares
print("Mean Squared Error (MSE): %.2f" % mse)
print("Root Mean Squared Error (RMSE): %.2f" % np.sqrt(mse))
print("R2-Score: %.2f" % r2_score(test_y, predictions))