# <center>Simple Linear Regression</center>
## <center>using the scikit-learn library</center>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import r2_score
%matplotlib inline

### Reading the Fuel Dataset

In [None]:
# Load the fuel-consumption CSV (path is relative to the notebook's working
# directory) and preview its first rows as the cell output.
fuel_csv_path = "data/FuelConsumption.csv"
data = pd.read_csv(fuel_csv_path)
data.head(n=5)

### Exploring Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Columns whose distributions we want to inspect: the candidate predictors
# followed by the target (CO2EMISSIONS).
hist_columns = [
    'ENGINESIZE',
    'CYLINDERS',
    'FUELCONSUMPTION_CITY',
    'FUELCONSUMPTION_HWY',
    'FUELCONSUMPTION_COMB',
    'CO2EMISSIONS',
]
viz = data[hist_columns]

# One histogram panel per selected column.
viz.hist()
plt.show()

#### Finding the best independent variable (X) for a linear fit to the dependent variable (Y), CO2EMISSIONS

In [None]:
# ENGINESIZE against CO2EMISSIONS, one blue marker per row of `data`.
fig, ax = plt.subplots()
ax.scatter(data['ENGINESIZE'], data['CO2EMISSIONS'], c='b')
ax.set_xlabel('ENGINESIZE')
ax.set_ylabel('CO2EMISSION')
plt.show()

In [None]:
# CYLINDERS against CO2EMISSIONS, one blue marker per row of `data`.
fig, ax = plt.subplots()
ax.scatter(data['CYLINDERS'], data['CO2EMISSIONS'], c='b')
ax.set_xlabel('CYLINDERS')
ax.set_ylabel('CO2EMISSION')
plt.show()

In [None]:
# FUELCONSUMPTION_CITY against CO2EMISSIONS, one blue marker per row of `data`.
fig, ax = plt.subplots()
ax.scatter(data['FUELCONSUMPTION_CITY'], data['CO2EMISSIONS'], c='b')
ax.set_xlabel('FUELCONSUMPTION_CITY')
ax.set_ylabel('CO2EMISSION')
plt.show()

In [None]:
# FUELCONSUMPTION_HWY against CO2EMISSIONS, one blue marker per row of `data`.
fig, ax = plt.subplots()
ax.scatter(data['FUELCONSUMPTION_HWY'], data['CO2EMISSIONS'], c='b')
ax.set_xlabel('FUELCONSUMPTION_HWY')
ax.set_ylabel('CO2EMISSION')
plt.show()

In [None]:
# FUELCONSUMPTION_COMB against CO2EMISSIONS, one blue marker per row of `data`.
fig, ax = plt.subplots()
ax.scatter(data['FUELCONSUMPTION_COMB'], data['CO2EMISSIONS'], c='b')
ax.set_xlabel('FUELCONSUMPTION_COMB')
ax.set_ylabel('CO2EMISSION')
plt.show()

Among the independent variables examined above, we find ENGINESIZE to be the most suitable for simple linear regression.

In [None]:
# Column names used by the later cells: `x` is the single predictor,
# `y` is the target being modelled.
x, y = 'ENGINESIZE', 'CO2EMISSIONS'

### Creating train and test data (80:20 split)

In [None]:
# Seed a local generator so the split -- and therefore the fitted
# coefficients and every metric below -- is identical on each
# Restart & Run All. Change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

# Each row independently goes to the training set with probability 0.8,
# so the split is approximately (not exactly) 80:20.
mask = rng.random(len(data)) < 0.8
train_data = data[mask]
test_data = data[~mask]

# Double brackets keep the selection 2-D (one column), which is the
# feature/target layout passed to the estimator later on.
train_x = np.asanyarray(train_data[[x]])
train_y = np.asanyarray(train_data[[y]])

test_x = np.asanyarray(test_data[[x]])
test_y = np.asanyarray(test_data[[y]])

### Training the Simple Linear Model with sklearn module

In [None]:
# Fit a linear regression of the target on the single predictor;
# `fit` returns the estimator itself, so construction and fitting chain.
mySimpleLR = linear_model.LinearRegression().fit(train_x, train_y)

# Pull the scalar slope and intercept out of the fitted arrays.
Coefficient = mySimpleLR.coef_[0, 0]
Intercept = mySimpleLR.intercept_[0]

for label, value in (("Coefficient:\t", Coefficient), ("Intercept:\t", Intercept)):
    print(label, value)

### Plotting the Linear Model

In [None]:
# Training points in blue with the fitted line overlaid in red.
plt.scatter(train_x, train_y, c='b')
# Evaluate the line on the same x-values that are scattered, so the line
# spans exactly the plotted training data rather than the test subset.
plt.plot(train_x, Intercept + Coefficient * train_x, c='r')
plt.xlabel(x)
plt.ylabel(y)
plt.show()

### Predicting the dependent Variable (y) : CO2EMISSIONS

In [None]:
# Predict from the NumPy feature array (same layout as the array used for
# fitting). Passing the DataFrame slice instead makes recent scikit-learn
# versions warn that the model was fitted without feature names.
predicted_y = mySimpleLR.predict(test_x)

# Evaluation on the held-out rows. `r2_score` is imported in the
# notebook's top import cell.
print("Mean Absolute Error:\t\t%0.2f" % np.mean(np.absolute(predicted_y - test_y)))
# This is the plain mean of squared residuals (MSE); no square root is taken.
print("Mean Squared Error:\t\t%0.2f" % np.mean((predicted_y - test_y) ** 2))
print("R2-Score:\t\t\t%0.17f" % r2_score(test_y, predicted_y))