# Linear Regression Machine Learning Example
## Build an ML model to predict car prices based on car features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Read the car listings from the CSV file (resolved relative to the
# current working directory) into a DataFrame
csv_path = 'car_data.csv'
car_data = pd.read_csv(csv_path)

# Preview the first few rows to check that the file was parsed as expected
car_data.head()

# Get information about the dataset 

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes
car_data.info()

## What we know
- The dataset has 301 rows and 9 columns and there are no null values in it. 
-  Dtype 'object' means text data
- The output ‘Selling_Price’ is the target, and there are multiple independent variables that affect this value. 
- This is a supervised machine learning problem: the target values are known (labeled), so the model is first trained on the training portion of the split data.
- The model is then verified for its accuracy on validation/test data. 

# Replace text descriptions with numbers

In [None]:
# Encode the categorical text columns as integer codes so that every
# feature is numeric before modelling.
# NOTE: these are arbitrary integer codes; for Fuel_Type (three categories)
# a linear model will treat them as ordered. One-hot encoding would avoid
# that, at the cost of changing the feature columns.
category_codes = {
    'Fuel_Type': {'Petrol': 0, 'Diesel': 1, 'CNG': 2},
    'Seller_Type': {'Dealer': 0, 'Individual': 1},
    'Transmission': {'Manual': 0, 'Automatic': 1},
}

for column, codes in category_codes.items():
    # Map each label to its code and pass any other value through unchanged
    # (the same semantics as DataFrame.replace, so re-running this cell is
    # harmless). Series.map infers the result dtype from the mapped values,
    # so this does not depend on the silent object -> int downcasting of
    # DataFrame.replace, which pandas has deprecated.
    car_data[column] = car_data[column].map(
        lambda value: codes.get(value, value)
    )

# display revised data frame
car_data.head()

# Check how data correlates
To understand the relationship between different attributes in the dataset, we plot a correlation matrix.
- This information may be used to remove highly correlated features, which can make the model simpler and training more efficient.

In [None]:
# Pairwise correlation between the numeric columns only. Car_Name is free
# text, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# (older versions silently dropped them), so the numeric columns are
# selected explicitly to behave the same on every pandas version.
corrMatrix = car_data.select_dtypes(include='number').corr()

# Annotated heatmap: each cell shows the correlation coefficient of a column pair
sns.heatmap(corrMatrix, annot=True, cmap='viridis')
plt.show()


# Drop columns from our data set
- Car_Name has no relevance to the prediction
- Selling_Price is what we are trying to predict, so it must not be part of the input features

In [None]:
# Feature matrix: every column except the free-text name and the target
X = car_data.drop(columns=['Car_Name', 'Selling_Price'])
X.head(7)

In [None]:
# Target vector: the selling price of each car, as a one-dimensional Series
Y = car_data.loc[:, 'Selling_Price']
Y.head(7)

# Split the data set into two parts
- training (usually 80%)
- testing (usually 20%)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate a linear regression model and fit it on the training split
# in one step (fit() returns the fitted estimator itself).
lin_reg_model = LinearRegression().fit(X_train, Y_train)

print("Linear Regression Model created")


# Generate prediction based on Training data
## Here we test how accurate we are over the training (not test) data

In [None]:
# Predict selling prices for the same rows the model was fitted on
training_data_prediction = lin_reg_model.predict(X=X_train)

In [None]:
# Coefficient of determination (R^2) on the training split. Despite the
# "Error" wording in the label below, R^2 is a goodness-of-fit score:
# 1.0 is a perfect fit and higher is better.
train_error_score = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_prediction,
)

print("R squared Error - for Training  Data: ", train_error_score)


# Predict using Test data 

In [None]:
# Predict prices for the held-out test rows, then score the predictions
# with R^2 (higher is better; the "Error" wording in the label is historical)
Y_pred = lin_reg_model.predict(X=X_test)
test_error_score = metrics.r2_score(y_true=Y_test, y_pred=Y_pred)

print("R squared Error - Test: ", test_error_score)

# Create scatterplot with regression line 

In [None]:
# Scatter the actual test prices against the model's predictions and
# overlay a fitted regression line. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally; passing them
# positionally raises a TypeError there.
ax = sns.regplot(
    x=Y_test,
    y=Y_pred,
    scatter_kws={"color": "green"},
    line_kws={"color": "blue"},
)
# Y_pred is a plain array with no name, so label both axes explicitly
ax.set(xlabel="Actual Selling_Price", ylabel="Predicted Selling_Price")
plt.show()