In [4]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as snb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [6]:
# Read the Advertising CSV (hosted as a raw file on GitHub) into a DataFrame.
url = 'https://raw.githubusercontent.com/smitha-ks/Notebooks/Datasets/Advertising.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [7]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [8]:
# Count missing values per column. Any non-zero count would have to be
# handled (imputed or dropped) before the data is used for modelling.
print(df.isna().sum())


Unnamed: 0    0
TV            0
radio         0
newspaper     0
sales         0
dtype: int64


In [9]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple.
df.shape

(200, 5)

In [10]:
# Separate features and target.
# Columns are selected by name instead of by position: the frame carries a
# stray "Unnamed: 0" index column at position 0, so positional slices such as
# iloc[:, 1:4] would silently pick different columns (and could pull the
# target into the features) if that column were dropped or the CSV layout
# changed. Label-based selection fails loudly with a KeyError instead.
x = df[['TV', 'radio', 'newspaper']]  # predictors: ad spend per channel columns
y = df['sales']                       # target column

In [12]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=3,
)
# Peek at the first rows of the training features and training target.
print(X_train.head(n=5))
y_train.head(n=5)

       TV  radio  newspaper
77  120.5   28.5       14.2
73  129.4    5.7       31.3
71  109.8   14.3       31.7
78    5.4   29.9        9.4
42  293.6   27.7        1.8


77    14.2
73    11.0
71    12.4
78     5.3
42    20.7
Name: sales, dtype: float64

In [13]:
# Fit a linear regression on the training split (fit() returns the estimator
# itself, so construction and fitting can be chained).
lr = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test features; y_pred is compared against y_test later.
y_pred = lr.predict(X_test)

# Report the fitted parameters and the predictions for the test rows.
print("Intercept: ", lr.intercept_)
print("Slope : ", lr.coef_)
print("Predictions by model post build with training data: ", y_pred)

Intercept:  3.5238550353772666
Slope :  [ 0.04294217  0.1879147  -0.00541318]
Predictions by model post build with training data:  [16.23908522  9.61974227 19.70411458 12.83758096  7.75117142 10.39614031
 23.56568814  9.04287249 17.61434029 13.61264994 12.41895295 14.63059421
 15.36549523 12.91196831 12.44967293 12.02022165 16.18805828 17.47379948
 17.19147955 21.69255609 18.22173873  8.90050617 10.7849561  12.02942207
  6.84754722 13.66447529 22.17451249 13.50945016 22.52702726 11.88138574
 17.03988139 21.52399711 10.71565512  7.85644348 10.20198234  8.44594245
 13.001148   10.77141583 12.17524762  9.8569604  15.45704842 13.04055175
  5.84887331 20.59192848 22.47250008 24.37729901 14.37760834 10.94824196
 16.36148492 18.13598497 11.43279278 14.72792516 16.94299022  8.98448136
 19.51641741 10.89477891 22.74714758 21.18575245 15.70986415 14.876838  ]


In [15]:
# Evaluate the model on the test split: compare the predictions with the
# true test targets using R^2, MAE and MSE.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R2 score: ", r2)
print("Mean absolute error: ", mae)
print("Mean squared error: ", mse)

R2 score:  0.8760893953712908
Mean absolute error:  1.3857180211426177
Mean squared error:  4.071612632911027


In [16]:
# Read the fitted parameters off the trained model: coef_ holds one slope
# per feature column, intercept_ the constant term.
slope = lr.coef_
intercept = lr.intercept_

print("The slope values : ", slope)
print("The intercept value : ", intercept)

The slope values :  [ 0.04294217  0.1879147  -0.00541318]
The intercept value :  3.5238550353772666
