# Problem Statement

You have been provided with a dataset that contains the costs of advertising on different media channels and the corresponding sales of XYZ firm. Evaluate the dataset to:

1. Find the features or media channels used by the firm

2. Find the sales figures for each channel

3. Create a model to predict the sales outcome

4. Split it into training and testing datasets for the model

5. Calculate the mean squared error (MSE)


In [2]:
# import the required libraries
import pandas as pd

In [4]:
# import the advertising dataset
df_adv_data = pd.read_csv('Advertising.csv', index_col=0)
print('File loaded')

File loaded


In [5]:
# view top 5 records
df_adv_data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [7]:
# view the dataset size
df_adv_data.size

800

In [8]:
# view the shape of the dataset
df_adv_data.shape

(200, 4)

In [9]:
# view the columns of the dataset
df_adv_data.columns

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [10]:
# create a feature object from the columns
X_feature = df_adv_data[['Newspaper','Radio','TV']]

In [13]:
# view feature object
X_feature.head()

Unnamed: 0,Newspaper,Radio,TV
1,69.2,37.8,230.1
2,45.1,39.3,44.5
3,69.3,45.9,17.2
4,58.5,41.3,151.5
5,58.4,10.8,180.8


In [14]:
# create target object from sales column which is a response in the dataset
Y_target = df_adv_data[['Sales']]

In [15]:
# view the target object
Y_target.head()

Unnamed: 0,Sales
1,22.1
2,10.4
3,9.3
4,18.5
5,12.9


In [None]:
# view the future object

In [19]:
X_feature.shape

(200, 3)

In [20]:
# view target object shape
Y_target.shape

(200, 1)

In [28]:
# split test and training data
# by default 75% training data and 25% testing data
from sklearn.model_selection   import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_feature,Y_target,random_state=1)

In [29]:
# view shape of train and test datasets fro both feature and response
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(150, 3)
(150, 1)
(50, 3)
(50, 1)


In [34]:
# Linear regression model 
from sklearn.linear_model import LinearRegression


In [35]:
linreg = LinearRegression()

In [36]:
linreg.fit(x_train, y_train)

LinearRegression()

In [None]:
# print the intercepts and coefficients

In [37]:
print(linreg.coef_)
print(linreg.copy_X)

[[0.00345046 0.17915812 0.04656457]]
True


In [38]:
# prediction
y_pred = linreg.predict(x_test)
y_pred

array([[21.70910292],
       [16.41055243],
       [ 7.60955058],
       [17.80769552],
       [18.6146359 ],
       [23.83573998],
       [16.32488681],
       [13.43225536],
       [ 9.17173403],
       [17.333853  ],
       [14.44479482],
       [ 9.83511973],
       [17.18797614],
       [16.73086831],
       [15.05529391],
       [15.61434433],
       [12.42541574],
       [17.17716376],
       [11.08827566],
       [18.00537501],
       [ 9.28438889],
       [12.98458458],
       [ 8.79950614],
       [10.42382499],
       [11.3846456 ],
       [14.98082512],
       [ 9.78853268],
       [19.39643187],
       [18.18099936],
       [17.12807566],
       [21.54670213],
       [14.69809481],
       [16.24641438],
       [12.32114579],
       [19.92422501],
       [15.32498602],
       [13.88726522],
       [10.03162255],
       [20.93105915],
       [ 7.44936831],
       [ 3.64695761],
       [ 7.22020178],
       [ 5.9962782 ],
       [18.43381853],
       [ 8.39408045],
       [14

In [39]:
# Import required libraries for calculating MSE (mean square error) to measure accuracy
from sklearn import metrics
import numpy as np

In [40]:
 print(np.sqrt(metrics.mean_squared_error(y_test,y_pred))) # MSE

1.404651423032894
