In [2]:
import numpy as np
import pandas as pd

# Load data from CSV
file_path = r'D:\Uni\Semester 05\EN3150 - Pattern Recognition\Assignments\01\Advertising.csv'
df = pd.read_csv(file_path)
print(df.head())


   sample index     TV  radio  newspaper  sales
0             1  230.1   37.8       69.2   22.1
1             2   44.5   39.3       45.1   10.4
2             3   17.2   45.9       69.3    9.3
3             4  151.5   41.3       58.5   18.5
4             5  180.8   10.8       58.4   12.9


In [3]:
from sklearn.model_selection import train_test_split

X = df[['TV', 'radio', 'newspaper']]  # Independent variables
y = df['sales']  # Dependent variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Test dataset elements",X.shape)
print("Train dataset elements",X_train.shape)
print("Test dataset elements",X_test.shape)



Test dataset elements (200, 3)
Train dataset elements (160, 3)
Test dataset elements (40, 3)


In [4]:
from sklearn import datasets, linear_model

model = linear_model.LinearRegression()

# Train the model using the training sets
model.fit(X_train, y_train)

# The coefficients
print("Coefficients: \n", model.coef_)
# The intercept
print("Intercept: \n", model.intercept_)



Coefficients: 
 [ 0.04458402  0.19649703 -0.00278146]
Intercept: 
 2.994893030495323


In [5]:
from sklearn.metrics import mean_squared_error, r2_score

#predictions on training data
yhat = model.predict(X_train)

#RSS
RSS = np.sum((y_train - yhat)**2)
print('RSS = ', RSS)

#RSE
N = len(y_train)
p = X_train.shape[1]  #number of features
RSE = np.sqrt(RSS / (N - p - 1))
print('RSE = ', RSE)

#TSS
TSS = np.sum((y_train - np.mean(y_train))**2)
print('TSS = ', TSS)

#R^2
R2 = 1 - (RSS / TSS)
print('R2 = ', R2)


from scipy.stats import t

w_1 = model.coef_
sigma2 = np.var(y_train - yhat)

#std. errors for each feature
X_mean = X_train.mean().values  #mean values of each feature
SE2w1 = sigma2 / ((X_train - X_mean)**2).sum()
print('Standard errors for coefficients : ', np.sqrt(SE2w1).values) #tv,radio,newspaper

#t-statistic for each feature
tw1 = w_1 / np.sqrt(SE2w1)
print('t-statistic for coefficients: ', tw1.values) #tv,radio,newspaper

#p-value for each feature
pw1 = 2 * (1 - t.cdf(np.abs(tw1), df = N-p-1))
print('p-value for coefficients: ', pw1) #tv,radio,newspaper


RSS =  385.0903609310249
RSE =  1.571154974751395
TSS =  4127.951
R2 =  0.9067114990146383
Standard errors for coefficients :  [0.00144818 0.00834903 0.00573721]
t-statistic for coefficients:  [30.78630178 23.53530995 -0.48481154]
p-value for coefficients:  [0.         0.         0.62849009]


In [7]:
#predictions on testing data
yhat = model.predict(X_test)

#RSS
RSS = np.sum((y_test - yhat)**2)
print('RSS = ', RSS)

#RSE
N = len(y_test)
p = X_test.shape[1]  #number of features
RSE = np.sqrt(RSS / (N - p - 1))
print('RSE = ', RSE)

#TSS
TSS = np.sum((y_test - np.mean(y_test))**2)
print('TSS = ', TSS)

#R^2
R2 = 1 - (RSS / TSS)
print('R2 = ', R2)


w_1 = model.coef_
sigma2 = np.var(y_test - yhat)

#std. errors for each feature
X_mean = X_test.mean().values  #mean values of each feature
SE2w1 = sigma2 / ((X_test - X_mean)**2).sum()
print('Standard errors for coefficients : ', np.sqrt(SE2w1).values) #tv,radio,newspaper

#t-statistic for each feature
tw1 = w_1 / np.sqrt(SE2w1)
print('t-statistic for coefficients: ', tw1.values) #tv,radio,newspaper

#p-value for each feature
pw1 = 2 * (1 - t.cdf(np.abs(tw1), df = N-p-1))
print('p-value for coefficients: ', pw1) #tv,radio,newspaper


RSS =  176.08473165798722
RSE =  2.2116153702791994
TSS =  1258.7777499999997
R2 =  0.8601145185017869
Standard errors for coefficients :  [0.00376895 0.02171458 0.01442878]
t-statistic for coefficients:  [11.82930416  9.04908202 -0.19277191]
p-value for coefficients:  [5.81756865e-14 8.35869152e-11 8.48221020e-01]


In [10]:
#define budget allocations for different scenarios
budget_scenario_1 = [25000, 25000, 0] #25000$ to TV, 25000$ to radio
budget_scenario_2 = [50000, 0, 0]  #50000$ to TV
budget_scenario_3 = [0, 50000, 0]  #50000$ to radio

#make predictions for each scenario
sales_scenario_1 = model.predict([budget_scenario_1])
sales_scenario_2 = model.predict([budget_scenario_2])
sales_scenario_3 = model.predict([budget_scenario_3])

#print the predicted sales for each scenario
print("Predicted Sales (Scenario 1):", sales_scenario_1[0])
print("Predicted Sales (Scenario 2):", sales_scenario_2[0])
print("Predicted Sales (Scenario 3):", sales_scenario_3[0])


Predicted Sales (Scenario 1): 6030.021249914729
Predicted Sales (Scenario 2): 2232.1958990287117
Predicted Sales (Scenario 3): 9827.846600800745


