# Forecasting the amount of a product that customers will purchase, taking various advertising factors into account

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import math
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection 
from sklearn.linear_model import LinearRegression

In [2]:
# Load the advertising dataset from a CSV file in the working directory and display it.
data=pd.read_csv('advertising.csv')
data

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,14.0
197,177.0,9.3,6.4,14.8
198,283.6,42.0,66.2,25.5


Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,14.0
197,177.0,9.3,6.4,14.8
198,283.6,42.0,66.2,25.5


In [3]:
# Summary statistics for every column; include='all' asks describe() to cover all dtypes.
data.describe(include='all')

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,15.1305
std,85.854236,14.846809,21.778621,5.283892
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,11.0
50%,149.75,22.9,25.75,16.0
75%,218.825,36.525,45.1,19.05
max,296.4,49.6,114.0,27.0


Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,15.1305
std,85.854236,14.846809,21.778621,5.283892
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,11.0
50%,149.75,22.9,25.75,16.0
75%,218.825,36.525,45.1,19.05
max,296.4,49.6,114.0,27.0


In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes) to spot missing values.
data.info()

In [None]:
# Peek at the first ten rows of the raw data.
data.iloc[:10]

In [None]:
# Distribution of the raw Sales column.
sns.histplot(data=data,x='Sales',color='green')

In [None]:
# Keep only observations strictly below the 99th percentile of Sales to trim the upper tail.
q=data['Sales'].quantile(0.99)
# .copy() makes data1 an independent frame rather than a slice of `data`, so adding
# columns to it later does not raise SettingWithCopyWarning or touch the raw data.
data1=data[data['Sales']<q].copy()
data1.describe(include='all')

In [None]:
# Sales distribution after trimming the top percentile.
sns.displot(data=data1,x='Sales',color='red')

In [None]:
# Distribution of the TV advertising column.
sns.displot(data=data1,x='TV',color='blue')

In [None]:
# Distribution of the Radio advertising column.
sns.displot(data=data1,x='Radio',color='violet')

In [None]:
# Distribution of the Newspaper advertising column.
sns.displot(data=data1,x='Newspaper',color='yellow')

In [None]:
# One panel per advertising channel: the channel goes on the x-axis and Sales on the y-axis.
# sharey=True is only meaningful when every panel plots the same quantity (Sales) on y;
# with the channels on y, Radio and Newspaper would be squeezed into TV's much larger range.
f,(ax1,ax2,ax3)=plt.subplots(1,3,sharey=True,figsize=(15,3))
ax1.scatter(data1['TV'],data1['Sales'])
ax1.set_title('TV and Sales')
ax1.set_xlabel('TV')
ax1.set_ylabel('Sales')
ax2.scatter(data1['Radio'],data1['Sales'])
ax2.set_title('Radio and Sales')
ax2.set_xlabel('Radio')
ax3.scatter(data1['Newspaper'],data1['Sales'])
ax3.set_title('Newspaper and Sales')
ax3.set_xlabel('Newspaper')
# Render the figure explicitly so the cell does not echo the repr of the last call.
plt.show()

In [None]:
# Natural log of Sales; this is the regression target used further down.
log_sale=np.log(data1['Sales'])
# assign() returns a new frame instead of writing into data1 in place, which avoids
# SettingWithCopyWarning when data1 is a filtered slice and keeps the cell safe to re-run.
data1=data1.assign(log_sale=log_sale)
data1

In [None]:
# Same three panels as before, now against the log-transformed target.
# Channel on x, log(Sales) on the shared y-axis so the panels are directly comparable.
f,(ax1,ax2,ax3)=plt.subplots(1,3,sharey=True,figsize=(15,3))
ax1.scatter(data1['TV'],data1['log_sale'])
ax1.set_title('TV and log(Sales)')
ax1.set_xlabel('TV')
ax1.set_ylabel('log(Sales)')
ax2.scatter(data1['Radio'],data1['log_sale'])
ax2.set_title('Radio and log(Sales)')
ax2.set_xlabel('Radio')
ax3.scatter(data1['Newspaper'],data1['log_sale'])
ax3.set_title('Newspaper and log(Sales)')
ax3.set_xlabel('Newspaper')
plt.show()

In [None]:
# Column labels of the working frame as a NumPy array.
data1.columns.to_numpy()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Multicollinearity check on the three advertising channels.
variables=data1[['TV','Radio','Newspaper']]
# variance_inflation_factor regresses each column on the other columns exactly as given.
# Without an intercept column the result is the uncentered VIF, which is inflated for
# features whose mean is far from zero, so a constant is added to the design matrix first.
exog=sm.add_constant(variables)
vif=pd.DataFrame()
# Column 0 of exog is the constant; report VIFs only for the real features (1..k).
vif["VIF"]=[variance_inflation_factor(exog.values,i) for i in range(1,exog.shape[1])]
vif['features']=variables.columns
vif

In [None]:
# Regression target: log-transformed sales.
targets=data1['log_sale']
# Drop the raw Sales column together with log_sale. Leaving Sales among the inputs leaks
# the target into the features (log_sale is a function of Sales), which makes the fit
# look near-perfect while the model learns nothing about the advertising channels.
# NOTE: the name `input` shadows the Python builtin; it is kept because later cells use it.
input=data1.drop(['log_sale','Sales'],axis=1)

In [None]:
scaler=StandardScaler()
scaler.fit(input)
input_scaled=scaler.transform(input)

In [None]:
x_train, x_test, y_train, y_test= train_test_split(input_scaled,targets,test_size= 0.2,random_state=365)

In [None]:
# Fit a linear regression on the training split; fit() returns the estimator itself.
reg=LinearRegression().fit(x_train,y_train)
reg

In [None]:
# Predicted vs. actual log-sales on the training split.
y_hat=reg.predict(x_train)
fig,ax=plt.subplots()
ax.scatter(y_train,y_hat)
ax.set_xlabel('Targets(y_train)',size=10)
ax.set_ylabel('Predictions(y_hat)',size=10)
# Same window on both axes so a perfect fit would lie on the diagonal.
ax.set_xlim(1,4)
ax.set_ylim(1,4)
plt.show()

In [None]:
# Distribution of training residuals (actual minus predicted log-sales).
train_residuals=y_train-y_hat
sns.displot(train_residuals)

In [None]:
# R-squared of the fitted model on the training split.
reg.score(X=x_train,y=y_train)

In [None]:
# Fitted intercept of the linear model (the target is log-sales).
reg.intercept_

In [None]:
# Fitted coefficients, one per input column; the inputs were standardized before fitting.
reg.coef_

In [None]:
# Feature/weight table for the fitted model. It is stored in a plain variable rather than
# as an ad-hoc attribute on the estimator (`reg.summary`), which is not part of the
# LinearRegression API and is easy to mistake for a built-in summary.
reg_summary=pd.DataFrame(input.columns.values,columns=['Features'])
reg_summary['Weights']=reg.coef_
reg_summary

In [None]:
# Predicted vs. actual log-sales on the held-out split.
y_hat_test=reg.predict(x_test)
fig,ax=plt.subplots()
ax.scatter(y_test,y_hat_test,alpha=0.7)
ax.set_xlabel('Targets(y_test)',size=10)
ax.set_ylabel('Predictions(y_hat_test)',size=10)
# Same window on both axes so a perfect fit would lie on the diagonal.
ax.set_xlim(1,4)
ax.set_ylim(1,4)
plt.show()

In [None]:
# Undo the log transform so predictions are back in the original Sales units.
data2=pd.DataFrame({'Prediction':np.exp(y_hat_test)})
data2.head()

In [None]:
# Column assignment aligns on index labels. y_test may still carry the shuffled row labels
# from the original frame while data2 has a fresh 0..n-1 index, so assigning the Series
# directly would yield NaNs or mismatched rows. Assign positionally via the raw values.
data2['Target']=np.exp(y_test.to_numpy())
data2

In [None]:
# Replace y_test's index with a fresh 0..n-1 range (old labels discarded) so that it
# lines up with data2 when assigned as a column.
y_test=y_test.reset_index(drop=True)
y_test.head()

In [None]:
# Actual sales in original units, placed next to the predictions.
data2=data2.assign(Target=np.exp(y_test))
data2

In [None]:
# Error in Sales units, and its absolute size as a percentage of the actual value.
data2['Residual']=data2['Target'].sub(data2['Prediction'])
pct_error=data2['Residual']/data2['Target']*100
data2['Difference%']=pct_error.abs()
data2

In [None]:
# Summary statistics of the columns in data2.
data2.describe()

In [None]:
# Show up to 100 rows with two-decimal floats, ordered from smallest to largest percentage error.
pd.set_option('display.max_rows',100)
pd.set_option('display.float_format','{:.2f}'.format)
data2.sort_values(by='Difference%')