# Importing modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns

# Search Dataset from Kaggle

In [2]:
# Shell command (IPython "!" syntax): search Kaggle for the dataset by its owner/slug reference.
!kaggle datasets list -s 'bumba5341/advertisingcsv'

ref                       title            size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------  ---------------  ----  -------------------  -------------  ---------  ---------------  
bumba5341/advertisingcsv  Advertising.csv   2KB  2020-04-01 06:52:08          17409         36  0.3529412        


# Download the dataset

In [3]:
# Shell command: download the dataset archive (advertisingcsv.zip).
# The CLI skips the download when a more recent local copy already exists.
!kaggle datasets download -d bumba5341/advertisingcsv

advertisingcsv.zip: Skipping, found more recently modified local copy (use --force to force download)


# Unzipping the dataset

In [4]:
import zipfile

In [5]:
# Unpack every member of the downloaded archive into the local "adv" directory.
with zipfile.ZipFile("advertisingcsv.zip", mode="r") as archive:
    archive.extractall(path="adv")

In [6]:
import os

In [7]:
# Show what was extracted into "adv" to confirm the CSV is present.
os.listdir("adv")

['Advertising.csv']

# Read the dataset

In [8]:
# Load the extracted CSV into a DataFrame.
csv_path = 'adv/Advertising.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Keep only the three advertising channels and the target column.
# .copy() makes the result an independent frame rather than a slice of the
# loaded data, so the later in-place write to df['Newspaper'] is unambiguous
# and does not trigger pandas' SettingWithCopy behaviour.
df = df[['TV', 'Radio', 'Newspaper', 'Sales']].copy()
df.head()

# Checking null values in the dataframe

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Visual check for missing values: heatmap of the boolean missing-value mask.
missing_mask = df.isna()
sns.heatmap(missing_mask)

Observation: No null values are present in our dataset.

In [None]:
# (rows, columns) of the working frame.
df.shape

Total Rows: 200 Total Columns: 4

In [None]:
# Data type of each column.
df.dtypes

All the features have numerical data

In [None]:
# Column labels of the working frame.
df.columns

# EDA:

In [None]:
# Univariate view: one histogram per float64 / int64 column.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_cols:
    sns.histplot(data=df, x=col)
    plt.show()

Observation:

1. The amount most frequently spent on TV advertising is in the range of 200-230 dollars.

2. The amount most frequently spent on radio advertising is in the range of 0-8 dollars.

3. The amount most frequently spent on newspaper advertising is in the range of 0-10 dollars.

Bivariate analysis

In [None]:
# Bivariate view: scatter each advertising channel against Sales.
# The target itself is excluded; Sales-vs-Sales is a trivial diagonal that
# carries no information.
for feature in df.columns.drop('Sales'):
    plt.xticks(rotation=90)
    sns.scatterplot(x=feature, y='Sales', data=df)
    plt.show()

Observation: 
    
1. As the advertising dollars spent on TV increase, sales also increase.

2. As the advertising dollars spent on radio increase, sales also increase.

3. There is no clear relationship between the dollars spent on newspaper advertising and sales.

# Data Set Description

In [None]:
# Summary statistics for every column.
df.describe()

In [None]:
# Heatmap of the summary statistics: the first describe() row is skipped and
# the table is transposed so each dataset column becomes one heatmap row.
summary = df.describe().iloc[1:].T.round(2)
plt.figure(figsize=(15, 7))
sns.heatmap(summary, annot=True, fmt='.2f', linewidth=1, linecolor='yellow')
plt.show()

Observation:

1. The mean and median for TV and radio are almost equal, indicating they might be symmetrically (possibly normally) distributed.

2. Newspaper seems to be right-skewed since mean > median.

3. There is a large difference between the min and the 25th percentile for TV, indicating outliers might be present.

4. There is a large difference between the 75th percentile and the max for newspaper, indicating outliers might be present.

# Correlation of columns with target columns

In [None]:
# Correlation of every column with the target, in ascending order.
corr_matrix = df.corr()
corr_matrix['Sales'].sort_values()

Observation:

1. TV has a correlation coefficient of about 0.78 with the target.

2. Radio has a correlation coefficient of about 0.58 with the target.

3. Newspaper has a correlation coefficient of about 0.23 with the target.

# Checking Skewness:

In [None]:
# Skewness of each column before any transformation.
df.skew()

Keeping +/-0.5 as the acceptable range for skewness, Newspaper is the only column that does not lie within this range and needs further treatment.

In [None]:
# Distribution of every column: density-normalised histogram with a KDE overlay.
# sns.histplot replaces the deprecated sns.distplot and matches the histplot
# call already used in the EDA section.
for col in df.columns:
    plt.xticks(rotation=90)
    sns.histplot(df[col], kde=True, stat='density')
    plt.show()

Observation: TV and radio are normally distributed. Newspaper is right skewed.

# Outliers Checking:

In [None]:
# One box plot per column on a 2x2 grid, used to look for outliers.
df.plot.box(subplots=True, layout=(2, 2), figsize=(10, 10))

Observation: From the above plots we can see very few outliers in newspaper, but these data points are very close to the whiskers and need not be treated as outliers.

# Data Preprocessing

# Transformation to remove skewness

In [None]:
# Square-root transform to reduce the right skew of Newspaper.
# .assign builds a new frame instead of writing into df in place, so this is
# safe even when df is a slice of the originally loaded data.
# NOTE(review): this cell is not idempotent. Re-running it without re-running
# the load/selection cells applies the square root a second time.
df = df.assign(Newspaper=np.sqrt(df['Newspaper']))

In [None]:
# Skewness of each column after the square-root transform of Newspaper.
df.skew()


In [None]:
# Distribution of the transformed Newspaper column.
# sns.histplot replaces the deprecated sns.distplot.
sns.histplot(df['Newspaper'], kde=True, stat='density')

# Separating the data into features and target

In [None]:
# Feature matrix: every column except the last one.
feature_cols = df.columns[:-1]
x = df[feature_cols]

In [None]:
# Target vector: the last column of the frame.
target_col = df.columns[-1]
y = df[target_col]

# Since our target is a continuous variable, we will use regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

lr = LinearRegression()

# Try 100 different 70/30 splits and report those where the train and test R²
# agree to three decimals. The split with the smallest train/test gap is
# remembered so it can be re-created after the loop.
best_state, best_gap = None, float('inf')
for i in range(0, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=i)
    lr.fit(x_train, y_train)
    train_score = r2_score(y_train, lr.predict(x_train))
    test_score = r2_score(y_test, lr.predict(x_test))
    if round(train_score, 3) == round(test_score, 3):
        print('training score :', train_score)
        print('testing score :', test_score)
        print('At random_state=', i, "the model performs well", '\n')
    gap = abs(train_score - test_score)
    if gap < best_gap:
        best_state, best_gap = i, gap

# Without this step the variables below would hold whatever the LAST iteration
# (random_state=99) produced, and every later cell would silently use that
# arbitrary split instead of the one selected above.
# NOTE(review): picking a split by looking at the test score is a form of
# selection on the test set; treat the cross-validation score as the less
# biased estimate.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=best_state)
lr.fit(x_train, y_train)
train_pred = lr.predict(x_train)
test_pred = lr.predict(x_test)
print('Selected random_state=', best_state)

# Cross Validation Of the Model:

In [None]:
from sklearn.model_selection import cross_val_score

# The train/test R² come from the fixed split above and do not depend on the
# number of folds, so they are printed once instead of on every iteration.
print('training score :', r2_score(y_train, train_pred))
print('testing score :', r2_score(y_test, test_pred))
print('\n')

# Mean cross-validated R² of the linear model for k = 2..14 folds.
for k in range(2, 15):
    cv_score = cross_val_score(lr, x, y, cv=k)
    cv_mean = cv_score.mean()
    print('At crossfold=', k, 'the cv score is', cv_mean)

The number of folds does not have much impact on the cross-validation score, so cv=4 is selected. Here we have checked for overfitting and underfitting by comparing the training and testing scores.

In [None]:
# Predicted vs. actual values on the test set for the linear model.
# The blue y = x line marks perfect predictions.
plt.figure(figsize=(5, 5))
plt.scatter(y_test, test_pred, color='r')
plt.plot(y_test, y_test, color='b', linewidth=4)  # numeric width, not the string '4'
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.show()

Observation: Some predicted values are away from the best fit line.

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression evaluated on the hold-out split and with 4-fold CV.
rg = Ridge(alpha=18)
rg.fit(x_train, y_train)
rgpred = rg.predict(x_test)

cv_score = cross_val_score(rg, x, y, cv=4)
cv_mean = cv_score.mean()
rg_mse = mean_squared_error(y_test, rgpred)  # computed once, reused for RMSE

print('testing score :', r2_score(y_test, rgpred))
print('training score :', rg.score(x_train, y_train))
print('The CV score is:', cv_mean)
print('Error:')
print('Mean absolue error:', mean_absolute_error(y_test, rgpred))
print('Mean squared error:', rg_mse)
print('Root Mean squared error:', np.sqrt(rg_mse))

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree evaluated on the hold-out split and with 4-fold CV.
# random_state is fixed so the reported scores are reproducible across runs.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(x_train, y_train)
dtrpred = dtr.predict(x_test)

cv_score = cross_val_score(dtr, x, y, cv=4)
cv_mean = cv_score.mean()
dtr_mse = mean_squared_error(y_test, dtrpred)  # computed once, reused for RMSE

print('testing score :', r2_score(y_test, dtrpred))
print('training score :', dtr.score(x_train, y_train))
print('The CV score is:', cv_mean)
print('Error:')
print('Mean absolue error:', mean_absolute_error(y_test, dtrpred))
print('Mean squared error:', dtr_mse)
print('Root Mean squared error:', np.sqrt(dtr_mse))

# Support Vector Regressor

In [None]:
from sklearn.svm import SVR
# NOTE(review): these classification metrics are not used anywhere in the
# visible notebook (this is a regression task); kept in case a later cell
# relies on them.
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Support vector regressor (default settings) evaluated on the hold-out split
# and with 4-fold CV.
svr = SVR()
svr.fit(x_train, y_train)
svrpred = svr.predict(x_test)

cv_score = cross_val_score(svr, x, y, cv=4)
cv_mean = cv_score.mean()
svr_mse = mean_squared_error(y_test, svrpred)  # computed once, reused for RMSE

print('The CV score is:', cv_mean)
print('testing score :', r2_score(y_test, svrpred))
print('training score :', svr.score(x_train, y_train))
print('Error:')
print('Mean absolue error:', mean_absolute_error(y_test, svrpred))
print('Mean squared error:', svr_mse)
print('Root Mean squared error:', np.sqrt(svr_mse))

# KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor (default settings) evaluated on the hold-out
# split and with 4-fold CV.
knr = KNeighborsRegressor()
knr.fit(x_train, y_train)
knrpred = knr.predict(x_test)

cv_score = cross_val_score(knr, x, y, cv=4)
cv_mean = cv_score.mean()
knr_mse = mean_squared_error(y_test, knrpred)  # computed once, reused for RMSE

print('The CV score is:', cv_mean)
print('testing score :', r2_score(y_test, knrpred))
print('training score :', knr.score(x_train, y_train))
print('Error:')
print('Mean absolue error:', mean_absolute_error(y_test, knrpred))
print('Mean squared error:', knr_mse)
print('Root Mean squared error:', np.sqrt(knr_mse))

# Ensemble Methods

Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest evaluated on the hold-out split and with 4-fold CV.
# random_state is fixed so the reported scores (and the final comparison that
# uses rfpred) are reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
rfpred = rf.predict(x_test)

cv_score = cross_val_score(rf, x, y, cv=4)
cv_mean = cv_score.mean()
rf_mse = mean_squared_error(y_test, rfpred)  # computed once, reused for RMSE

print('The CV score is:', cv_mean)
print('testing score :', r2_score(y_test, rfpred))
print('training score :', rf.score(x_train, y_train))
print('Error:')
print('Mean absolue error:', mean_absolute_error(y_test, rfpred))
print('Mean squared error:', rf_mse)
print('Root Mean squared error:', np.sqrt(rf_mse))

Adaboost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost (300 estimators) evaluated on the hold-out split and with 4-fold CV.
# random_state is fixed so the reported scores are reproducible across runs.
ad = AdaBoostRegressor(n_estimators=300, random_state=42)
ad.fit(x_train, y_train)
adpred = ad.predict(x_test)

cv_score = cross_val_score(ad, x, y, cv=4)
cv_mean = cv_score.mean()
ad_mse = mean_squared_error(y_test, adpred)  # computed once, reused for RMSE

print('The CV score is:', cv_mean)
print('testing score :', r2_score(y_test, adpred))
print('training score :', ad.score(x_train, y_train))
print('Error:')
print('Mean absolue error:', mean_absolute_error(y_test, adpred))
print('Mean squared error:', ad_mse)
print('Root Mean squared error:', np.sqrt(ad_mse))

# CONCLUSION:

Comparing the performance metrics, we select Random Forest as our final model, as it performs extremely well in comparison to the other models.

In [None]:
# Side-by-side table of actual test targets and the random-forest predictions.
original, predicted = np.array(y_test), np.array(rfpred)
df_com = pd.DataFrame(dict(Original=original, Predicted=predicted))
df_com

In [None]:
# Predicted vs. actual values on the test set for the random forest.
# The blue y = x line marks perfect predictions.
plt.figure(figsize=(10, 10))
plt.scatter(y_test, rfpred, color='r')
plt.plot(y_test, y_test, color='b', linewidth=4)  # numeric width, not the string '4'
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.show()