In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Business Problems

For example:
* How should we allocate our limited marketing budget for next year?
* What type on influencers should we focus on?
* Can we cut budget from TV, as it is too expensive?
* Finally, set your assumptions of the marketing budget and influencer (if any). Can you try to predict the expected sales based on the best ML model? 

# Importing the Libraries
First we import the required libraries:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

# Reading the Data

In [3]:
df = pd.read_csv('Dummy Data HSS.csv')

We want to look if there's any missing data, as follows:

We can see there are mising data in several columns. Thus, we want to fill the missing data with its average, as follows:

Now we have a dataframe that has no missing values. Now, we want to encode the categorical variable to dummy variables, as follows:

In [4]:
df = pd.get_dummies(df) # updates the df again

# see the updated df
df["Influencer_Macro"] = df["Influencer_Macro"].map({True: 1, False: 0})
df["Influencer_Mega"] = df["Influencer_Mega"].map({True: 1, False: 0})
df["Influencer_Micro"] = df["Influencer_Micro"].map({True: 1, False: 0})
df["Influencer_Nano"] = df["Influencer_Nano"].map({True: 1, False: 0})
df

Unnamed: 0,TV,Radio,Social Media,Sales,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,16.0,6.566231,2.907983,54.732757,0,1,0,0
1,13.0,9.237765,2.409567,46.677897,0,1,0,0
2,41.0,15.886446,2.913410,150.177829,0,1,0,0
3,83.0,30.020028,6.922304,298.246340,0,1,0,0
4,15.0,8.437408,1.405998,56.594181,0,0,1,0
...,...,...,...,...,...,...,...,...
4567,26.0,4.472360,0.717090,94.685866,0,0,1,0
4568,71.0,20.610685,6.545573,249.101915,0,0,0,1
4569,44.0,19.800072,5.096192,163.631457,0,0,1,0
4570,71.0,17.534640,1.940873,253.610411,1,0,0,0


In [5]:
col_avg = df.mean()

df = df.fillna(col_avg) # updates the df


Now for the purpose of simplicity, we shift the column 'Sales' to the end of the table, as follows:

In [6]:
df.columns # getting the column names

Index(['TV', 'Radio', 'Social Media', 'Sales', 'Influencer_Macro',
       'Influencer_Mega', 'Influencer_Micro', 'Influencer_Nano'],
      dtype='object')

In [7]:
df = df[['TV', 'Radio', 'Social Media', 'Influencer_Macro',
       'Influencer_Mega', 'Influencer_Micro', 'Influencer_Nano', 'Sales']]

# see the updated df

df.head()

Unnamed: 0,TV,Radio,Social Media,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano,Sales
0,16.0,6.566231,2.907983,0,1,0,0,54.732757
1,13.0,9.237765,2.409567,0,1,0,0,46.677897
2,41.0,15.886446,2.91341,0,1,0,0,150.177829
3,83.0,30.020028,6.922304,0,1,0,0,298.24634
4,15.0,8.437408,1.405998,0,0,1,0,56.594181


Out of curiosity, we just want to explore whether there is any correlation of Sales with its predictors:

In [8]:
df.corr()

Unnamed: 0,TV,Radio,Social Media,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano,Sales
TV,1.0,0.866885,0.52701,0.021335,-0.01263,-0.004863,-0.003645,0.996652
Radio,0.866885,1.0,0.606793,0.009518,-0.005071,0.004212,-0.008601,0.867369
Social Media,0.52701,0.606793,1.0,0.011631,0.013072,-0.013312,-0.011351,0.528121
Influencer_Macro,0.021335,0.009518,0.011631,1.0,-0.332131,-0.331171,-0.328482,0.019267
Influencer_Mega,-0.01263,-0.005071,0.013072,-0.332131,1.0,-0.338211,-0.335465,-0.011701
Influencer_Micro,-0.004863,0.004212,-0.013312,-0.331171,-0.338211,1.0,-0.334495,-0.004099
Influencer_Nano,-0.003645,-0.008601,-0.011351,-0.328482,-0.335465,-0.334495,1.0,-0.003289
Sales,0.996652,0.867369,0.528121,0.019267,-0.011701,-0.004099,-0.003289,1.0


Now we have a clean data. Now, we will create the independent and dependent variables (x and y), as follows:

In [9]:
x = df.iloc[:,0:-1].values
y = df.iloc[:,-1:].values

Let's see some preview of x and y:

Looks good. Now, let's split our data for training and testing:

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

Let's see some previews:

We want to know the number of data used for training and testing, as follows:

In [11]:
print(len(x_train),len(x_test))

3429 1143


# Analysis

Now we will train and predict the data based on several regression models:
* Linear
* Random Forest
* Decision Tree
* Support Vector
* Polynomial

For each regression model, we will evaluate its r2_score and root mean squared error (RMSE). The higher r2_score the better; the lower RMSE, the better.

## Linear Regression

In [12]:
lr_regressor = LinearRegression() # instantiate the Linear Regression module
lr_regressor.fit(x_train, y_train) # training the data

# after training the data, perform prediction:

y_pred_lr = lr_regressor.predict(x_test) # this is the prediction 

# evaluate the r2_score and RMSE between prediction and real data

print(r2_score(y_test, y_pred_lr))
print(mse(y_test, y_pred_lr))

0.9968356233955061
28.27802874588883


In [13]:
print(lr_regressor.coef_, lr_regressor.intercept_)

[[ 3.50644297e+00  1.22712795e-01  9.21659192e-02 -3.31376131e-01
   1.78149571e-01  1.54478844e-01 -1.25228393e-03]] [0.31700915]


In [14]:
df.columns

Index(['TV', 'Radio', 'Social Media', 'Influencer_Macro', 'Influencer_Mega',
       'Influencer_Micro', 'Influencer_Nano', 'Sales'],
      dtype='object')

Linear Regression model shows that:

**Sales = 3.50 TV + 0.14 Radio + 0.03 Social Media - 0.43 Macro + 0.13 Mega + 0.07 Micro + 0.22 Nano**

## Random Forest Regression

In [15]:
rf_regressor = RandomForestRegressor() # instantiate the Random Forest Regression module
rf_regressor.fit(x_train, y_train) # training the data

# after training the data, perform prediction:

y_pred_rf = rf_regressor.predict(x_test) # prediction data

# evaluate the r2_score and RMSE between prediction and real data

print(r2_score(y_test, y_pred_rf))
print(mse(y_test, y_pred_rf)**0.5)

  return fit_method(estimator, *args, **kwargs)


0.997987674134219
4.24062274205614


## Decision Tree Regression

In [16]:
# Similar processes with the above two examples

dt_regressor = DecisionTreeRegressor()
dt_regressor.fit(x_train, y_train)

y_pred_dt = dt_regressor.predict(x_test)

print(r2_score(y_test, y_pred_dt))
print(mse(y_test, y_pred_dt)**0.5)

0.9977611498265825
4.472939240655621


## Support Vector Regression

For SVR, we need to perform feature scaling. In short, this is required because of some mathematical assumptions. So first we will perform feature scaling:

### Feature Scaling for Support Vector Regression

To avoid confusion of variable names, we will create new independent and dependent variable names: a and b

In [17]:
a = df.iloc[:,0:-1].values
b = df.iloc[:,-1:].values

# reshape b

b = b.reshape(len(b), 1)

In [18]:
# now we split to train and test

a_train, a_test, b_train, b_test = train_test_split(a, b)

In [19]:
# now perform scaling

scala = StandardScaler()
scalb = StandardScaler()

a_train = scala.fit_transform(a_train)
b_train = scalb.fit_transform(b_train)

In [20]:
# training the SVR 

sv_regressor = SVR()
sv_regressor.fit(a_train, b_train)

  y = column_or_1d(y, warn=True)


In [21]:
# create the prediction

b_pred = scalb.inverse_transform(sv_regressor.predict(scala.transform(a_test)))

ValueError: Expected 2D array, got 1D array instead:
array=[-0.36947589  1.47749081  1.68874006 ... -0.21499922 -1.2702504
 -0.29254207].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# evaluate the model

print(r2_score(b_test, b_pred))
print(mse(b_test, b_pred)**0.5)

0.9963172882900326
5.667372043551951


### Polynomial Regression

For Polynomial Regression, we also have to perform polynomial feature scaling. 

In [None]:
poly = PolynomialFeatures()
po_regressor = LinearRegression()

# training the data

po_regressor.fit(poly.fit_transform(x_train), y_train)

LinearRegression()

In [None]:
# prediction training

y_pred_po = po_regressor.predict(poly.fit_transform(x_test))

# evaluate the model

print(r2_score(y_test, y_pred_po))
print(mse(y_test, y_pred_po)**0.5)

0.9903363698869221
9.194086660017375


## Model Evaluation

* Based on our analysis, it can be inferred that Support Vector Regression yields the highest R2 score and lowest RMSE. Thus, SVR will be the chosen model. We can use SVR model to predict the sales based on the input strategy.
* Based on linear regression, we can infer that we can focus on TV, Radio, and Nano Influencer. We can cut budget related to other factors.

# Concluding Remarks

* I want this notebook to be as understandable as possible so students or people like me who are not from math/data science/computer science background can infer easily what is happening in each line of code. 
* I understand there will be several math/ML/any other assumptions that I might have skipped in this notebook (e.g. assumptions for Linear Regression, feature scaling, parameter tuning). Please let me know in the comments to improve it.
* Overall, this is part of my learning journey as well as a business management scholar. So, I will just focus on the 'broad overview' of the algorithms, and shed a light more on *making sense* of how data can be used for better decision making

You can upvote if you like this notebook. Happy (machine) learning!