In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Data Preprocessing


In [59]:
# Load the raw dataset from the CSV file and preview its first five rows
df = pd.read_csv(filepath_or_buffer='Dummy Data HSS.csv')
df.head(n=5)

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.91341,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.24634
4,15.0,8.437408,1.405998,Micro,56.594181


In [60]:
# One-hot encode the categorical Influencer column: each of its categories
# (Macro, Mega, Micro, Nano) becomes a separate indicator column
df = pd.get_dummies(data=df)
df

Unnamed: 0,TV,Radio,Social Media,Sales,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,16.0,6.566231,2.907983,54.732757,False,True,False,False
1,13.0,9.237765,2.409567,46.677897,False,True,False,False
2,41.0,15.886446,2.913410,150.177829,False,True,False,False
3,83.0,30.020028,6.922304,298.246340,False,True,False,False
4,15.0,8.437408,1.405998,56.594181,False,False,True,False
...,...,...,...,...,...,...,...,...
4567,26.0,4.472360,0.717090,94.685866,False,False,True,False
4568,71.0,20.610685,6.545573,249.101915,False,False,False,True
4569,44.0,19.800072,5.096192,163.631457,False,False,True,False
4570,71.0,17.534640,1.940873,253.610411,True,False,False,False


In [61]:
# Convert the True/False indicator columns into 1/0 integers.
# A single astype() call replaces four copy-pasted .map() lines; it is also
# idempotent, so re-running this cell on columns that are already 0/1 leaves
# them unchanged (a {True: 1, False: 0} mapping only recognises booleans).
influencer_cols = [
    "Influencer_Macro",
    "Influencer_Mega",
    "Influencer_Micro",
    "Influencer_Nano",
]
# Explicit int64 keeps the resulting dtype the same on every platform.
df = df.astype({col: "int64" for col in influencer_cols})

df

Unnamed: 0,TV,Radio,Social Media,Sales,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano
0,16.0,6.566231,2.907983,54.732757,0,1,0,0
1,13.0,9.237765,2.409567,46.677897,0,1,0,0
2,41.0,15.886446,2.913410,150.177829,0,1,0,0
3,83.0,30.020028,6.922304,298.246340,0,1,0,0
4,15.0,8.437408,1.405998,56.594181,0,0,1,0
...,...,...,...,...,...,...,...,...
4567,26.0,4.472360,0.717090,94.685866,0,0,1,0
4568,71.0,20.610685,6.545573,249.101915,0,0,0,1
4569,44.0,19.800072,5.096192,163.631457,0,0,1,0
4570,71.0,17.534640,1.940873,253.610411,1,0,0,0


In [62]:
# Impute missing values: every NaN is replaced by the mean of its own column,
# then the remaining null count per column is displayed (all should be 0)
df = df.fillna(value=df.mean())
df.isna().sum()

TV                  0
Radio               0
Social Media        0
Sales               0
Influencer_Macro    0
Influencer_Mega     0
Influencer_Micro    0
Influencer_Nano     0
dtype: int64

In [63]:
# Inspect the column labels of the DataFrame after encoding and imputation
df.columns

Index(['TV', 'Radio', 'Social Media', 'Sales', 'Influencer_Macro',
       'Influencer_Mega', 'Influencer_Micro', 'Influencer_Nano'],
      dtype='object')

In [64]:
# Move the 'Sales' column to the last position, keeping every other column in
# its current relative order. The order is derived from df.columns instead of
# a hard-coded list, so it stays correct if the set of dummy columns changes.
feature_cols = [col for col in df.columns if col != 'Sales']
df = df[feature_cols + ['Sales']]

In [65]:
# Round every numeric value in the DataFrame to 2 decimal places
df = df.round(decimals=2)
df

Unnamed: 0,TV,Radio,Social Media,Influencer_Macro,Influencer_Mega,Influencer_Micro,Influencer_Nano,Sales
0,16.0,6.57,2.91,0,1,0,0,54.73
1,13.0,9.24,2.41,0,1,0,0,46.68
2,41.0,15.89,2.91,0,1,0,0,150.18
3,83.0,30.02,6.92,0,1,0,0,298.25
4,15.0,8.44,1.41,0,0,1,0,56.59
...,...,...,...,...,...,...,...,...
4567,26.0,4.47,0.72,0,0,1,0,94.69
4568,71.0,20.61,6.55,0,0,0,1,249.10
4569,44.0,19.80,5.10,0,0,1,0,163.63
4570,71.0,17.53,1.94,1,0,0,0,253.61


In [66]:
# Save the preprocessed data to a new CSV file (the row index is not written)
df.to_csv(path_or_buf="cleaned_data.csv", index=False)