In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
sns.set(style= 'darkgrid')

In [1]:
pip freeze


absl-py==2.1.0
annotated-types==0.7.0
asttokens==2.4.1
astunparse==1.6.3
attrs==24.2.0
blis==1.0.1
catalogue==2.0.10
certifi==2024.8.30
charset-normalizer==3.4.0
click==8.1.7
cloudpathlib==0.20.0
colorama==0.4.6
comm==0.2.2
confection==0.1.5
contourpy==1.3.0
cycler==0.12.1
cymem==2.0.10
debugpy==1.8.7
decorator==5.1.1
executing==2.1.0
fastjsonschema==2.20.0
flatbuffers==24.3.25
fonttools==4.54.1
gast==0.6.0
google-pasta==0.2.0
grpcio==1.68.1
h5py==3.12.1
idna==3.10
ipykernel==6.29.5
ipython==8.28.0
jedi==0.19.1
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jupyter_client==8.6.3
jupyter_core==5.7.2
keras==3.7.0
kiwisolver==1.4.7
langcodes==3.5.0
language_data==1.3.0
libclang==18.1.1
marisa-trie==1.2.1
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.9.2
matplotlib-inline==0.1.7
mdurl==0.1.2
ml-dtypes==0.4.1
murmurhash==1.0.11
namex==0.0.8
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.1
numpy==2.0.2
opt_einsum==3.4.0
optree==0.13.

In [3]:
data = pd.read_csv('AdvertAndSales.csv')
data.head()

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.91341,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.24634
4,15.0,8.437408,1.405998,Micro,56.594181


In [4]:
data.isnull().sum().sort_values(ascending=False)

TV              10
Social Media     6
Sales            6
Radio            4
Influencer       0
dtype: int64

In [5]:
for i in data.columns:
    if ((data[i].isnull().sum() / len(data)) * 100).round(2) < 30:
        if data[i].dtypes == 'O':
            data[i].fillna(data[i].mode()[0], inplace = True)
        else:
            data[i].fillna(data[i].median(), inplace = True)
    else:
        data.drop(i, axis = 1, inplace = True)

data.isnull().sum().sort_values(ascending=False)

TV              0
Radio           0
Social Media    0
Influencer      0
Sales           0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TV            4572 non-null   float64
 1   Radio         4572 non-null   float64
 2   Social Media  4572 non-null   float64
 3   Influencer    4572 non-null   object 
 4   Sales         4572 non-null   float64
dtypes: float64(4), object(1)
memory usage: 178.7+ KB


<h3><b>Data Preprocessing<hr></b></h3>

In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

df = data.copy()

for i in df.drop("Sales", axis=1).columns:
    if i in df.select_dtypes(include= 'number').columns:
        scaler = StandardScaler()
        df[i] = scaler.fit_transform(df[[i]])
        joblib.dump(scaler, open(f'{i}.scaler.pkl', 'wb'))
    else:
        encoder = LabelEncoder()
        df[i] = encoder.fit_transform(df[[i]])
        joblib.dump(encoder, open(f'{i}.encoder.pkl', 'wb'))
df.head()

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,-1.458765,-1.198745,-0.187979,1,54.732757
1,-1.573736,-0.922522,-0.413405,1,46.677897
2,-0.500678,-0.235083,-0.185524,1,150.177829
3,1.108909,1.226255,1.627637,1,298.24634
4,-1.497089,-1.005275,-0.867304,2,56.594181


<h3><b>Modelling<hr></b></h3>

In [14]:
x = df.drop('Sales', axis=1)
y = df.Sales


from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.25, random_state = 40)

trains = pd.concat([xtrain, ytrain], axis=1)
tests = pd.concat([xtest, ytest], axis=1)

display(xtest.head())
display(ytest.head ())

Unnamed: 0,TV,Radio,Social Media,Influencer
1740,-0.19409,-0.428411,-1.139247,1
4416,-1.113854,-0.865625,-1.275407,0
1226,-0.270737,-0.06317,0.121957,3
2848,-0.653972,-1.300665,0.462995,0
1826,-1.458765,-1.831002,0.097776,1


1740    178.601661
4416     92.848374
1226    166.124099
2848    134.445973
1826     56.016896
Name: Sales, dtype: float64

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

model = RandomForestRegressor()

model.fit(xtrain, ytrain)
val = model.predict(xtrain)

print(f"Model Cross Validation is: {round(r2_score(val, ytrain)*100, 2)}\n")

pred = model.predict(xtest)
print(f"Model Performance is: {round(r2_score(ytest, pred)*100, 2)}")

Model Cross Validation is: 99.92

Model Performance is: 99.24


In [20]:
# save the model

joblib.dump(model, open(f'advertModel.pkl', 'wb'))

