In [1]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
import re
from sklearn.preprocessing import LabelEncoder # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.metrics import classification_report # type: ignore

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("data/Social_Media_Advertising.csv")

## Análise Exploratória

Dataset got from kaggle [Social Media Advertising](https://www.kaggle.com/datasets/jsonk11/social-media-advertising-dataset/data)

In [3]:
df.head()

Unnamed: 0,Campaign_ID,Target_Audience,Campaign_Goal,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Location,Language,Clicks,Impressions,Engagement_Score,Customer_Segment,Date,Company
0,529013,Men 35-44,Product Launch,15 Days,Instagram,0.15,$500.00,5.79,Las Vegas,Spanish,500,3000,7,Health,2022-02-25,Aura Align
1,275352,Women 45-60,Market Expansion,15 Days,Facebook,0.01,$500.00,7.21,Los Angeles,French,500,3000,5,Home,2022-05-12,Hearth Harmony
2,692322,Men 45-60,Product Launch,15 Days,Instagram,0.08,$500.00,0.43,Austin,Spanish,500,3000,9,Technology,2022-06-19,Cyber Circuit
3,675757,Men 25-34,Increase Sales,15 Days,Pinterest,0.03,$500.00,0.909824,Miami,Spanish,293,1937,1,Health,2022-09-08,Well Wish
4,535900,Men 45-60,Market Expansion,15 Days,Pinterest,0.13,$500.00,1.422828,Austin,French,293,1937,1,Home,2022-08-24,Hearth Harmony


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Campaign_ID       300000 non-null  int64  
 1   Target_Audience   300000 non-null  object 
 2   Campaign_Goal     300000 non-null  object 
 3   Duration          300000 non-null  object 
 4   Channel_Used      300000 non-null  object 
 5   Conversion_Rate   300000 non-null  float64
 6   Acquisition_Cost  300000 non-null  object 
 7   ROI               300000 non-null  float64
 8   Location          300000 non-null  object 
 9   Language          300000 non-null  object 
 10  Clicks            300000 non-null  int64  
 11  Impressions       300000 non-null  int64  
 12  Engagement_Score  300000 non-null  int64  
 13  Customer_Segment  300000 non-null  object 
 14  Date              300000 non-null  object 
 15  Company           300000 non-null  object 
dtypes: float64(2), int64

In [5]:
df = df.drop("Language", axis=1)
df = df.drop("Company", axis=1)
df = df.drop("Campaign_ID", axis=1)
df = df.drop("Date", axis=1)
df = df[df['Target_Audience'] != "All Ages"]
df = df.dropna()

In [6]:
def to_int(string):
    numeros = re.findall(r'\d+', string)
    return int(numeros[0]) if numeros else 0

In [7]:
df['Duration'] = df['Duration'].apply(to_int)

In [8]:
df['Acquisition_Cost'] = df['Acquisition_Cost'].apply(to_int)

In [9]:
df['ROI'] = df['ROI'].apply(round, args=(2,))

In [10]:
df[['Gender', 'Age_range']] = df['Target_Audience'].str.split(expand=True)
df[['Age_min', 'Age_max']] = df['Age_range'].str.split("-", expand=True)
df = df.drop('Age_range', axis=1)
df = df.drop('Target_Audience', axis=1)

In [11]:
le = LabelEncoder()

In [12]:
df['Channel_Used'] = le.fit_transform(df['Channel_Used'])

In [13]:
gender_dummies = pd.get_dummies(df['Gender'], dtype=int)
campaign_dummies = pd.get_dummies(df['Campaign_Goal'], dtype=int)
location_dummies = pd.get_dummies(df['Location'], dtype=int)
segment_dummies = pd.get_dummies(df['Customer_Segment'], dtype=int)

In [14]:
df = pd.concat([df, gender_dummies], axis=1)
df = pd.concat([df, campaign_dummies], axis=1)
df = pd.concat([df, location_dummies], axis=1)
df = pd.concat([df, segment_dummies], axis=1)

In [15]:
df = df.drop('Gender', axis=1)
df = df.drop('Campaign_Goal', axis=1)
df = df.drop('Location', axis=1)
df = df.drop('Customer_Segment', axis=1)

In [16]:
df.head()

Unnamed: 0,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Clicks,Impressions,Engagement_Score,Age_min,Age_max,...,Austin,Las Vegas,Los Angeles,Miami,New York,Fashion,Food,Health,Home,Technology
0,15,1,0.15,500,5.79,500,3000,7,35,44,...,0,1,0,0,0,0,0,1,0,0
1,15,0,0.01,500,7.21,500,3000,5,45,60,...,0,0,1,0,0,0,0,0,1,0
2,15,1,0.08,500,0.43,500,3000,9,45,60,...,1,0,0,0,0,0,0,0,0,1
3,15,2,0.03,500,0.91,293,1937,1,25,34,...,0,0,0,1,0,0,0,1,0,0
4,15,2,0.13,500,1.42,293,1937,1,45,60,...,1,0,0,0,0,0,0,0,1,0


Estarei aqui confirmando se há algum tipo de overfitting para cada tipo de canal usado "Channel used"

In [17]:
df_0 = df[df['Channel_Used'] == 0]
df_0

Unnamed: 0,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Clicks,Impressions,Engagement_Score,Age_min,Age_max,...,Austin,Las Vegas,Los Angeles,Miami,New York,Fashion,Food,Health,Home,Technology
1,15,0,0.01,500,7.21,500,3000,5,45,60,...,0,0,1,0,0,0,0,0,1,0
5,15,0,0.02,500,6.90,500,3001,10,35,44,...,1,0,0,0,0,0,0,0,0,1
9,15,0,0.04,500,1.65,501,3003,9,45,60,...,0,0,0,1,0,0,1,0,0,0
14,15,0,0.13,500,0.89,501,3005,6,25,34,...,1,0,0,0,0,0,0,1,0,0
17,15,0,0.05,500,2.16,502,3006,8,25,34,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299963,60,0,0.03,15000,0.30,39995,119985,3,25,34,...,0,1,0,0,0,0,0,1,0,0
299983,60,0,0.06,15000,5.38,39997,119993,2,45,60,...,0,0,0,1,0,0,0,0,0,1
299989,60,0,0.14,15000,3.05,39998,119996,9,25,34,...,0,0,0,0,1,0,0,0,1,0
299991,60,0,0.06,15000,6.54,39998,119996,3,25,34,...,0,0,0,1,0,0,0,1,0,0


In [18]:
df_1 = df[df['Channel_Used'] == 1]
df_1

Unnamed: 0,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Clicks,Impressions,Engagement_Score,Age_min,Age_max,...,Austin,Las Vegas,Los Angeles,Miami,New York,Fashion,Food,Health,Home,Technology
0,15,1,0.15,500,5.79,500,3000,7,35,44,...,0,1,0,0,0,0,0,1,0,0
2,15,1,0.08,500,0.43,500,3000,9,45,60,...,1,0,0,0,0,0,0,0,0,1
10,15,1,0.08,500,2.81,501,3003,8,45,60,...,0,1,0,0,0,0,0,0,0,1
13,15,1,0.09,500,0.94,501,3005,8,18,24,...,0,1,0,0,0,0,0,0,0,1
15,15,1,0.06,500,7.32,501,3005,4,25,34,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299987,60,1,0.14,15000,2.09,39998,119995,3,45,60,...,0,0,0,1,0,0,0,1,0,0
299992,60,1,0.09,15000,4.25,39999,119997,2,18,24,...,0,0,0,0,1,0,0,0,1,0
299995,60,1,0.14,15000,4.95,39999,119998,7,25,34,...,0,0,1,0,0,0,0,0,0,1
299996,60,1,0.08,15000,3.41,39999,119998,8,25,34,...,1,0,0,0,0,0,0,0,0,1


In [19]:
df_2 = df[df['Channel_Used'] == 2]
df_2

Unnamed: 0,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Clicks,Impressions,Engagement_Score,Age_min,Age_max,...,Austin,Las Vegas,Los Angeles,Miami,New York,Fashion,Food,Health,Home,Technology
3,15,2,0.03,500,0.91,293,1937,1,25,34,...,0,0,0,1,0,0,0,1,0,0
4,15,2,0.13,500,1.42,293,1937,1,45,60,...,1,0,0,0,0,0,0,0,1,0
7,15,2,0.10,500,1.01,293,1938,1,25,34,...,0,0,0,1,0,0,1,0,0,0
29,15,2,0.03,500,0.32,295,1944,1,45,60,...,0,1,0,0,0,1,0,0,0,0
30,15,2,0.04,500,0.94,295,1944,1,35,44,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299966,60,2,0.13,15000,0.64,23490,77468,1,25,34,...,0,0,0,1,0,0,1,0,0,0
299974,60,2,0.06,15000,1.04,23490,77470,1,35,44,...,0,0,0,1,0,0,0,0,0,1
299976,60,2,0.02,15000,1.35,23490,77471,1,35,44,...,0,0,0,0,1,1,0,0,0,0
299994,60,2,0.11,15000,0.48,23492,77475,1,45,60,...,0,0,0,1,0,0,0,1,0,0


In [20]:
df_3 = df[df['Channel_Used'] == 3]
df_3

Unnamed: 0,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Clicks,Impressions,Engagement_Score,Age_min,Age_max,...,Austin,Las Vegas,Los Angeles,Miami,New York,Fashion,Food,Health,Home,Technology
8,15,3,0.14,500,1.19,501,3003,8,35,44,...,0,0,1,0,0,0,1,0,0,0
11,15,3,0.05,500,5.02,501,3004,8,25,34,...,1,0,0,0,0,1,0,0,0,0
16,15,3,0.03,500,4.08,502,3006,10,18,24,...,0,0,1,0,0,0,0,1,0,0
18,15,3,0.02,500,6.82,502,3007,9,18,24,...,0,0,0,0,1,0,0,0,1,0
19,15,3,0.04,500,5.26,502,3007,10,25,34,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299985,60,3,0.05,15000,2.25,39998,119994,5,45,60,...,0,0,1,0,0,0,0,1,0,0
299986,60,3,0.09,15000,4.92,39998,119994,1,35,44,...,0,0,0,0,1,0,1,0,0,0
299988,60,3,0.09,15000,1.17,39998,119995,1,45,60,...,0,0,0,0,1,0,0,0,1,0
299990,60,3,0.11,15000,2.58,39998,119996,10,45,60,...,0,0,1,0,0,1,0,0,0,0


Separação dos dados para treinamento do modelo

In [21]:
X = df.drop(['Channel_Used', 'ROI', 'Conversion_Rate', 'Engagement_Score'], axis=1)
y = df['Channel_Used']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=14)

# ML utilizando Random Forest Classifier

In [23]:
rfc = RandomForestClassifier()

In [24]:
rfc.fit(X_train, y_train)

### Métrica

In [25]:
y_pred_rfc = rfc.predict(X_test)
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.33      0.32      0.33     16757
           1       0.33      0.33      0.33     16638
           2       0.98      0.98      0.98     16680
           3       0.32      0.32      0.32     16564

    accuracy                           0.49     66639
   macro avg       0.49      0.49      0.49     66639
weighted avg       0.49      0.49      0.49     66639

