# Projeto Big Data & Data Mining

## Consumo de propagandas: impacto dos anúncios na decisão de compra dos consumidores

### Imports

In [1]:
import pandas as pd
import numpy as np

### Armazenamento

In [2]:
df = pd.read_csv("../Data/df_Social_Media_Tratado.csv", parse_dates=["Date"])
print("Dataset carregado com shape:", df.shape)

Dataset carregado com shape: (300000, 18)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Campaign_ID       300000 non-null  int64         
 1   Campaign_Goal     300000 non-null  object        
 2   Duration_in_Days  300000 non-null  int64         
 3   Channel_Used      300000 non-null  object        
 4   Conversion_Rate   300000 non-null  float64       
 5   Acquisition_Cost  300000 non-null  object        
 6   ROI               300000 non-null  float64       
 7   Location          300000 non-null  object        
 8   Language          300000 non-null  object        
 9   Clicks            300000 non-null  int64         
 10  Impressions       300000 non-null  int64         
 11  Engagement_Score  300000 non-null  int64         
 12  Customer_Segment  300000 non-null  object        
 13  Date              300000 non-null  datetime64[ns]
 14  Comp

In [4]:
df.head(20)

Unnamed: 0,Campaign_ID,Campaign_Goal,Duration_in_Days,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Location,Language,Clicks,Impressions,Engagement_Score,Customer_Segment,Date,Company,_Target_Audience,Age_Min,Age_Max
0,529013,Product Launch,15,Instagram,0.15,$500.00,5.79,Las Vegas,Spanish,500,3000,7,Health,2022-02-25,Aura Align,Men,35,44
1,275352,Market Expansion,15,Facebook,0.01,$500.00,7.21,Los Angeles,French,500,3000,5,Home,2022-05-12,Hearth Harmony,Women,45,60
2,692322,Product Launch,15,Instagram,0.08,$500.00,0.43,Austin,Spanish,500,3000,9,Technology,2022-06-19,Cyber Circuit,Men,45,60
3,675757,Increase Sales,15,Pinterest,0.03,$500.00,0.909824,Miami,Spanish,293,1937,1,Health,2022-09-08,Well Wish,Men,25,34
4,535900,Market Expansion,15,Pinterest,0.13,$500.00,1.422828,Austin,French,293,1937,1,Home,2022-08-24,Hearth Harmony,Men,45,60
5,323031,Product Launch,15,Facebook,0.02,$500.00,6.9,Austin,Spanish,500,3001,10,Technology,2022-01-15,Cyber Circuit,Women,35,44
6,727501,Increase Sales,15,Pinterest,0.1,$500.00,0.67924,Los Angeles,French,293,1938,1,Home,2022-10-30,Space Spruce,All Ages,18,60
7,289553,Product Launch,15,Pinterest,0.1,$500.00,1.009922,Miami,English,293,1938,1,Food,2022-07-23,Feast Flavors,Men,25,34
8,942511,Market Expansion,15,Twitter,0.14,$500.00,1.19,Los Angeles,French,501,3003,8,Food,2022-08-14,Culinary Quest,Men,35,44
9,255854,Increase Sales,15,Facebook,0.04,$500.00,1.65,Miami,English,501,3003,9,Food,2022-03-23,Gourmet Grove,Women,45,60


### Análise

#### Script

In [5]:
# 1. Prever Retorno Sobre Investimento (ROI)
# 2. Prever A Taxa de Conversão
# 3. Prever o canal de marketing que melhor teve performance (talvez o top 20 ou 30%)

#### 01. ROI

Seleção de variáveis

In [6]:
target = "ROI"

features = [
    "Duration_in_Days",
    "Conversion_Rate",
    "Clicks",
    "Impressions",
    "Engagement_Score",
    "Age_Min",
    "Age_Max",
    "Channel_Used",
    "Customer_Segment",
    "Language"
]

In [7]:
df_model = df[features + [target]].copy()

In [8]:
df_model = pd.get_dummies(df_model, drop_first=True)
print("Shape após dummies:", df_model.shape)

Shape após dummies: (300000, 17)


Treinamento do modelo

In [9]:
X = df_model.drop(columns=[target])
y = df_model[target]

In [10]:
import sys
print(sys.executable)

!{sys.executable} -m pip --version
!{sys.executable} -m pip install scikit-learn

c:\Users\Cleitin\BDDM\Consumo_propagandas\venv\Scripts\python.exe
pip 25.3 from c:\Users\Cleitin\BDDM\Consumo_propagandas\venv\Lib\site-packages\pip (python 3.13)



In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.25, 
    random_state=42
)

In [12]:
print("Treino:", X_train.shape, " | Teste:", X_test.shape)

Treino: (225000, 16)  | Teste: (75000, 16)


Treino do Modelo

In [13]:
from sklearn.linear_model import LinearRegression
modelo = LinearRegression()
modelo.fit(X_train, y_train)
print("Sucesso!!!")

Sucesso!!!


In [14]:
y_pred = modelo.predict(X_test)

In [15]:
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n==== MÉTRICAS DO MODELO ====")
print("MSE :", mse)
print("RMSE:", np.sqrt(mse))
print("R²  :", r2)


==== MÉTRICAS DO MODELO ====
MSE : 4.04061555616764
RMSE: 2.0101282437117387
R²  : 0.33130995816040487


In [16]:
coef = pd.DataFrame({
    "variavel": X.columns,
    "coeficiente": modelo.coef_
})

print("\nCoeficientes ordenados:")
print(coef.sort_values("coeficiente", ascending=False).head(15))


Coeficientes ordenados:
                       variavel   coeficiente
15             Language_Spanish  2.906368e-02
7        Channel_Used_Instagram  2.663886e-02
14              Language_French  2.634811e-02
9          Channel_Used_Twitter  1.798408e-02
13  Customer_Segment_Technology  1.616180e-02
11      Customer_Segment_Health  1.566556e-02
12        Customer_Segment_Home  1.387758e-02
10        Customer_Segment_Food  6.133340e-03
0              Duration_in_Days  4.654058e-04
6                       Age_Max  4.364792e-04
2                        Clicks  2.027028e-06
3                   Impressions -9.883280e-07
4              Engagement_Score -6.419417e-04
5                       Age_Min -9.386570e-04
1               Conversion_Rate -3.918335e-02
