In [None]:
%load_ext autoreload
%autoreload 2

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import logging
from google.cloud import bigquery


In [None]:
# Retrieve the UCI Bank Marketing dataset from Kaggle via the project-local
# `kaggle_api` helper module.
# NOTE(review): `df` is presumably a pandas DataFrame (the next cell calls
# `.copy()` and `.to_csv()` on it) — confirm against the helper.
from kaggle_api import download_bank_marketing_dataset
df = download_bank_marketing_dataset()

In [None]:
# Save the UCI dataset in the raw folder.
# The CSV is written from a copy bound to `uci_bank_marketing`, so `df` keeps
# referring to the downloaded object. `index=False` leaves the row index out
# of the file.
uci_bank_marketing = df.copy()
uci_bank_marketing.to_csv("../data/raw/uci_bank_marketing.csv", index=False)



In [None]:
# Reload the raw UCI dataset from its CSV snapshot (rebinding
# `uci_bank_marketing`) and display the frame.
uci_bank_marketing = pd.read_csv("../data/raw/uci_bank_marketing.csv", sep=",")
uci_bank_marketing

### 1.1. Explore and clean the UCI dataset

In [None]:
# Run the project's exploration helper on the raw UCI data.
# NOTE(review): what `explore_dataset` reports is defined in functions.py and
# is not visible from this notebook.
from functions import explore_dataset
explore_dataset(uci_bank_marketing)

In [None]:
# Reconstruct Campaign Timeline
# NOTE(review): judging by its name, the helper derives a year from the order
# in which the month values appear — confirm in functions.py.
# Caution: this cell rebinds `uci_bank_marketing` to the helper's output, so
# re-running it without first re-running the CSV load cell feeds the helper
# its own previous result.
from functions import add_year_from_month_sequence
uci_bank_marketing = add_year_from_month_sequence(uci_bank_marketing)
uci_bank_marketing

In [None]:
# Clean the UCI dataset with the project helper and display the result.
# The output is bound to a new name, `uci_bank_marketing_cleaned`, which the
# later campaign-table and merge cells read.
from functions import clean_uci_dataset
uci_bank_marketing_cleaned = clean_uci_dataset(uci_bank_marketing)
uci_bank_marketing_cleaned

In [None]:
##ecb_interest_rates = pd.read_csv("../data/raw/ecb_bank_interest_rates.csv", sep=",")
#ecb_interest_rates.info()

## Web Scraping

In [None]:
# Create the campaign dimension table from the cleaned UCI data and preview it.
# NOTE(review): `campaigns_df` is not referenced again in the visible cells;
# a later cell calls `generate_campaign_table` a second time to build
# `campaign_dim`.
from functions import generate_campaign_table
campaigns_df = generate_campaign_table(uci_bank_marketing_cleaned)
campaigns_df.head()

In [None]:
# Configure the root logger at INFO level before the scrapers are imported and run.
logging.basicConfig(level=logging.INFO)

from scraping import scrape_boursorama_campaign_metadata, scrape_nickel_campaign_metadata

# Scrape each source separately; the per-source frames stay available for inspection.
boursorama_df = scrape_boursorama_campaign_metadata()
nickel_df = scrape_nickel_campaign_metadata()

# Stack both sources into a single frame with a fresh 0..n-1 index and preview it.
scraped_frames = [boursorama_df, nickel_df]
scraped_df = pd.concat(scraped_frames, ignore_index=True)
scraped_df.head()

In [None]:
# Freeze the scraped data to a CSV snapshot to prevent the "moving target"
# argument (scraped content can differ from one run to the next).
# NOTE(review): this file is written to the notebook's working directory,
# whereas the other datasets are written under ../data/ — confirm this
# location is intended.
scraped_df.to_csv("campaign_metadata_external.csv", index=False)

In [None]:
# Build the campaign dimension from the cleaned UCI data.
campaign_dim = generate_campaign_table(uci_bank_marketing_cleaned)

# Drop the empty/placeholder channel column (if present) so the merge below
# yields a single `campaign_channel` column instead of _x/_y suffixed copies.
campaign_dim = campaign_dim.drop(columns=['campaign_channel'], errors='ignore')

# Reduce the scraped metadata to one channel per campaign name (first scraped
# occurrence wins). Without this, a campaign that was scraped more than once
# would duplicate its row in the dimension table during the left merge.
channel_lookup = (
    scraped_df[["campaign_name", "campaign_channel"]]
    .drop_duplicates(subset="campaign_name", keep="first")
)

# Left merge keeps every campaign; `validate` makes pandas raise if the lookup
# ever contains repeated keys, so a fan-out cannot happen silently.
campaign_dim = campaign_dim.merge(
    channel_lookup,
    on="campaign_name",
    how="left",
    validate="many_to_one",
)

# Campaigns with no scraped match get an explicit "Unknown" channel.
campaign_dim["campaign_channel"] = campaign_dim["campaign_channel"].fillna("Unknown")

In [None]:
# Clean the campaign dimension with the project helper and preview the result.
# NOTE(review): a later cell uses the `.dt` accessor on `campaign_start_date`,
# so the helper is expected to return that column as a datetime dtype — confirm.
from functions import clean_campaign_data
campaign_dim_cleaned = clean_campaign_data(campaign_dim)
campaign_dim_cleaned.head()

### ECB file

In [None]:
# Download the ECB data through the project helper and display what it returns.
# NOTE(review): given its name, the helper presumably also writes a CSV (the
# next cell reads ../data/raw/ecb_market_rates.csv) — confirm the output path
# in ecb_download_function.py.
from ecb_download_function import download_ecb_to_csv
ecb_interest_rates = download_ecb_to_csv()
ecb_interest_rates

In [None]:
# Load the ECB data as a flat file and preview it. This rebinds
# `ecb_interest_rates`, replacing the value returned by the download helper
# in the previous cell.
ecb_interest_rates = pd.read_csv("../data/raw/ecb_market_rates.csv")
ecb_interest_rates.head()

In [None]:
# Clean the ECB market data with the project helper and print a preview.
# NOTE(review): the aggregation cell that follows reads `year`, `month_num`
# and `ecb_rate` from the result, so the helper is expected to provide those
# columns — verify in functions.py.
from functions import clean_ecb_market_data
ecb_interest_rates_cleaned = clean_ecb_market_data(ecb_interest_rates)
print(ecb_interest_rates_cleaned.head())


In [None]:
# Collapse the ECB series to one row per (year, month_num) holding the mean
# `ecb_rate` of that month; `reset_index` turns the group keys back into columns.
ecb_monthly_agg = (
    ecb_interest_rates_cleaned
    .groupby(['year', 'month_num'])['ecb_rate']
    .mean()
    .reset_index()
)

# Report the row counts before and after aggregation.
print(f"Original ECB Rows: {ecb_interest_rates_cleaned.shape[0]}")
print(f"Aggregated ECB Rows: {ecb_monthly_agg.shape[0]}")



In [None]:
# Derive the `year` / `month_num` merge keys from each campaign's start date.
# The columns are added to `campaign_dim_cleaned` itself.
campaign_start = campaign_dim_cleaned['campaign_start_date']
campaign_dim_cleaned['year'] = campaign_start.dt.year
campaign_dim_cleaned['month_num'] = campaign_start.dt.month



In [None]:
# Keep a single channel per (year, month_num) — the first one encountered in
# each group — so the later merge on those keys cannot duplicate UCI rows.
campaign_monthly_agg = (
    campaign_dim_cleaned
    .groupby(['year', 'month_num'])['campaign_channel']
    .first()
    .reset_index()
)

campaign_monthly_agg.head()


In [None]:
# 1. Enrich every UCI record with the monthly ECB rate.
# A left join keeps all UCI rows; the right-hand table is the monthly
# aggregate, not the raw ECB rows.
merged_df = pd.merge(
    uci_bank_marketing_cleaned,
    ecb_monthly_agg,
    how='left',
    on=['year', 'month_num'],
)

merged_df.head()

In [None]:
# 2. Attach the per-month campaign channel to the ECB-enriched records.
# Again a left join on (year, month_num), so no UCI row is dropped.
final_master_df = pd.merge(
    merged_df,
    campaign_monthly_agg,
    how='left',
    on=['year', 'month_num'],
)

final_master_df.head()

In [None]:
# --- VALIDATION ---
# Both enrichment steps are left joins against tables aggregated to one row
# per (year, month_num), so the master table must contain exactly one row per
# cleaned UCI record. Fail loudly if a join ever fans out or drops rows,
# instead of relying on a reader to eyeball the printed shape.
expected_rows = len(uci_bank_marketing_cleaned)
assert len(final_master_df) == expected_rows, (
    f"Row count changed during merges: expected {expected_rows}, got {len(final_master_df)}"
)
print(f"Final Dataset Shape: {final_master_df.shape}")
# TARGET: This should now be exactly (45211, 21) or very close to it.

In [None]:
# 3. Validation checks: overall shape, plus how many rows found no match in
# the ECB and campaign lookups (left-join misses show up as NaN).
print(
    f"Final Dataset Shape: {final_master_df.shape}",
    f"Missing ECB Rates: {final_master_df['ecb_rate'].isna().sum()}",
    f"Missing Campaign Channels: {final_master_df['campaign_channel'].isna().sum()}",
    sep="\n",
)

# Preview the enriched records.
final_master_df.head()

In [None]:
# Re-run the project's exploration helper, this time on the merged master table.
from functions import explore_dataset
explore_dataset(final_master_df)

### Inspect columns and export the cleaned datasets

In [None]:

# Print the column names of every intermediate and final table, one line per table.
column_report = [
    ("Columns in scraped_df:", scraped_df),
    ("Columns in campaign_dim_cleaned :", campaign_dim_cleaned),
    ("Columns in UCI after the cleaning:", uci_bank_marketing_cleaned),
    ("Columns in ecb_interest_rates_cleaned:", ecb_interest_rates_cleaned),
    ("Columns in final_master after merged:", final_master_df),
]
for label, frame in column_report:
    print(label, frame.columns.tolist())



In [None]:
# Persist the cleaned inputs and the merged master table under ../data/clean/.
from pathlib import Path

clean_dir = Path("../data/clean")
# `to_csv` does not create missing folders, so make sure the target exists.
clean_dir.mkdir(parents=True, exist_ok=True)

uci_bank_marketing_cleaned.to_csv(clean_dir / "uci_bank_marketing_cleaned.csv", index=False)
campaign_dim_cleaned.to_csv(clean_dir / "campaign_dim_cleaned.csv", index=False)
ecb_interest_rates_cleaned.to_csv(clean_dir / "ecb_interest_rates_cleaned.csv", index=False)
# The file name previously started with a stray space (" Marketing_Campaign_final.csv"),
# which produced a file that is awkward to reference from shells and other tools.
final_master_df.to_csv(clean_dir / "Marketing_Campaign_final.csv", index=False)