In [16]:
import pandas as pd
import zipfile
import os
from pathlib import Path

# Input and output directories (relative to the current working directory)
input_dir = Path("..") / "files" / "input"
output_dir = Path("..") / "files" / "output"
# Create the output folder together with any missing parents; an already
# existing folder is not an error.
output_dir.mkdir(exist_ok=True, parents=True)

# Output files
client_file = output_dir.joinpath("client.csv")
campaign_file = output_dir.joinpath("campaign.csv")
economics_file = output_dir.joinpath("economics.csv")

# Funciones auxiliares
def clean_job(job):
    """Normalize a job label: remove every "." and turn every "-" into "_".

    For example, "admin." becomes "admin" and "blue-collar" becomes
    "blue_collar".
    """
    # One translation table handles both substitutions in a single pass.
    substitutions = str.maketrans({".": None, "-": "_"})
    return job.translate(substitutions)

def clean_education(education):
    """Normalize an education label.

    The literal "unknown" is mapped to the pandas missing-value marker
    (pd.NA); any other label has each "." replaced by "_".
    """
    if education == "unknown":
        return pd.NA
    return education.replace(".", "_")

def to_binary(value, true_value="yes"):
    """Return 1 when value equals true_value (default "yes"), otherwise 0."""
    if value == true_value:
        return 1
    return 0

# Month abbreviation -> month number. Built once at definition time instead of
# on every call, because the function is invoked once per data row.
_MONTH_NUMBERS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12
}


def format_last_contact_day(day, month):
    """Build a "2022-MM-DD" date string from a day and a month abbreviation.

    Args:
        day: Day of the month; any value accepted by int() (e.g. 3 or "3").
        month: Three-letter English month abbreviation. Matching ignores
            letter case and surrounding whitespace.

    Returns:
        str: The date with the year fixed to 2022 and month/day zero-padded
        to two digits, e.g. "2022-05-03".

    Raises:
        KeyError: If month is not one of the twelve known abbreviations.

    Note:
        The day is not checked against the length of the month.
    """
    month_number = _MONTH_NUMBERS[month.strip().lower()]
    return f"2022-{month_number:02d}-{int(day):02d}"

# Main process
all_data = []

# glob() does not guarantee any ordering; sorting the paths keeps the row order
# of the combined table reproducible across runs and machines.
for zip_file in sorted(input_dir.glob("*.csv.zip")):
    with zipfile.ZipFile(zip_file, 'r') as z:
        # Every member of the archive is read as a CSV file.
        for file_name in z.namelist():
            with z.open(file_name) as f:
                df = pd.read_csv(f)
                all_data.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing was read.
if not all_data:
    raise ValueError(f"No CSV data found in '*.csv.zip' archives under {input_dir}")

# Concatenate all DataFrames into one table with a fresh 0..n-1 index
combined_data = pd.concat(all_data, ignore_index=True)

# Build and save client.csv
client_df = combined_data[[
    "client_id", "age", "job", "marital", "education", "credit_default", "mortgage"
]].copy()

client_df["job"] = client_df["job"].apply(clean_job)
client_df["education"] = client_df["education"].apply(clean_education)
client_df["credit_default"] = client_df["credit_default"].apply(to_binary)
client_df["mortgage"] = client_df["mortgage"].apply(to_binary)

client_df.to_csv(client_file, index=False)

# Build and save campaign.csv
campaign_df = combined_data[[
    "client_id", "number_contacts", "contact_duration",
    "previous_campaign_contacts", "previous_outcome",
    "campaign_outcome", "day", "month"
]].copy()

# Reuse the shared helper rather than re-implementing it with inline lambdas:
# previous_outcome is 1 only for "success", campaign_outcome is 1 only for "yes".
campaign_df["previous_outcome"] = campaign_df["previous_outcome"].apply(
    to_binary, true_value="success"
)
campaign_df["campaign_outcome"] = campaign_df["campaign_outcome"].apply(to_binary)

# Pair the two columns directly; this avoids the per-row Series construction
# of DataFrame.apply(axis=1) while producing the same strings.
campaign_df["last_contact_day"] = [
    format_last_contact_day(day, month)
    for day, month in zip(campaign_df["day"], campaign_df["month"])
]

campaign_df = campaign_df.drop(columns=["day", "month"])
campaign_df.to_csv(campaign_file, index=False)

# Build and save economics.csv
economics_df = combined_data[[
    "client_id", "cons_price_idx", "euribor_three_months"
]].copy()

economics_df.to_csv(economics_file, index=False)

print("Archivos generados correctamente en la carpeta files/output/")

Archivos generados correctamente en la carpeta files/output/


In [8]:
import pandas as pd
import zipfile
import os
from pathlib import Path

# Input and output directories (relative to the current working directory)
input_dir = Path("..") / "files" / "input"
output_dir = Path("..") / "files" / "output"
# Create the output folder together with any missing parents; an already
# existing folder is not an error.
output_dir.mkdir(exist_ok=True, parents=True)

# Output files
client_file = output_dir.joinpath("client.csv")
campaign_file = output_dir.joinpath("campaign.csv")
economics_file = output_dir.joinpath("economics.csv")

# Funciones auxiliares
def clean_job(job):
    """Normalize a job label: remove every "." and turn every "-" into "_".

    For example, "admin." becomes "admin" and "blue-collar" becomes
    "blue_collar".
    """
    # One translation table handles both substitutions in a single pass.
    substitutions = str.maketrans({".": None, "-": "_"})
    return job.translate(substitutions)

def clean_education(education):
    """Normalize an education label.

    The literal "unknown" is mapped to the pandas missing-value marker
    (pd.NA); any other label has each "." replaced by "_".
    """
    if education == "unknown":
        return pd.NA
    return education.replace(".", "_")

def to_binary(value, true_value="yes"):
    """Return 1 when value equals true_value (default "yes"), otherwise 0."""
    if value == true_value:
        return 1
    return 0

# Month abbreviation -> month number. Built once at definition time instead of
# on every call, because the function is invoked once per data row.
_MONTH_NUMBERS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12
}


def format_last_contact_day(day, month):
    """Build a "2022-MM-DD" date string from a day and a month abbreviation.

    Args:
        day: Day of the month; any value accepted by int() (e.g. 3 or "3").
        month: Three-letter English month abbreviation. Matching ignores
            letter case and surrounding whitespace.

    Returns:
        str: The date with the year fixed to 2022 and month/day zero-padded
        to two digits, e.g. "2022-05-03".

    Raises:
        KeyError: If month is not one of the twelve known abbreviations.

    Note:
        The day is not checked against the length of the month.
    """
    month_number = _MONTH_NUMBERS[month.strip().lower()]
    return f"2022-{month_number:02d}-{int(day):02d}"

# Main process
all_data = []

# glob() does not guarantee any ordering; sorting the paths keeps the row order
# of the combined table reproducible across runs and machines.
for zip_file in sorted(input_dir.glob("*.csv.zip")):
    with zipfile.ZipFile(zip_file, 'r') as z:
        # Every member of the archive is read as a CSV file.
        for file_name in z.namelist():
            with z.open(file_name) as f:
                df = pd.read_csv(f)
                all_data.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing was read.
if not all_data:
    raise ValueError(f"No CSV data found in '*.csv.zip' archives under {input_dir}")

# Concatenate all DataFrames into one table with a fresh 0..n-1 index
combined_data = pd.concat(all_data, ignore_index=True)

# Last expression of the cell: rendered as the cell output
combined_data


Unnamed: 0.1,Unnamed: 0,client_id,age,job,marital,education,credit_default,mortgage,month,day,contact_duration,number_contacts,previous_campaign_contacts,previous_outcome,cons_price_idx,euribor_three_months,campaign_outcome
0,0,0,56,housemaid,married,basic.4y,no,no,may,13,261,1,0,nonexistent,93.994,4.857,no
1,1,1,57,services,married,high.school,unknown,no,may,19,149,1,0,nonexistent,93.994,4.857,no
2,2,2,37,services,married,high.school,no,yes,may,23,226,1,0,nonexistent,93.994,4.857,no
3,3,3,40,admin.,married,basic.6y,no,no,may,27,151,1,0,nonexistent,93.994,4.857,no
4,4,4,56,services,married,high.school,no,no,may,3,307,1,0,nonexistent,93.994,4.857,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,4113,41183,73,retired,married,professional.course,no,yes,nov,30,334,1,0,nonexistent,94.767,1.028,yes
41184,4114,41184,46,blue-collar,married,professional.course,no,no,nov,6,383,1,0,nonexistent,94.767,1.028,no
41185,4115,41185,56,retired,married,university.degree,no,yes,nov,24,189,2,0,nonexistent,94.767,1.028,no
41186,4116,41186,44,technician,married,professional.course,no,no,nov,17,442,1,0,nonexistent,94.767,1.028,yes


In [14]:
import pandas as pd
import zipfile
import os
from pathlib import Path

# Input and output directories (relative to the current working directory)
input_dir = Path("..") / "files" / "input"
output_dir = Path("..") / "files" / "output"
# Create the output folder together with any missing parents; an already
# existing folder is not an error.
output_dir.mkdir(exist_ok=True, parents=True)

# Output files
client_file = output_dir.joinpath("client.csv")
campaign_file = output_dir.joinpath("campaign.csv")
economics_file = output_dir.joinpath("economics.csv")

# Funciones auxiliares
def clean_job(job):
    """Normalize a job label: remove every "." and turn every "-" into "_".

    For example, "admin." becomes "admin" and "blue-collar" becomes
    "blue_collar".
    """
    # One translation table handles both substitutions in a single pass.
    substitutions = str.maketrans({".": None, "-": "_"})
    return job.translate(substitutions)

def clean_education(education):
    """Normalize an education label.

    The literal "unknown" is mapped to the pandas missing-value marker
    (pd.NA); any other label has each "." replaced by "_".
    """
    if education == "unknown":
        return pd.NA
    return education.replace(".", "_")

def to_binary(value, true_value="yes"):
    """Return 1 when value equals true_value (default "yes"), otherwise 0."""
    if value == true_value:
        return 1
    return 0

# Month abbreviation -> month number. Built once at definition time instead of
# on every call, because the function is invoked once per data row.
_MONTH_NUMBERS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12
}


def format_last_contact_day(day, month):
    """Build a "2022-MM-DD" date string from a day and a month abbreviation.

    Args:
        day: Day of the month; any value accepted by int() (e.g. 3 or "3").
        month: Three-letter English month abbreviation. Matching ignores
            letter case and surrounding whitespace.

    Returns:
        str: The date with the year fixed to 2022 and month/day zero-padded
        to two digits, e.g. "2022-05-03".

    Raises:
        KeyError: If month is not one of the twelve known abbreviations.

    Note:
        The day is not checked against the length of the month.
    """
    month_number = _MONTH_NUMBERS[month.strip().lower()]
    return f"2022-{month_number:02d}-{int(day):02d}"

# Main process
all_data = []

# glob() does not guarantee any ordering; sorting the paths keeps the row order
# of the combined table reproducible across runs and machines.
for zip_file in sorted(input_dir.glob("*.csv.zip")):
    with zipfile.ZipFile(zip_file, 'r') as z:
        # Every member of the archive is read as a CSV file.
        for file_name in z.namelist():
            with z.open(file_name) as f:
                df = pd.read_csv(f)
                all_data.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing was read.
if not all_data:
    raise ValueError(f"No CSV data found in '*.csv.zip' archives under {input_dir}")

# Concatenate all DataFrames into one table with a fresh 0..n-1 index
combined_data = pd.concat(all_data, ignore_index=True)

# Build and save client.csv
client_df = combined_data[[
    "client_id", "age", "job", "marital", "education", "credit_default", "mortgage"
]].copy()

client_df["job"] = client_df["job"].apply(clean_job)
client_df["education"] = client_df["education"].apply(clean_education)
client_df["credit_default"] = client_df["credit_default"].apply(to_binary)
client_df["mortgage"] = client_df["mortgage"].apply(to_binary)

client_df.to_csv(client_file, index=False)

# Select the raw campaign columns (not yet transformed or saved in this cell)
campaign_df = combined_data[[
    "client_id", "number_contacts", "contact_duration",
    "previous_campaign_contacts", "previous_outcome",
    "campaign_outcome", "day", "month"
]].copy()

# Last expression of the cell: rendered as the cell output
campaign_df

Unnamed: 0,client_id,number_contacts,contact_duration,previous_campaign_contacts,previous_outcome,campaign_outcome,day,month
0,0,1,261,0,nonexistent,no,13,may
1,1,1,149,0,nonexistent,no,19,may
2,2,1,226,0,nonexistent,no,23,may
3,3,1,151,0,nonexistent,no,27,may
4,4,1,307,0,nonexistent,no,3,may
...,...,...,...,...,...,...,...,...
41183,41183,1,334,0,nonexistent,yes,30,nov
41184,41184,1,383,0,nonexistent,no,6,nov
41185,41185,2,189,0,nonexistent,no,24,nov
41186,41186,1,442,0,nonexistent,yes,17,nov
