In [1]:
import re
import time
import random
from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")
from data import campaign, engagement

# Helper Functions


In [2]:
def to_lowercase(df):
	"""Return a copy of ``df`` whose column labels are lower-cased.

	The frame passed in is left untouched; only the returned copy carries
	the new labels.
	"""
	lowered = {label: label.lower() for label in df.columns.values}
	return df.copy().rename(columns=lowered)

def to_snakecase(df):
	"""Return a copy of ``df`` with whitespace and punctuation removed from column labels.

	For every column label:
	  * leading/trailing whitespace is stripped,
	  * each run of the characters ``,.;@#?!&$`` (together with any spaces
	    directly following it) is deleted,
	  * every remaining space is replaced by an underscore.

	Letter case is not changed here; combine with ``to_lowercase`` for that.
	The input frame is not modified, which mirrors ``to_lowercase`` and avoids
	surprising the caller with renamed columns on the object they passed in.
	"""
	snakecase = {
		label: re.sub(r"[,.;@#?!&$]+\ *", "", label.strip()).replace(" ", "_")
		for label in df
	}
	return df.rename(columns=snakecase)
	
def get_users(path="data/users.csv", date_format = "%Y-%m"):
	"""Load the users CSV and return it as a cleaned DataFrame.

	Column labels are lower-cased and snake-cased first. After that the
	frame must expose ``user``, ``birth_year`` and ``birth_month``:
	  * ``user`` is cast to text and left-padded with zeros to 4 characters,
	    then renamed to ``customer_id``;
	  * ``birth_year`` and ``birth_month`` are joined as ``"<year>-<month>"``,
	    parsed with ``date_format`` and stored as ``birth_year_month``;
	  * ``birth_month`` is dropped.
	"""
	raw = pd.read_csv(path)
	frame = to_snakecase(to_lowercase(raw))

	padded_id = frame['user'].astype(str).str.pad(width=4, side='left', fillchar='0')
	year_month = frame['birth_year'].astype(str) + '-' + frame['birth_month'].astype(str)

	frame['user'] = padded_id
	frame['birth_year'] = pd.to_datetime(year_month, format=date_format)

	return (
		frame
		.drop(columns=['birth_month'])
		.rename(columns={'user': 'customer_id', 'birth_year': 'birth_year_month'})
	)


def get_creditcards(path="data/credit_cards.csv", date_format = "%m/%Y"):
	"""Load the credit-cards CSV and return it as a cleaned DataFrame.

	Parameters
	----------
	path : str
		Location of the CSV file. It is honoured here; a hard-coded relative
		path must not silently override what the caller passes.
	date_format : str
		``strptime`` format used for both ``expires`` and ``acct_open_date``.

	Returns
	-------
	pandas.DataFrame
		Lower-/snake-cased columns, with ``user`` zero-padded to 4 characters
		and ``expires``, ``acct_open_date`` and ``year_pin_last_changed``
		converted to datetimes.
	"""
	credit_cards = to_snakecase(to_lowercase(pd.read_csv(path)))
	credit_cards['user'] = credit_cards['user'].astype(str).str.pad(width=4, side='left', fillchar='0')
	credit_cards['expires'] = pd.to_datetime(credit_cards['expires'], format=date_format)
	# Parse the account-opening date from its own column; reading it from
	# `expires` would just duplicate the expiry date.
	credit_cards['acct_open_date'] = pd.to_datetime(credit_cards['acct_open_date'], format=date_format)
	credit_cards['year_pin_last_changed'] = pd.to_datetime(credit_cards['year_pin_last_changed'], format="%Y")
	return credit_cards

def get_transactions(path="data/transactions.csv", date_format = "%m/%Y"):
	"""Load the transactions CSV and return it as a cleaned DataFrame.

	Steps performed:
	  * normalise column labels (lower-case, snake-case);
	  * add a 1-based ``identifier`` column as the first column;
	  * zero-pad ``user`` to 4 characters;
	  * split ``time`` on ``":"`` into ``hour``/``minute`` and assemble
	    ``date`` from year/month/day/hour/minute;
	  * look up each transaction's ``card_number`` from ``get_creditcards()``
	    (called with its default arguments) on ``(user, card_index)``;
	  * drop the helper columns and rename ``user`` to ``customer_id``.

	Parameters
	----------
	path : str
		Location of the transactions CSV.
	date_format : str
		Not used by this function; kept so existing callers do not break.

	Notes
	-----
	The card lookup is an inner join applied to the transactions frame
	itself, so a transaction without a matching card is dropped together
	with its row. Merging into a separate frame and re-attaching the column
	by index would pair card numbers with the wrong transactions whenever
	the join removed a row.
	"""
	transactions = to_snakecase(to_lowercase(pd.read_csv(path)))
	transactions.insert(0, 'identifier', transactions.index + 1)
	transactions['user'] = transactions['user'].astype(str).str.pad(width=4, side='left', fillchar='0')
	transactions = transactions.rename(columns={'card':'card_index'})
	hour_min = transactions['time'].str.split(":", expand=True).rename(columns={0:'hour', 1:'minute'})
	transactions = pd.concat([transactions, hour_min], axis=1)

	date_cols = ['year', 'month', 'day', 'hour', 'minute']
	transactions['date'] = pd.to_datetime(transactions[date_cols])

	cc_no = get_creditcards()[['user', 'card_index', 'card_number']]
	# Merge on the frame itself so every card number stays on its own row.
	transactions = transactions.merge(cc_no, how='inner', on=['user', 'card_index'])
	card_no = transactions.pop('card_number').astype(str).str.pad(width=4, side='right', fillchar='0')
	transactions.insert(1, 'card_number', card_no)
	transactions = transactions.drop(columns= ['card_index', 'time'] + date_cols)
	transactions = transactions.rename(columns={'user':'customer_id'})
	return transactions

def get_churn(users, 
        start_date = datetime(2023, 1, 1),
        end_date = datetime(2024, 10, 31),
        churn_rate = 0.2,
        seed = None
    ):
    """Generate a synthetic churn table for the given users.

    Each customer is independently marked as churned with probability
    ``churn_rate``; churned customers receive a churn date drawn uniformly
    (whole days) from ``start_date`` to ``end_date`` inclusive.

    Parameters
    ----------
    users : pandas.DataFrame
        Must contain a ``customer_id`` column.
    start_date, end_date : datetime
        Inclusive bounds for the generated churn dates.
    churn_rate : float
        Probability that a customer churns. Defaults to 0.2 (20%).
    seed : int or None
        When given, a private ``random.Random(seed)`` is used so the result
        is reproducible. When ``None`` the module-level ``random`` generator
        is used, so ``random.seed(...)`` still controls the outcome.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``churn_date``; only churned customers
        are listed.
    """
    rng = random if seed is None else random.Random(seed)
    span_days = (end_date - start_date).days

    churn_data = {
        "customer_id": [],
        "churn_date": []
    }

    for customer_id in users["customer_id"].tolist():
        # One uniform draw decides churn; a second draw (only for churned
        # customers) picks the day offset.
        if rng.random() < churn_rate:
            churn_data["customer_id"].append(customer_id)
            churn_data["churn_date"].append(
                start_date + timedelta(days=rng.randint(0, span_days))
            )

    return pd.DataFrame(churn_data)

def get_Products(path="data/recodataset.csv", 
				column_mapping = {
					"fecha_dato": "report_date",
					"ncodpers": "customer_id",
					"ind_empleado": "employee_index",
					"pais_residencia": "country_residence",
					"sexo": "gender",
					"age": "age",
					"fecha_alta": "contract_start_date",
					"ind_nuevo": "new_customer_index",
					"antiguedad": "seniority_months",
					"indrel": "primary_customer_status",
					"ult_fec_cli_1t": "last_primary_customer_date",
					"indrel_1mes": "customer_type_start_month",
					"tiprel_1mes": "customer_relation_type",
					"indresi": "residence_index",
					"indext": "foreigner_index",
					"conyuemp": "spouse_employee_index",
					"canal_entrada": "join_channel",
					"indfall": "deceased_index",
					"tipodom": "address_type",
					"cod_prov": "province_code",
					"nomprov": "province_name",
					"ind_actividad_cliente": "activity_index",
					"renta": "gross_income",
					"segmento": "customer_segment",
					"ind_ahor_fin_ult1": "saving_account",
					"ind_aval_fin_ult1": "guarantee",
					"ind_cco_fin_ult1": "current_account",
					"ind_cder_fin_ult1": "derivada_account",
					"ind_cno_fin_ult1": "payroll_account",
					"ind_ctju_fin_ult1": "junior_account",
					"ind_ctma_fin_ult1": "more_particular_account",
					"ind_ctop_fin_ult1": "particular_account",
					"ind_ctpp_fin_ult1": "particular_plus_account",
					"ind_deco_fin_ult1": "short_term_deposits",
					"ind_deme_fin_ult1": "medium_term_deposits",
					"ind_dela_fin_ult1": "long_term_deposits",
					"ind_ecue_fin_ult1": "e_account",
					"ind_fond_fin_ult1": "funds",
					"ind_hip_fin_ult1": "mortgage",
					"ind_plan_fin_ult1": "pensions",
					"ind_pres_fin_ult1": "loans",
					"ind_reca_fin_ult1": "taxes",
					"ind_tjcr_fin_ult1": "credit_card",
					"ind_valo_fin_ult1": "securities",
					"ind_viv_fin_ult1": "home_account",
					"ind_nomina_ult1": "payroll",
					"ind_nom_pens_ult1": "pensions_payments",
					"ind_recibo_ult1": "direct_debit"
				}
	):
	"""Load the product-holdings CSV and return it with English column names.

	Parameters
	----------
	path : str
		Location of the CSV file.
	column_mapping : dict
		Source-column -> target-column renames applied right after loading.
		The default dict is only read, never modified.

	Returns
	-------
	pandas.DataFrame
		Renamed, lower-/snake-cased frame in which
		  * ``customer_id`` is text, zero-filled to at least 4 characters;
		  * ``report_date``, ``contract_start_date`` and
		    ``last_primary_customer_date`` are datetimes;
		  * text columns are stripped of surrounding whitespace and cells that
		    are exactly ``"NA"`` are replaced by ``None``.
	"""
	santender = pd.read_csv(path)
	santender = santender.rename(columns=column_mapping)
	santender['customer_id'] = santender['customer_id'].astype(str).str.zfill(4)
	santender = to_snakecase(to_lowercase(santender))

	for date_col in ('report_date', 'contract_start_date', 'last_primary_customer_date'):
		santender[date_col] = pd.to_datetime(santender[date_col])

	str_cols = santender.select_dtypes(include='object').columns
	if len(str_cols):
		# Strip column by column: the per-cell result is the same as a
		# row-wise apply, without creating one Series per row.
		santender[str_cols] = santender[str_cols].apply(lambda col: col.str.strip())
		# Anchor the pattern so only cells that are exactly "NA" are nulled.
		# An unanchored r'NA' would also wipe values that merely contain it,
		# such as "BARCELONA" or "NAVARRA".
		santender[str_cols] = santender[str_cols].replace(regex=[r'^NA$'], value=None)
	return santender

# Build the in-memory frames that the later cells write to the database.
# Every loader is called with its default arguments (default CSV paths).
users = get_users()
transactions = get_transactions()
# Churn rows are drawn with the `random` module and no seed is set in this
# notebook, so this table differs from run to run.
churn = get_churn(users)
santender = get_Products()

# Connect to Database

In [3]:
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship, Mapped, mapped_column
from sqlalchemy import create_engine, MetaData, Column, Integer, String, Double, DateTime, ForeignKey
from sqlalchemy.dialects.mysql import LONGTEXT


def create_db(user="root", password="msql1234", server="localhost", database="transact"):
    """Create the SQLAlchemy engine, session factory and declarative base.

    Parameters
    ----------
    user, password : str
        MySQL credentials. They are percent-encoded before being placed in
        the connection URL, so characters such as ``@``, ``:`` or ``/`` in a
        password are not mistaken for URL delimiters.
    server : str
        Host part of the URL (inserted verbatim).
    database : str
        Schema name (inserted verbatim).

    Returns
    -------
    tuple
        ``(engine, SessionLocal, Base)`` where ``SessionLocal`` is a
        ``sessionmaker`` bound to ``engine`` and ``Base`` is a fresh
        declarative base.
    """
    # Local import keeps this helper usable from whichever cell defines it.
    from urllib.parse import quote

    # NOTE(review): the default credentials are hard-coded in the notebook.
    # Prefer passing them in from environment variables or getpass instead
    # of committing a real password.
    SQLALCHEMY_DATABASE_URL = "mysql+pymysql://{}:{}@{}/{}".format(
        quote(user, safe=""), quote(password, safe=""), server, database
    )
    engine = create_engine(SQLALCHEMY_DATABASE_URL)

    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    Base = declarative_base()

    return engine, SessionLocal, Base

# Instantiate the engine, session factory and declarative base using
# create_db()'s default connection arguments.
engine, SessionLocal, Base = create_db()


# Schemas

In [4]:
class Users(Base):
	"""ORM model for the ``users`` table, keyed by ``customer_id``."""
	__tablename__ = 'users'
	# Text key; Transactions and Churn reference it through foreign keys.
	customer_id = Column(String(10), primary_key=True, nullable=False)
	person = Column(String(32))
	current_age = Column(Integer)
	retirement_age = Column(Integer)
	birth_year_month = Column(DateTime)
	gender = Column(String(32))
	address = Column(String(64))
	apartment = Column(Integer)
	city = Column(String(32))
	state = Column(String(32))
	# Stored as text, not as a number.
	zipcode = Column(String(32))
	latitude = Column(Double)
	longitude = Column(Double)
	per_capita_income = Column(Double)
	yearly_income = Column(Double)
	total_debt = Column(Double)
	fico_score = Column(Double)
	num_credit_cards = Column(Integer)

class Transactions(Base):
	"""ORM model for the ``transactions`` table, keyed by an integer ``identifier``."""
	__tablename__ = "transactions"
	identifier = Column(Integer, primary_key=True, autoincrement=True)
	# Must reference an existing users row; ON DELETE CASCADE is declared, so
	# the database removes these rows when the referenced user is deleted.
	customer_id = Column(String(10), ForeignKey("users.customer_id", ondelete="CASCADE"), nullable=False)
	card_number = Column(String(16), nullable=False)
	date = Column(DateTime)
	amount = Column(Double)
	use_chip = Column(String(32))
	merchant_name = Column(String(32))
	merchant_city = Column(String(32))
	merchant_state = Column(String(32))
	zip = Column(String(16))
	mcc = Column(Integer)
	# MySQL LONGTEXT column.
	errors = Column(LONGTEXT)
	# Stored as a short text flag (up to 3 characters), not as a boolean.
	is_fraud = Column(String(3))

class Campaigns(Base):
	"""ORM model for the ``campaign`` table, keyed by ``campaign_id``."""
	# Note: the table name is singular ("campaign") while the class is plural.
	__tablename__ = "campaign"
	# Text key; Engagement.campaign_id references it.
	campaign_id = Column(String(32), primary_key=True, nullable=False)
	campaign_name = Column(String(32))
	start_date = Column(DateTime)
	end_date = Column(DateTime)
	target_segment = Column(String(32))
	budget = Column(Double)
	channel = Column(String(32))
	goal = Column(String(32))
	displays = Column(Integer)

class Engagement(Base): # Independent
	"""ORM model for the ``engagement`` table, keyed by ``engagement_id``."""
	__tablename__ = "engagement"
	engagement_id = Column(String(10), primary_key=True)
	# Must reference an existing campaign row; ON DELETE CASCADE is declared.
	campaign_id = Column(String(32), ForeignKey("campaign.campaign_id", ondelete="CASCADE"), nullable=False)
	# Required, but declared without a foreign key to users.customer_id.
	customer_id = Column(String(10), nullable=False)
	engagement_date = Column(DateTime)
	action_type = Column(String(32))
	device_type = Column(String(32))
	feedback_score = Column(Integer)
	conversion_value  = Column(Double)
	

class Churn(Base): # NOTE(review): was tagged "Independent", but customer_id is a foreign key to users.customer_id
	"""ORM model for the ``churn`` table: at most one churn date per customer."""
	__tablename__ = "churn"
	# Primary key and foreign key at once; ON DELETE CASCADE is declared.
	customer_id = Column(String(10), ForeignKey("users.customer_id", ondelete="CASCADE"), primary_key=True)
	churn_date = Column(DateTime)


class Product(Base):
	"""ORM model for the ``santender`` table (product-holdings dataset), keyed by ``customer_id``."""
	# The table name is spelled "santender"; the SQL queries in this notebook
	# use that exact spelling.
	__tablename__ = 'santender'
	report_date = Column(DateTime)
	# Sole primary key, so the table can hold only one row per customer_id.
	customer_id = Column(String(32), primary_key=True)
	employee_index =  Column(String(16))
	country_residence = Column(String(32))
	gender =  Column(String(16))
	age = Column(Integer)
	contract_start_date = Column(DateTime)
	new_customer_index = Column(Integer)
	seniority_months = Column(Integer)
	primary_customer_status = Column(Integer)
	last_primary_customer_date = Column(DateTime)
	customer_type_start_month = Column(Integer)
	customer_relation_type = Column(String(16))
	residence_index = Column(String(16))
	foreigner_index = Column(String(16))
	spouse_employee_index = Column(String(16))
	join_channel = Column(String(16))
	deceased_index = Column(String(16))
	address_type = Column(Integer)
	province_code = Column(Integer)
	province_name = Column(String(32))
	activity_index = Column(Integer)
	gross_income = Column(Double)
	customer_segment = Column(String(32))
	# Every remaining column is an Integer, one per product.
	# NOTE(review): presumably 0/1 ownership flags - confirm against the dataset.
	saving_account = Column(Integer)
	guarantee = Column(Integer) 
	current_account = Column(Integer)
	derivada_account = Column(Integer)
	payroll_account = Column(Integer)
	junior_account = Column(Integer)
	more_particular_account = Column(Integer)
	particular_account = Column(Integer)
	particular_plus_account = Column(Integer)
	short_term_deposits = Column(Integer)
	medium_term_deposits = Column(Integer)
	long_term_deposits = Column(Integer)
	e_account = Column(Integer)
	funds = Column(Integer)
	mortgage = Column(Integer)
	pensions = Column(Integer)
	loans = Column(Integer)
	taxes = Column(Integer)
	credit_card = Column(Integer)
	securities = Column(Integer)
	home_account = Column(Integer)
	payroll = Column(Integer)
	pensions_payments = Column(Integer)
	direct_debit = Column(Integer)


# Rebuild the schema from scratch: drop every table registered on Base,
# then create them again.
# WARNING: drop_all discards any rows already stored in those tables.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

# Insert into Database

In [5]:
# Table name -> frame to append. Insertion order matters because of the
# foreign keys declared in the schema: `users` must be loaded before
# `transactions` and `churn`, and `campaign` before `engagement`.
dct = {'users': users, 'transactions':transactions,
	'churn':churn, 'campaign': campaign,
	'engagement': engagement,
	'santender': santender}
for k,v in dct.items():
	try:
		# to_sql receives the engine, so pandas opens its own connection and
		# transaction for each table. A separate connection with manual
		# commit()/rollback() would have no effect on these inserts.
		v.to_sql(k, con=engine, if_exists='append', index=False)
	except Exception as exc:
		# Report why the load failed instead of hiding the error, and keep
		# going with the remaining tables.
		print("{} Failed: {}".format(k, exc))
	else:
		print("{} Ok".format(k))

users Ok
transactions Ok
churn Ok
campaign Ok
engagement Ok
santender Ok


# Example: How to Get Data

In [6]:
import pandas as pd
import numpy as np

import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine


def create_db(user="root", password="msql1234", server="localhost", database="transact"):
    """Create the SQLAlchemy engine, session factory and declarative base.

    Parameters
    ----------
    user, password : str
        MySQL credentials. They are percent-encoded before being placed in
        the connection URL, so characters such as ``@``, ``:`` or ``/`` in a
        password are not mistaken for URL delimiters.
    server : str
        Host part of the URL (inserted verbatim).
    database : str
        Schema name (inserted verbatim).

    Returns
    -------
    tuple
        ``(engine, SessionLocal, Base)`` where ``SessionLocal`` is a
        ``sessionmaker`` bound to ``engine`` and ``Base`` is a fresh
        declarative base.
    """
    # Local import keeps this example cell self-contained.
    from urllib.parse import quote

    # NOTE(review): the default credentials are hard-coded in the notebook.
    # Prefer passing them in from environment variables or getpass instead
    # of committing a real password.
    SQLALCHEMY_DATABASE_URL = "mysql+pymysql://{}:{}@{}/{}".format(
        quote(user, safe=""), quote(password, safe=""), server, database
    )
    engine = create_engine(SQLALCHEMY_DATABASE_URL)

    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    Base = declarative_base()

    return engine, SessionLocal, Base

# Create the engine, session factory and declarative base used by get_data()
# below, with create_db()'s default connection arguments.
engine, SessionLocal, Base = create_db()

def get_data(query_string):
	"""Run ``query_string`` against the module-level ``engine`` and return the rows as a DataFrame.

	Parameters
	----------
	query_string : executable SQL object
		Anything accepted by ``Connection.execute`` (the notebook passes
		``sqlalchemy.text(...)`` objects).

	Returns
	-------
	pandas.DataFrame
		One row per result row, with columns named after the result's keys.
		The column names are kept even when the query returns no rows.
	"""
	# The `with` block closes the connection on exit, so no explicit close().
	with engine.connect() as db:
		result = db.execute(query_string)
		# Pass the result's keys explicitly so an empty result still produces
		# a frame with the expected header.
		fetched = pd.DataFrame(result.fetchall(), columns=list(result.keys()))
	return fetched

## Get Raw Churn Data

In [7]:
# Every user, with churn information attached where it exists.
# The LEFT JOIN keeps users that have no churn row; their churn columns are NULL.
# SELECT * returns customer_id twice: once from users and once from churn.
query_string = sqlalchemy.text(
	"""
	SELECT * 
	FROM users u
	LEFT JOIN churn c
	ON u.customer_id = c.customer_id;
	"""
)
fetched = get_data(query_string)
fetched

Unnamed: 0,customer_id,person,current_age,retirement_age,birth_year_month,gender,address,apartment,city,state,zipcode,latitude,longitude,per_capita_income,yearly_income,total_debt,fico_score,num_credit_cards,customer_id.1,churn_date
0,0000,Hazel Robinson,53,66,1966-11-01,Female,462 Rose Lane,,La Verne,CA,91750,34.15,-117.76,29278.0,59696.0,127613.0,787.0,5,,NaT
1,0001,Sasha Sadr,53,68,1966-12-01,Female,3606 Federal Boulevard,,Little Neck,NY,11363,40.76,-73.74,37891.0,77254.0,191349.0,701.0,5,0001,2024-03-20
2,0002,Saanvi Lee,81,67,1938-11-01,Female,766 Third Drive,,West Covina,CA,91792,34.02,-117.89,22681.0,33483.0,196.0,698.0,5,,NaT
3,0003,Everlee Clark,63,63,1957-01-01,Female,3 Madison Street,,New York,NY,10069,40.71,-73.99,163145.0,249925.0,202328.0,722.0,4,,NaT
4,0004,Kyle Peterson,43,70,1976-09-01,Male,9620 Valley Stream Drive,,San Francisco,CA,94117,37.76,-122.44,53797.0,109687.0,183855.0,675.0,1,0004,2023-06-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,Jose Faraday,32,70,1987-07-01,Male,6577 Lexington Lane,9.0,Freeport,NY,11520,40.65,-73.58,23550.0,48010.0,87837.0,703.0,3,,NaT
1996,1996,Ximena Richardson,62,65,1957-11-01,Female,2 Elm Drive,955.0,Independence,KY,41051,38.95,-84.54,24218.0,49378.0,104480.0,740.0,4,,NaT
1997,1997,Annika Russell,47,67,1973-01-01,Female,276 Fifth Boulevard,,Elizabeth,NJ,7201,40.66,-74.19,15175.0,30942.0,71066.0,779.0,3,,NaT
1998,1998,Juelz Roman,66,60,1954-02-01,Male,259 Valley Boulevard,,Camp Hill,PA,17011,40.24,-76.92,25336.0,54654.0,27241.0,618.0,1,1998,2024-09-30


## Get Raw Transaction/RFM Data

In [8]:
# One row per transaction: all user columns plus the transaction amount and date.
# The comma join with the WHERE equality is an inner join, so users without
# any transaction do not appear.
query_string = sqlalchemy.text(
	"""
	SELECT u.*, t.amount, t.date
	FROM users u, transactions t
	WHERE u.customer_id = t.customer_id;
	"""
)
fetched = get_data(query_string)
fetched

Unnamed: 0,customer_id,person,current_age,retirement_age,birth_year_month,gender,address,apartment,city,state,zipcode,latitude,longitude,per_capita_income,yearly_income,total_debt,fico_score,num_credit_cards,amount,date
0,1781,Achilles Edwards,76,65,1943-07-01,Male,2012 Forest Avenue,,Hickory,NC,28602,35.73,-81.32,17850.0,21867.0,21103.0,759.0,2,28.40,2003-08-14 17:52:00
1,1457,Bryce Long,32,71,1988-01-01,Male,4725 North Street,,Toledo,OH,43608,41.66,-83.58,12101.0,24668.0,22338.0,756.0,1,41.97,2018-05-25 10:15:00
2,0252,Angie Cox,43,70,1976-05-01,Female,5194 Grant Street,6.0,Greenville,SC,29607,34.83,-82.37,24314.0,49577.0,142314.0,694.0,3,-75.00,2003-09-05 17:13:00
3,1329,Clay Murphy,50,64,1969-04-01,Male,4480 Hillside Avenue,6.0,Round Lake,IL,60073,42.34,-88.11,22578.0,46039.0,79738.0,672.0,4,58.62,2013-06-22 04:00:00
4,1234,Santana Masvidal,52,65,1967-03-01,Male,14813 El Camino Drive,,Eastpointe,MI,48021,42.46,-82.94,18487.0,37686.0,41173.0,739.0,4,23.81,2017-03-30 20:52:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0790,Eduardo Torres,42,73,1977-10-01,Male,663 Summit Boulevard,368.0,Brooklyn,NY,11210,40.64,-73.94,23316.0,47542.0,2667.0,725.0,3,2.83,2008-11-03 07:36:00
9996,0807,Paola Howard,78,67,1941-05-01,Female,245 Martin Luther King Drive,,Fort Mill,SC,29715,35.00,-80.94,26707.0,51014.0,23366.0,759.0,4,4.67,2006-08-15 08:34:00
9997,0067,Janiyah Foster,90,66,1929-06-01,Female,145 River Drive,,Allentown,PA,18102,40.59,-75.47,12427.0,19893.0,1712.0,566.0,3,-100.00,2012-05-04 15:44:00
9998,0058,Milani Merkel,46,59,1973-05-01,Female,524 Ocean Drive,87.0,Houston,TX,77056,29.76,-95.38,95039.0,193773.0,241571.0,660.0,1,91.89,2018-02-06 16:03:00


## Get Raw Engagement Data

In [9]:
# One row per engagement event: all campaign columns plus the selected
# engagement fields (engagement_id is not selected).
# The join is an inner join on campaign_id, so campaigns without engagement
# rows do not appear.
query_string = sqlalchemy.text(
	"""
	SELECT c.*, e.customer_id, e.engagement_date, 
	e.action_type, e.device_type, e.feedback_score,
	e.conversion_value
	FROM campaign c, engagement e
	WHERE c.campaign_id = e.campaign_id;
	"""
)
fetched = get_data(query_string)
fetched

Unnamed: 0,campaign_id,campaign_name,start_date,end_date,target_segment,budget,channel,goal,displays,customer_id,engagement_date,action_type,device_type,feedback_score,conversion_value
0,0293,Campaign_293,2023-11-02,2023-11-15,Retirees,195.83,social,awareness,174671,0917,2023-04-20,scrolled,mobile,5,0.00
1,0017,Campaign_17,2023-01-02,2023-01-25,Retirees,313.81,influencer,consideration,281172,1945,2024-11-10,clicked,mobile,3,0.00
2,0083,Campaign_83,2024-12-06,2024-12-26,Families,299.17,email,awareness,260329,0631,2023-07-16,clicked,desktop,2,0.00
3,0125,Campaign_125,2023-06-07,2023-06-28,Retirees,217.44,email,retention,166563,1652,2024-01-24,scrolled,desktop,5,0.00
4,0155,Campaign_155,2024-10-04,2024-10-20,Families,68.92,influencer,retention,66405,1875,2024-12-20,scrolled,laptop,4,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0267,Campaign_267,2023-11-04,2023-11-11,High-income,154.66,sms,conversion,143666,1272,2024-04-27,clicked,mobile,2,0.00
99996,0130,Campaign_130,2023-11-14,2023-11-22,Retirees,372.64,email,consideration,322240,1919,2024-06-21,scrolled,mobile,3,0.00
99997,0018,Campaign_18,2024-01-14,2024-02-10,Young Adults,570.12,email,awareness,369796,0468,2024-09-27,scrolled,mobile,4,0.00
99998,0245,Campaign_245,2024-07-11,2024-07-30,High-income,112.55,sms,consideration,125391,1260,2024-12-24,scrolled,mobile,4,0.00


## Get Raw Clicks/Leads Data

In [10]:
# Per-campaign funnel counts. The inner query joins campaign to engagement
# (one row per engagement event); the outer query groups by campaign_id and
# channel and counts events by action_type:
#   'clicked' -> clicks, 'credentials' -> leads, 'converted' -> orders.
# NOTE(review): SUM(t1.budget) adds the campaign's budget once per joined
# engagement row, so mark_spent grows with the number of engagements -
# confirm this is intended rather than the plain campaign budget.
# NOTE(review): start_date and displays are selected without being aggregated
# or listed in GROUP BY; whether MySQL accepts that depends on its
# ONLY_FULL_GROUP_BY handling - confirm for the target server.
query_string = sqlalchemy.text(
	"""
	SELECT t1.campaign_id,
	SUM(t1.budget) AS mark_spent,
	t1.start_date AS c_date,
	t1.channel as category,
	t1.displays,
	SUM(CASE WHEN t1.action_type = 'clicked' THEN 1 ELSE 0 END) AS clicks,
	SUM(CASE WHEN t1.action_type = 'credentials' THEN 1 ELSE 0 END) AS leads, 
	SUM(CASE WHEN t1.action_type = 'converted' THEN 1 ELSE 0 END) AS orders
	FROM 
	(SELECT c.campaign_id,
	c.campaign_name, 
	c.start_date, c.end_date,
	c.target_segment, 
	c.budget,
	c.channel,
	c.displays,
	e.customer_id, 
	e.engagement_date, 
	e.action_type, e.device_type, e.feedback_score,
	e.conversion_value
	FROM campaign c, engagement e
	WHERE c.campaign_id = e.campaign_id) AS t1
	GROUP BY t1.campaign_id, t1.channel
	ORDER BY t1.campaign_id, t1.start_date, t1.channel;
	"""
)
fetched = get_data(query_string)
fetched

Unnamed: 0,campaign_id,mark_spent,c_date,category,displays,clicks,leads,orders
0,0001,203839.02,2023-01-09,search,369796,110,34,53
1,0002,63420.50,2023-09-03,influencer,187097,89,22,31
2,0003,65315.12,2023-04-10,influencer,184686,88,32,4
3,0004,40365.45,2024-02-11,app,139851,85,24,65
4,0005,177393.43,2024-12-07,email,364506,114,18,36
...,...,...,...,...,...,...,...,...
294,0295,48448.88,2024-10-04,search,89240,90,29,2
295,0296,97807.13,2024-12-04,social,254233,95,29,21
296,0297,49018.48,2023-02-06,sms,97410,118,22,4
297,0298,47755.89,2024-09-15,email,137429,97,31,6


## Get Santander Dataset (Eng)

In [11]:
# Full contents of the `santender` table (the product-holdings data with
# English column names), with no filtering.
query_string = sqlalchemy.text(
	"""
	SELECT *
	FROM santender;
	"""
)
fetched = get_data(query_string)
fetched

Unnamed: 0,report_date,customer_id,employee_index,country_residence,gender,age,contract_start_date,new_customer_index,seniority_months,primary_customer_status,...,mortgage,pensions,loans,taxes,credit_card,securities,home_account,payroll,pensions_payments,direct_debit
0,2015-01-28,1013019,N,ES,H,32.0,2012-04-23,0.0,39.0,1.0,...,0,0,0,0,0,0,0,1.0,1.0,0
1,2015-01-28,1013023,N,ES,H,28.0,2012-04-23,0.0,39.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2,2015-01-28,1013024,N,ES,H,23.0,2012-04-23,0.0,39.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
3,2015-01-28,1013031,N,ES,V,40.0,2012-04-23,0.0,39.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,1
4,2015-01-28,1013035,N,ES,V,37.0,2012-04-23,0.0,39.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89996,2015-01-28,954177,N,ES,H,24.0,2011-10-04,0.0,45.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
89997,2015-01-28,954178,N,ES,V,28.0,2011-10-04,0.0,45.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
89998,2015-01-28,954179,N,ES,V,28.0,2011-10-04,0.0,45.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
89999,2015-01-28,954180,N,ES,V,26.0,2011-10-04,0.0,45.0,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
