In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Load the working dataset; the first CSV column is used as the DataFrame index.
ruta_entrada = 'datos/bank_additional_editado.csv'
df = pd.read_csv(ruta_entrada, index_col=0)

In [3]:
# Preview the first five rows (five is the default for head()).
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,month_day_week
0,56,housemaid,married,basic.4y,0.0,0.0,0.0,telephone,261,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,no,"['may', 'mon']"
1,57,services,married,high.school,,0.0,0.0,telephone,149,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,no,"['may', 'mon']"
2,37,services,married,high.school,0.0,1.0,0.0,telephone,226,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,no,"['may', 'mon']"
3,40,admin.,married,basic.6y,0.0,0.0,0.0,telephone,151,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,no,"['may', 'mon']"
4,56,services,married,high.school,0.0,0.0,1.0,telephone,307,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,no,"['may', 'mon']"


## Pair Programming Limpieza II

### Hipótesis

1. La edad, el trabajo, el estado civil, la educación, la situación de deuda y la forma de contacto pueden influir en la probabilidad de que un cliente acepte la oferta.

2. El número de veces que se ha contactado a un cliente en el pasado (campo campaign), el número de días que han pasado desde el último contacto (campo pdays), y el resultado de la campaña anterior (campo poutcome) pueden afectar la respuesta del cliente a una nueva oferta.

3. Las variables económicas (tales como el índice de precios al consumidor (cons.price.idx), la tasa de variación del empleo (emp.var.rate), etc.) pueden influir en la probabilidad de que un cliente acepte la oferta.

4. Los clientes que ya tienen una hipoteca (housing) o un préstamo (loan) pueden ser menos propensos a aceptar una nueva oferta, ya que podrían estar limitados financieramente.

### Tareas

1. Columnas loan, housing y default: estas columnas contienen valores únicos de 0 y 1. Esto puede parecer poco intuitivo a la hora de la extracción de conclusiones y en las visualizaciones. El objetivo de este ejercicio es que cambies los valores numéricos por "Si" y "No". A qué corresponde cada uno de los valores lo tenéis en el pair de Limpieza I.

In [4]:
# Show one randomly chosen row (no seed is set, so the row differs between runs).
df.sample(n=1)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,month_day_week
28149,45,blue-collar,single,basic.9y,0.0,1.0,0.0,cellular,173,1,999,0,NONEXISTENT,-1.8,93.075,-47.1,1.466,5099.1,no,"['apr', 'mon']"


In [5]:
# Lookup table that turns the numeric 1.0 / 0.0 flags into text labels.
mapa = {
    1.0: 'si',
    0.0: 'no',
}

In [6]:
# Apply the same flag-to-label lookup to each of the three binary columns.
# na_action='ignore' leaves missing values untouched instead of passing them to the lookup.
for columna in ('loan', 'housing', 'default'):
    df[columna] = df[columna].map(mapa, na_action='ignore')

In [7]:
# Inspect ten random rows to check the relabelled columns.
df.sample(n=10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,month_day_week
22966,59,management,married,university.degree,no,no,si,cellular,168,3,999,0,NONEXISTENT,1.4,93.444,-36.1,4.965,5228.1,no,"['aug', 'mon']"
5527,42,entrepreneur,married,basic.6y,no,no,no,telephone,204,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,no,"['may', 'mon']"
24920,34,blue-collar,single,high.school,no,si,no,cellular,257,1,999,0,NONEXISTENT,-0.1,93.2,-42.0,4.153,5195.8,no,"['nov', 'tue']"
33343,28,unemployed,married,high.school,,no,no,cellular,216,1,999,1,FAILURE,-1.8,92.893,-46.2,1.291,5099.1,no,"['may', 'tue']"
23889,41,technician,married,professional.course,no,si,no,cellular,307,9,999,0,NONEXISTENT,1.4,93.444,-36.1,4.963,5228.1,no,"['aug', 'fri']"
36904,60,retired,divorced,basic.4y,no,no,no,cellular,111,5,999,0,NONEXISTENT,-2.9,92.963,-40.8,1.215,5076.2,no,"['jun', 'mon']"
19765,50,unemployed,married,professional.course,,no,no,cellular,135,1,999,0,NONEXISTENT,1.4,93.444,-36.1,4.966,5228.1,no,"['aug', 'fri']"
2641,42,admin.,single,university.degree,no,si,no,telephone,72,4,999,0,NONEXISTENT,1.1,93.994,-36.4,4.856,5191.0,no,"['may', 'tue']"
3801,40,blue-collar,married,basic.6y,no,si,no,telephone,130,3,999,0,NONEXISTENT,1.1,93.994,-36.4,4.859,5191.0,no,"['may', 'fri']"
38997,47,admin.,married,university.degree,no,si,si,cellular,67,5,999,0,NONEXISTENT,-3.0,92.713,-33.0,0.717,5023.5,no,"['dec', 'mon']"


2. Para la columna de education, si nos fijamos en sus valores únicos veremos que tenemos puntos. El objetivo de este ejercicio es que quitéis los puntos de esos valores y los reemplacéis por espacios.

In [8]:
# Replace the dots in the education labels with spaces (e.g. 'high.school' -> 'high school').
# regex=False makes '.' a plain character rather than the regex wildcard, so the result no longer
# depends on the pandas-version default and the FutureWarning about that default is not raised.
df['education'] = df['education'].str.replace('.', ' ', regex=False)

  df['education'] = df['education'].str.replace('.', ' ')


In [9]:
# Show one random row to check the education labels.
df.sample(n=1)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,month_day_week
3505,44,admin.,single,high school,no,si,no,telephone,347,4,999,0,NONEXISTENT,1.1,93.994,-36.4,4.86,5191.0,no,"['may', 'thu']"


3. Para la columna job, hay un valor único que está abreviado (admin.), cambiad la abreviatura por el nombre completo.

In [10]:
# Expand the abbreviated job label 'admin.' to 'administration'.
# regex=False matches the text literally; as a regex the trailing '.' would match any character,
# and relying on the default also triggers a pandas FutureWarning.
df['job'] = df['job'].str.replace('admin.', 'administration', regex=False)
df.sample(10)

  df['job'] = df['job'].str.replace('admin.', 'administration')


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,month_day_week
2723,45,,married,,,,,telephone,179,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.859,5191.0,no,"['may', 'wed']"
32051,39,blue-collar,married,basic 4y,no,no,no,cellular,155,1,999,0,NONEXISTENT,-1.8,92.893,-46.2,1.327,5099.1,no,"['may', 'thu']"
7034,38,self-employed,married,,no,si,no,telephone,118,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.86,5191.0,no,"['may', 'thu']"
139,45,blue-collar,married,basic 9y,,si,no,telephone,461,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,yes,"['may', 'mon']"
2776,49,blue-collar,married,basic 9y,no,si,no,telephone,87,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.859,5191.0,no,"['may', 'wed']"
4565,29,services,married,high school,no,no,no,telephone,189,2,999,0,NONEXISTENT,1.1,93.994,-36.4,4.856,5191.0,no,"['may', 'tue']"
31886,36,blue-collar,married,basic 9y,no,no,no,cellular,115,3,999,0,NONEXISTENT,-1.8,92.893,-46.2,1.327,5099.1,no,"['may', 'thu']"
5497,44,technician,married,university degree,no,no,no,telephone,125,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191.0,no,"['may', 'mon']"
26194,31,administration,married,university degree,no,si,no,cellular,298,2,5,1,SUCCESS,-0.1,93.2,-42.0,4.076,5195.8,no,"['nov', 'thu']"
15897,31,services,divorced,high school,no,si,no,cellular,522,1,999,0,NONEXISTENT,1.4,93.918,-42.7,4.96,5228.1,no,"['jul', 'mon']"


4. La columna month_day_week tiene una lista que contiene información muy diferente. El objetivo de este ejercicio es separar esta columna en dos nuevas, donde tengamos los meses en una columna y los días de la semana en otra.

In [11]:
# NOTE(review): abandoned attempt; the line is commented out and does not run.
#df['month_day_week'] = df['month_day_week'].str.replace('[]', 'administration')

In [12]:
# Pull the first quoted item out of the stringified list (e.g. "['may', 'mon']" -> 'may').
# A pattern is used instead of fixed character positions so labels of any length are handled
# and missing values stay missing instead of raising.
df['month'] = df['month_day_week'].str.extract(r"^\['([^']*)'", expand=False)

In [13]:
# Pull the last quoted item out of the stringified list (e.g. "['may', 'mon']" -> 'mon').
# A pattern is used instead of fixed character positions so labels of any length are handled
# and missing values stay missing instead of raising.
df['day'] = df['month_day_week'].str.extract(r",\s*'([^']*)'\]$", expand=False)

In [14]:
# Show one random row to check the new month and day columns.
df.sample(n=1)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,month_day_week,month,day
25150,45,technician,divorced,professional course,no,si,si,cellular,124,1,999,0,NONEXISTENT,-0.1,93.2,-42.0,4.153,5195.8,no,"['nov', 'tue']",nov,tue


5. Guarda el csv con las columnas limpias para seguir trabajando con este dataframe limpio.

In [15]:
# Save the cleaned DataFrame (index included) as CSV.
# NOTE(review): this is the same path the notebook reads from, so the input file is overwritten
# and a second full run would start from already-cleaned data — confirm that is intended.
ruta_salida = 'datos/bank_additional_editado.csv'
df.to_csv(ruta_salida)