## Pair Programming - Regresión logística 2

### Procesado

---

In [1]:
# Tratamiento de datos
import numpy as np
import pandas as pd

# Gráficas
import matplotlib.pyplot as plt
import seaborn as sns

# Estandarización variables numéricas y Codificación variables categóricas
from sklearn.preprocessing import StandardScaler

# Gestión datos desbalanceados

from imblearn.combine import SMOTETomek

# Para separar los datos en train y test
from sklearn.model_selection import train_test_split

#  Configuración de warnings
import warnings
warnings.filterwarnings("ignore")

In [660]:
#pd.options.display.max_columns = None
#pd.options.display.max_rows = None

In [2]:
# Load the travel dataset; the first CSV column is used as the index
ruta_datos = '../archivos/travel_1.csv'
df = pd.read_csv(ruta_datos, index_col=0)
df.head(n=5)

Unnamed: 0,agency,agency_type,distribution_channel,product_name,claim,duration,destination,net_sales,commision_(in_value),gender,age,continent,cat_age,cat_duration
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81,Asia,retired,year
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71,Asia,retired,year
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,PNS,32,Oceania,adult youth,three months
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,PNS,32,Oceania,adult youth,three months
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,PNS,41,Europe,adult,three months


In [3]:
# Drop the columns that are not categorized (the frame also carries
# continent, cat_age and cat_duration)
df = df.drop(columns=["destination", "age", "duration"])

**Objetivos:**

### 1. Estandarizar las variables numéricas de vuestro set de datos.

In [4]:
# Instantiate the StandardScaler that will standardize the numeric columns
scaler = StandardScaler()

In [5]:
# Keep only the numeric columns, which are the ones to standardize
numericas = df.select_dtypes(include="number")
numericas.head(n=5)

Unnamed: 0,net_sales,commision_(in_value)
0,-29.0,9.57
1,-29.0,9.57
2,-49.5,29.7
3,-39.6,23.76
4,-19.8,11.88


In [6]:
# Fit the scaler on the numeric columns and transform them in a single call.
# NOTE(review): the scaler is fitted on the whole dataset, before any
# train/test split — confirm this is intended.
X_escaladas = scaler.fit_transform(numericas)

# The scaler returns an array: wrap it in a DataFrame that reuses the original
# column names. reset_index() additionally copies the fresh 0..n-1 index into
# an "index" column.
numericas_estandar = pd.DataFrame(data=X_escaladas, columns=numericas.columns)
numericas_estandar = numericas_estandar.reset_index()
numericas_estandar.tail(n=5)

Unnamed: 0,index,net_sales,commision_(in_value)
55279,55279,-0.491371,-0.211869
55280,55280,-0.154391,0.075897
55281,55281,-0.055279,0.160534
55282,55282,-0.491371,-0.211869
55283,55283,-0.332792,-0.076449


In [7]:
# Tag every column with "_stand" so it cannot clash with the original columns
numericas_estandar.columns = [f"{nombre}_stand" for nombre in numericas_estandar.columns]

In [8]:
# Move the current index into an "index" column, leaving df with a default
# 0..n-1 index (presumably so it lines up with numericas_estandar)
df = df.reset_index(drop=False)

In [9]:
# Join both frames side by side, keeping the numeric columns both in their
# original and in their standardized form
df_union = pd.concat(objs=[df, numericas_estandar], axis="columns")

In [10]:
# Preview the first two rows of the joined frame
df_union.head(n=2)

Unnamed: 0,index,agency,agency_type,distribution_channel,product_name,claim,net_sales,commision_(in_value),gender,continent,cat_age,cat_duration,index_stand,net_sales_stand,commision_(in_value)_stand
0,0,CBH,Travel Agency,Offline,Comprehensive Plan,No,-29.0,9.57,F,Asia,retired,year,0,-1.423021,-0.053718
1,1,CBH,Travel Agency,Offline,Comprehensive Plan,No,-29.0,9.57,F,Asia,retired,year,1,-1.423021,-0.053718


In [11]:
# Remove the two helper index columns introduced by the reset_index() calls
df_union = df_union.drop(columns=["index", "index_stand"])

### 2. Codificar las variables categóricas. Tener en cuenta si las variables tienen orden o no.

- **agency**: es de tipo object--------------------------  No orden

- **agency_type** : es de tipo object--------------------  No orden

- **distribution_channel**: es de tipo object------------  No orden

- **product_name**: es de tipo object--------------------  Orden

- **claim**: Variable respuesta dicotómica de tipo object- Orden

- **gender**: es de tipo object--------------------------  No orden

- **continent**: es de tipo object-----------------------  Orden

- **cat_age**: es de tipo object-------------------------  Orden

- **cat_duration**: es de tipo object--------------------  Orden 

In [671]:
# Preview the first two rows before encoding the categorical columns
df_union.head(n=2)

Unnamed: 0,agency,agency_type,distribution_channel,product_name,claim,net_sales,commision_(in_value),gender,continent,cat_age,cat_duration,net_sales_stand,commision_(in_value)_stand
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,-29.0,9.57,F,Asia,retired,year,-1.423021,-0.053718
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,-29.0,9.57,F,Asia,retired,year,-1.423021,-0.053718


### 2.1. Variables que no tienen orden.

In [12]:
# Categorical columns without an intrinsic order: these are one-hot encoded.
lista_columnas = ["agency", "agency_type", "distribution_channel", "gender"]

# get_dummies encodes several columns in one call and prefixes every dummy
# with "<source column>_", so there is no need to grow a DataFrame with
# repeated pd.concat calls inside a loop (each concat copies the whole frame).
df_encoded = pd.get_dummies(
    df[lista_columnas],
    columns=lista_columnas,
    prefix_sep="_",
    dtype=int,
)

In [13]:
# Preview the first two rows of the one-hot encoded columns
df_encoded.head(n=2)

Unnamed: 0,agency_ADM,agency_ART,agency_C2B,agency_CBH,agency_CCR,agency_CSR,agency_CWT,agency_EPX,agency_JWT,agency_JZI,...,agency_SSI,agency_TST,agency_TTW,agency_type_Airlines,agency_type_Travel Agency,distribution_channel_Offline,distribution_channel_Online,gender_F,gender_M,gender_PNS
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0


In [14]:
# Move the current index of df_union into an "index" column, leaving a
# default 0..n-1 index
df_union = df_union.reset_index(drop=False)

In [15]:
# Append the one-hot encoded columns to the right of the joined frame
df_codificado = pd.concat(objs=[df_union, df_encoded], axis="columns")

In [16]:
# Drop the source columns that were one-hot encoded, plus the helper "index" column
df_codificado = df_codificado.drop(
    columns=["agency", "agency_type", "distribution_channel", "gender", "index"]
)

### 2.2. Variables que tienen orden.

In [17]:
# Ordinal encodings: each dictionary assigns an integer rank to every category
# of one column (a higher number means a higher rank).
# NOTE(review): the ranks do not follow the natural order of the categories;
# presumably they were derived from their relation with the target — confirm.

mapa_product_name = {
    "Cancellation Plan": 14,
    "2 way Comprehensive Plan": 13,
    "Rental Vehicle Excess Insurance": 12,
    "Basic Plan": 11,
    "Bronze Plan": 10,
    "1 way Comprehensive Plan": 9,
    "Value Plan": 8,
    "Silver Plan": 7,
    "Annual Silver Plan": 6,
    "Ticket Protector": 5,
    "Travel Cruise Protect": 4,
    "Comprehensive Plan": 3,
    "Gold Plan": 2,
    "Others insurances": 1,
}

# Binary response variable
mapa_claim = {
    "Yes": 1,
    "No": 0,
}

mapa_continent = {
    "Asia": 6,
    "Europe": 5,
    "Oceania": 4,
    "North America": 3,
    "Africa": 2,
    "South America": 1,
}

mapa_cat_age = {
    "adult": 6,
    "adult youth": 5,
    "senior": 4,
    "retired": 3,
    "youth": 2,
    "child": 1,
}

mapa_cat_duration = {
    "three months": 8,
    "month": 7,
    "two weeks": 6,
    "year": 5,
    "week": 4,
    "weekend": 3,
    "extra long": 2,
    "one day": 1,
}

In [18]:
# Apply each ordinal mapping to its column.
# Series.map() turns every category that is missing from the dictionary into
# NaN without any warning (this also happens if the cell is run twice, because
# the already-encoded integers are not keys of the dictionaries). All columns
# are therefore encoded and validated first, and only written back once every
# one of them is known to be fully mapped.
mapas_ordinales = {
    "product_name": mapa_product_name,
    "claim": mapa_claim,
    "continent": mapa_continent,
    "cat_age": mapa_cat_age,
    "cat_duration": mapa_cat_duration,
}

columnas_codificadas = {}
for columna, mapa in mapas_ordinales.items():
    codificada = df_codificado[columna].map(mapa)

    # A value present before mapping but missing afterwards is a category
    # the dictionary does not cover (original NaNs are left alone).
    perdidos = codificada.isna() & df_codificado[columna].notna()
    if perdidos.any():
        sin_mapear = sorted(map(str, df_codificado.loc[perdidos, columna].unique()))
        raise ValueError(f"Unmapped categories in '{columna}': {sin_mapear}")

    columnas_codificadas[columna] = codificada

for columna, codificada in columnas_codificadas.items():
    df_codificado[columna] = codificada

In [19]:
# Show the last five rows of the fully encoded frame
df_codificado.tail(n=5)

Unnamed: 0,product_name,claim,net_sales,commision_(in_value),continent,cat_age,cat_duration,net_sales_stand,commision_(in_value)_stand,agency_ADM,...,agency_SSI,agency_TST,agency_TTW,agency_type_Airlines,agency_type_Travel Agency,distribution_channel_Offline,distribution_channel_Online,gender_F,gender_M,gender_PNS
55279,11,0,18.0,6.3,6,5,4,-0.491371,-0.211869,0,...,0,0,0,1,0,0,1,0,1,0
55280,11,0,35.0,12.25,6,5,5,-0.154391,0.075897,0,...,0,0,0,1,0,0,1,0,1,0
55281,11,0,40.0,14.0,6,6,8,-0.055279,0.160534,0,...,0,0,0,1,0,0,1,1,0,0
55282,11,0,18.0,6.3,6,4,3,-0.491371,-0.211869,0,...,0,0,0,1,0,0,1,0,1,0
55283,11,0,26.0,9.1,6,5,7,-0.332792,-0.076449,0,...,0,0,0,1,0,0,1,1,0,0


### 3. Chequear si los datos están balanceados. En caso de que no lo estén utilizad algunas de las herramientas aprendidas en la lección para balancearlos.

Para que este notebook no pese demasiado, el balanceo se hace en el archivo "pp_regresion_logistica_2_Balanceo".

### 4. Guardad el dataframe.

In [32]:
# Persist the encoded and standardized frame (the index is written as the first column)
df_codificado.to_csv(path_or_buf="../archivos/travel_enco_stand.csv", index=True)