Definición de módulos con scripts de funciones

In [None]:
import sys
import os

# Add the project root (the parent of the current working directory) to
# sys.path so the local `Paquete` package can be imported.
sys.path.append(os.path.abspath('..'))

#Importar modulos
import Paquete.carga_datos as cdata
import Paquete.preprocesamiento as prep
import Paquete.visualizaciones as vis
import Paquete.modelaje as md

from sklearn.model_selection import train_test_split
import pandas as pd

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): with no category given this filter ignores all warnings,
# including any raised by pandas or scikit-learn — consider narrowing it.
warnings.filterwarnings(action='ignore')

# Seed passed as `random_state` to train_test_split and as `seed` to every
# md.SeleccionModelo call (presumably to make runs repeatable).
seed = 123

Lectura de datos

In [None]:
# Resolve the location of 'Data.xlsx' with the project helper and load it.
# `data` is presumably a pandas DataFrame (it is used with .shape, .dtypes,
# .isna() and .drop() afterwards) — confirm in Paquete.carga_datos.
ruta = cdata.GetRutaInput('Data.xlsx')
data = cdata.LecturaData(ruta)

In [None]:
# Display the dataframe
data

In [None]:
# Number of rows and columns
data.shape

In [None]:
# Data type of each column
data.dtypes

In [None]:
# Cardinality of the variables (project helper; presumably reports the
# unique values per feature — confirm in Paquete.preprocesamiento)
prep.UnicosFeature(data)

In [None]:
# Count of missing values per column
data.isna().sum()

In [None]:
# Drop the customer ID: being unique per row it has no predictive value
data = data.drop(columns=['customerID'])

Splitting de data en training y testing (para evitar data leakage)

In [None]:
# Variables grouped by their role in the model
Target = 'Churn'
FeaturesNumericos = ['MonthlyCharges', 'TotalCharges', 'tenure']
# Every column that is neither the target nor numeric is treated as categorical
FeaturesCategoricos = data.columns.drop([Target, *FeaturesNumericos])


In [None]:
# Labeling: separate the predictors from the target and encode the target
# as 1 ('Yes') / 0 ('No'), kept as a one-column DataFrame.
X = data.drop(columns=Target)
y = pd.DataFrame(data[Target].map({'Yes': 1, 'No': 0}))

# 70/30 train/test partition, reproducible through the shared seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
)

Preprocesamiento de variables numéricas (sobre training set)

In [None]:
# Split the training predictors into a numeric and a categorical frame.
# Explicit copies guarantee that later edits made by the prep helpers never
# touch something pandas may treat as a view of X_train; all warnings are
# silenced in this notebook, so a SettingWithCopyWarning would go unnoticed.
X_num = X_train[FeaturesNumericos].copy()
X_cat = X_train[FeaturesCategoricos].copy()

# Apply the project helpers to the numeric frame. Judging by their names they
# replace non-numeric entries, cast to float and fill missing values —
# confirm in Paquete.preprocesamiento.
X_num = prep.CambioNoNumerico(X_num, X_num.columns)
X_num = prep.ToFloat(X_num, X_num.columns)
X_num = prep.RellenoNA(X_num, X_num.columns)

In [None]:
# Sanity checks on the cleaned numeric training frame.
print("Cantidad de NAs en Dataframe: ", X_num.isna().sum().sum())
# Read both the column names and the dtypes from the frame itself, so each
# dtype is always paired with its own column even if a helper reordered or
# dropped columns relative to FeaturesNumericos.
print("Tipo de datos: ", list(X_num.dtypes.items()))

Visualizaciones (sobre training set)

In [None]:
# Histograms of the numeric training features (project plotting helper)
vis.HistogramasIndividuales(X_num)

In [None]:
# Kernel-smoothed densities for all numeric training columns
vis.DensidadSuavizadaSimultanea(X_num, X_num.columns)

In [None]:
# Iterative boxplots of the numeric features (the helper also receives the
# training target and its column name)
vis.BoxPlotIterativo(X_num, y_train, Target)

In [None]:
# Assessment of class imbalance in the training target
vis.PlotDesbalanceoClass(y_train,y_train.columns)

In [None]:
# Correlation heatmap of the numeric features together with the target
vis.HeatmapCorr(
    X_num,
    y_train,
    annotacion=True,
    mostrar_ejes=True,
    mostrar_barra_color=True,
    largo=4,
    ancho=3,
)

In [None]:
# Numeric columns excluded from the model
Col_Numericas_ToDrop = ['TotalCharges']
X_num = X_num.drop(columns=Col_Numericas_ToDrop)

Preprocesamiento categórico (sobre training set)

In [None]:
# Simplify the features with a cardinality of 3 by turning them into binary ones
Features_a_Binarios = ['MultipleLines','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
# NOTE(review): the return value is discarded, so this step only has an effect
# if prep.aBinario modifies X_cat in place — confirm; if it returns a new
# frame instead, the conversion is silently lost.
prep.aBinario(X_cat, Features_a_Binarios)

In [None]:
# Encode the categorical columns with the project helper (presumably maps
# two-level features to 0/1 — confirm in Paquete.preprocesamiento)
X_cat = prep.EncodingBinario(X_cat, X_cat.columns)

In [None]:
# Chi-square test of independence used to discard categorical features.
# The helper receives the training target and 0.1 (presumably the
# significance threshold — confirm) and returns the columns to drop;
# ColsToDrop is reused later when preprocessing the test set.
ColsToDrop = prep.Chi2Filtering(X_cat, X_cat.columns, y_train, 0.1)
X_cat = X_cat.drop(columns = ColsToDrop)

In [None]:
# One-hot encoding of the remaining categorical columns (project helper)
X_cat = prep.OneHotEncode(X_cat, X_cat.columns)

Preprocesamiento conjunto (sobre Training set)

In [None]:
# Rejoin the processed numeric and categorical training features column-wise
# (pd.concat aligns the rows on the index), then display the result.
X_train = pd.concat([X_num, X_cat], axis='columns')
X_train

In [None]:
# Scaling of the assembled training matrix with the project helper
# (method='standard' presumably selects z-score standardization — confirm
# in Paquete.preprocesamiento), then display the result.
X_train = prep.escala_data(X_train, method= 'standard')
X_train

In [None]:
# Smoothed densities of the two numeric features after scaling
vis.DensidadSuavizadaSimultanea(
    X_train,
    ['MonthlyCharges', 'tenure'],
    linewidth=4,
    sizeL=10,
    sizeW=3,
)

Preprocesamiento (sobre Testing set)

In [None]:
# Apply to the held-out set the same sequence of steps used on the training set.
#
# NOTE(review): every prep helper below is invoked on the test frame alone.
# If RellenoNA, OneHotEncode or escala_data derive their fill values,
# categories or scaling statistics from the frame they receive, the test set
# is transformed with its own parameters instead of those learned on the
# training set (and the one-hot columns may not match X_train). Confirm in
# Paquete.preprocesamiento and prefer fit-on-train / transform-on-test.

# Explicit copies so helper edits never act on a possible view of X_test
# (warnings are silenced in this notebook, so pandas would not report it).
X_num = X_test[FeaturesNumericos].copy()
X_cat = X_test[FeaturesCategoricos].copy()

# Numeric clean-up with the project helpers
X_num = prep.CambioNoNumerico(X_num, X_num.columns)
X_num = prep.ToFloat(X_num, X_num.columns)
X_num = prep.RellenoNA(X_num, X_num.columns)

# Drop the same numeric column that was removed from the training set
Col_Numericas_ToDrop = ['TotalCharges']
X_num = X_num.drop(columns=Col_Numericas_ToDrop)

# Simplify the features with a cardinality of 3 by turning them into binary ones
Features_a_Binarios = ['MultipleLines','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
# NOTE(review): return value discarded — relies on prep.aBinario modifying
# X_cat in place; confirm.
prep.aBinario(X_cat, Features_a_Binarios)

X_cat = prep.EncodingBinario(X_cat, X_cat.columns)
# Reuse the columns rejected by the chi-square filter on the training set
X_cat = X_cat.drop(columns=ColsToDrop)
# One-hot encoding
X_cat = prep.OneHotEncode(X_cat, X_cat.columns)
X_test = pd.concat([X_num, X_cat], axis=1)

# Scaling
X_test = prep.escala_data(X_test, method='standard')

X_test

Selección de modelo

In [None]:
# Evaluation metric passed as `metric` to every md.SeleccionModelo call.
# NOTE(review): the notebook inspects class imbalance earlier; if the classes
# turn out to be imbalanced, accuracy alone can be misleading — confirm the
# choice of metric.
MetricaEval = 'accuracy'

In [None]:
# Logistic regression
md.SeleccionModelo(
    X_train,
    y_train,
    X_test,
    y_test,
    ModelSelected='Logistic Regression',
    metric=MetricaEval,
    seed=seed,
    iteraciones=100,
)

In [None]:
# Support Vector Machine
md.SeleccionModelo(
    X_train,
    y_train,
    X_test,
    y_test,
    ModelSelected='SVM',
    metric=MetricaEval,
    seed=seed,
    iteraciones=100,
)

In [None]:
# k-Nearest Neighbors
md.SeleccionModelo(
    X_train,
    y_train,
    X_test,
    y_test,
    ModelSelected='KNN',
    metric=MetricaEval,
    seed=seed,
    iteraciones=100,
)

In [None]:
# Decision tree
md.SeleccionModelo(
    X_train,
    y_train,
    X_test,
    y_test,
    ModelSelected='Decision Tree',
    metric=MetricaEval,
    seed=seed,
    iteraciones=100,
)

In [None]:
# Random forest
md.SeleccionModelo(
    X_train,
    y_train,
    X_test,
    y_test,
    ModelSelected='Random Forest',
    metric=MetricaEval,
    seed=seed,
    iteraciones=100,
)

In [None]:
# Gradient boosting
# NOTE(review): 10 iterations here versus 100 for the other models —
# presumably to limit runtime; confirm it is intentional.
md.SeleccionModelo(
    X_train,
    y_train,
    X_test,
    y_test,
    ModelSelected='Gradient Boosting',
    metric=MetricaEval,
    seed=seed,
    iteraciones=10,
)