# Customer Personality Analysis 


## 1. Importar Librerías

In [None]:
# Kaggle download helper plus the data-analysis / plotting stack used below
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silences every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Plotting configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic: draw matplotlib figures inline in the notebook output
%matplotlib inline

## 2. Descargar y Cargar Dataset

In [None]:
# Download the dataset from Kaggle and report where the files were placed
dataset_handle = "imakash3011/customer-personality-analysis"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

In [None]:
# Load the dataset
import os

# Show what the download directory contains
files = os.listdir(path)
print("Archivos disponibles:", files)

# Read the tab-separated CSV into a DataFrame
csv_path = os.path.join(path, 'marketing_campaign.csv')
df = pd.read_csv(csv_path, sep='\t')
n_rows, n_cols = df.shape
print(f"\nDataset cargado: {n_rows} filas y {n_cols} columnas")

## 3. Exploración Inicial de Datos

In [None]:
# Preview the first rows of the raw data (pandas default: 5 rows)
df.head()

In [None]:
# General overview: prints the column names, non-null counts and dtypes
df.info()

In [None]:
# Descriptive statistics (pandas summarizes the numeric columns by default)
df.describe()

In [None]:
# Missing values: count and percentage per column, listing only affected columns
print("Valores nulos por columna:")
null_counts = df.isnull().sum()
null_percentage = null_counts.div(len(df)).mul(100)
null_df = pd.DataFrame({'Nulos': null_counts, 'Porcentaje': null_percentage})
has_nulls = null_df['Nulos'].gt(0)
print(null_df.loc[has_nulls])

In [None]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted)
duplicates = df.duplicated(keep='first').sum()
print(f"Filas duplicadas: {duplicates}")

## 4. Limpieza de Datos

In [None]:
# Preprocess on an independent copy so the raw frame `df` stays untouched
df_clean = df.copy(deep=True)

print(f"Dataset original: {df_clean.shape}")

In [None]:
# Handle missing Income values by imputing the column median
if 'Income' in df_clean.columns:
    median_income = df_clean['Income'].median()
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is deprecated and, with
    # pandas Copy-on-Write, leaves df_clean unchanged.
    df_clean['Income'] = df_clean['Income'].fillna(median_income)
    print(f"Valores nulos en Income imputados con la mediana: {median_income}")

In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each
df_clean = df_clean.drop_duplicates()
print(f"Dataset después de eliminar duplicados: {df_clean.shape}")

## 5. Ingeniería de Características

In [None]:
# Parse Dt_Customer (day-month-year) and derive customer tenure
if 'Dt_Customer' in df_clean.columns:
    enrollment = pd.to_datetime(df_clean['Dt_Customer'], format='%d-%m-%Y')
    df_clean['Dt_Customer'] = enrollment

    # Tenure is measured against the most recent enrollment date in the data
    reference_date = enrollment.max()
    days_as_customer = (reference_date - enrollment).dt.days
    df_clean['Customer_Days'] = days_as_customer
    df_clean['Customer_Years'] = days_as_customer / 365.25
    print("Antigüedad del cliente calculada")

In [None]:
# Compute age from the year of birth
if 'Year_Birth' in df_clean.columns:
    # Reference year: prefer the latest enrollment year found in the data so
    # that ages (and the 18-100 filter below) do not change with the date the
    # notebook is run. Fall back to the current calendar year when Dt_Customer
    # is missing or has not been parsed to datetime.
    current_year = datetime.now().year
    if ('Dt_Customer' in df_clean.columns
            and pd.api.types.is_datetime64_any_dtype(df_clean['Dt_Customer'])):
        latest_enrollment = df_clean['Dt_Customer'].max()
        if pd.notna(latest_enrollment):
            current_year = int(latest_enrollment.year)
    df_clean['Age'] = current_year - df_clean['Year_Birth']

    # Drop age outliers (younger than 18 or older than 100); .copy() makes the
    # result an independent frame so later column assignments do not target a slice
    df_clean = df_clean[df_clean['Age'].between(18, 100)].copy()
    print(f"Edad calculada. Dataset después de filtrar outliers: {df_clean.shape}")

In [None]:
# Total number of children at home (kids + teens) and a 0/1 flag for having any
if {'Kidhome', 'Teenhome'}.issubset(df_clean.columns):
    total_children = df_clean['Kidhome'] + df_clean['Teenhome']
    df_clean['Total_Children'] = total_children
    df_clean['Has_Children'] = total_children.gt(0).astype(int)
    print("Total de hijos calculado")

In [None]:
# Total spending: row-wise sum over all product-category amounts
spending_columns = [
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
]

# Only compute the feature when every source column is present
if set(spending_columns).issubset(df_clean.columns):
    df_clean['Total_Spending'] = df_clean.loc[:, spending_columns].sum(axis='columns')
    print("Gasto total calculado")

In [None]:
# Total purchases: row-wise sum over the purchase-count columns
purchase_columns = [
    'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumDealsPurchases',
]

# Skip the feature unless all source columns exist
missing_purchase_columns = [c for c in purchase_columns if c not in df_clean.columns]
if not missing_purchase_columns:
    purchase_counts = df_clean[purchase_columns]
    df_clean['Total_Purchases'] = purchase_counts.sum(axis=1)
    print("Total de compras calculado")

In [None]:
# Total number of accepted campaigns (sum of the acceptance columns and Response)
campaign_columns = [
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5', 'Response',
]

# Proceed only if none of the campaign columns is missing
if pd.Index(campaign_columns).isin(df_clean.columns).all():
    accepted_flags = df_clean[campaign_columns]
    df_clean['Total_Campaigns_Accepted'] = accepted_flags.sum(axis=1)
    print("Total de campañas aceptadas calculado")

In [None]:
# Average spending per purchase
if {'Total_Spending', 'Total_Purchases'}.issubset(df_clean.columns):
    # The denominator is purchases + 1 (presumably to avoid dividing by zero
    # for customers with no purchases), so this is not a plain mean
    purchases_plus_one = df_clean['Total_Purchases'] + 1
    df_clean['Avg_Spending_Per_Purchase'] = df_clean['Total_Spending'] / purchases_plus_one
    print("Gasto promedio por compra calculado")

## 6. Transformación de Variables Categóricas

In [None]:
# Inspect the object-dtype columns and the distinct values each one holds
categorical_columns = df_clean.select_dtypes(include=['object']).columns
print("Columnas categóricas:")
for name in categorical_columns:
    distinct_values = df_clean[name].unique()
    print(f"\n{name}: {distinct_values}")

In [None]:
# Collapse Education into three broader levels
if 'Education' in df_clean.columns:
    # Target level -> raw Education values that belong to it
    education_groups = {
        'Graduate': ('Graduation',),
        'Postgraduate': ('PhD', 'Master'),
        'Undergraduate': ('Basic', '2n Cycle'),
    }
    education_mapping = {}
    for level, raw_values in education_groups.items():
        education_mapping.update(dict.fromkeys(raw_values, level))

    # Values absent from the mapping become NaN
    df_clean['Education_Level'] = df_clean['Education'].map(education_mapping)
    print("\nEducation simplificado:")
    level_counts = df_clean['Education_Level'].value_counts()
    print(level_counts)

In [None]:
# Collapse Marital_Status into Partner / Single
if 'Marital_Status' in df_clean.columns:
    # Target status -> raw Marital_Status values that belong to it
    marital_groups = {
        'Partner': ('Married', 'Together'),
        'Single': ('Single', 'Divorced', 'Widow', 'Alone', 'Absurd', 'YOLO'),
    }
    marital_mapping = {}
    for status, raw_values in marital_groups.items():
        marital_mapping.update(dict.fromkeys(raw_values, status))

    # Values absent from the mapping become NaN
    df_clean['Relationship_Status'] = df_clean['Marital_Status'].map(marital_mapping)
    print("\nMarital_Status simplificado:")
    status_counts = df_clean['Relationship_Status'].value_counts()
    print(status_counts)

In [None]:
# Build an encoded copy with dummy (one-hot) columns for the simplified categoricals
df_encoded = df_clean.copy()

if 'Education_Level' in df_encoded.columns:
    education_dummies = df_encoded['Education_Level'].pipe(pd.get_dummies, prefix='Education')
    # Append the new indicator columns to the right of the existing ones
    df_encoded = pd.concat((df_encoded, education_dummies), axis='columns')

if 'Relationship_Status' in df_encoded.columns:
    relationship_dummies = df_encoded['Relationship_Status'].pipe(pd.get_dummies, prefix='Relationship')
    df_encoded = pd.concat((df_encoded, relationship_dummies), axis='columns')

print(f"\nDataset con variables dummy: {df_encoded.shape}")

## 7. Detección y Tratamiento de Outliers

In [None]:
# Visualize Income outliers: boxplot on the left, histogram on the right
if 'Income' in df_clean.columns:
    income = df_clean['Income']
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
    box_ax, hist_ax = axes

    # Boxplot
    box_ax.boxplot(income)
    box_ax.set(title='Boxplot de Income', ylabel='Income')

    # Histogram
    hist_ax.hist(income, bins=50, edgecolor='black')
    hist_ax.set(title='Distribución de Income', xlabel='Income', ylabel='Frecuencia')

    plt.tight_layout()
    plt.show()

In [None]:
# Remove Income outliers using the 1.5 * IQR rule
if 'Income' in df_clean.columns:
    Q1 = df_clean['Income'].quantile(0.25)
    Q3 = df_clean['Income'].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print(f"Income - Límite inferior: {lower_bound}, Límite superior: {upper_bound}")

    before = len(df_clean)
    # Keep rows inside [lower_bound, upper_bound]; .copy() yields an
    # independent frame rather than a slice of the previous one
    df_clean = df_clean[df_clean['Income'].between(lower_bound, upper_bound)].copy()
    after = len(df_clean)

    # df_encoded was derived from df_clean before this filter ran. Drop the
    # same rows there so the encoded dataset matches the clean, standardized
    # and normalized ones in the summary and in the saved files.
    df_encoded = df_encoded.loc[df_encoded.index.isin(df_clean.index)].copy()

    print(f"Outliers eliminados: {before - after}")
    print(f"Dataset final: {df_clean.shape}")

## 8. Normalización y Escalado

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Candidate columns for scaling: every numeric column of the clean frame
numeric_frame = df_clean.select_dtypes(include=[np.number])
numeric_columns = numeric_frame.columns.tolist()

# Columns left out of scaling
columns_to_exclude = ['ID', 'Year_Birth', 'Z_CostContact', 'Z_Revenue']
columns_to_scale = list(
    filter(lambda name: name not in columns_to_exclude, numeric_columns)
)

print(f"Columnas a escalar: {len(columns_to_scale)}")

In [None]:
# Standardized dataset: a copy of df_clean with the selected columns
# replaced by the StandardScaler output
scaler_standard = StandardScaler()
standardized_values = scaler_standard.fit_transform(df_clean[columns_to_scale])

df_standardized = df_clean.copy()
df_standardized[columns_to_scale] = standardized_values
print("StandardScaler aplicado")

In [None]:
# Normalized dataset: a copy of df_clean with the selected columns
# replaced by the MinMaxScaler output
scaler_minmax = MinMaxScaler()
features_to_normalize = df_clean[columns_to_scale]

df_normalized = df_clean.copy()
df_normalized[columns_to_scale] = scaler_minmax.fit_transform(features_to_normalize)
print("MinMaxScaler aplicado")

## 9. Resumen Final

In [None]:
# Preprocessing summary: shapes before/after, rows removed, columns added, nulls left
banner = "=" * 50
rows_removed = df.shape[0] - df_clean.shape[0]
columns_added = df_clean.shape[1] - df.shape[1]
remaining_nulls = df_clean.isnull().sum().sum()

print(banner)
print("RESUMEN DEL PREPROCESAMIENTO")
print(banner)
print(f"\nDataset original: {df.shape}")
print(f"Dataset limpio: {df_clean.shape}")
print(f"Dataset con encoding: {df_encoded.shape}")
print(f"\nFilas eliminadas: {rows_removed}")
print(f"Nuevas columnas creadas: {columns_added}")
print(f"\nValores nulos restantes: {remaining_nulls}")

In [None]:
# Correlation heatmap of the numeric features selected for scaling
plt.figure(figsize=(20, 16))
scaled_features = df_clean[columns_to_scale]
correlation_matrix = scaled_features.corr()
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
)
plt.title('Matriz de Correlación de Variables Numéricas', fontsize=16)
plt.tight_layout()
plt.show()

## 10. Guardar Datos Procesados

In [None]:
# Save every processed dataset to CSV (without the index) and list what was written
output_files = {
    'customer_data_clean.csv': df_clean,
    'customer_data_encoded.csv': df_encoded,
    'customer_data_standardized.csv': df_standardized,
    'customer_data_normalized.csv': df_normalized,
}
for filename, frame in output_files.items():
    frame.to_csv(filename, index=False)

print("Datasets guardados exitosamente:")
for filename in output_files:
    print(f"- {filename}")