<a href="https://colab.research.google.com/github/Baheback/RepositorioDelMal/blob/main/ProyectoFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
#  Análisis Exploratorio del Dataset de Credit Scoring

#  Importación de librerías
import pandas as pd
import plotly.express as px
import plotly.io as pio
import kagglehub
from pathlib import Path

# Plot styling: make the white Plotly template the default for every figure.
pio.templates.default = "plotly_white"

# Download the dataset from Kaggle; the returned value is used below as the
# local directory that holds the downloaded files.
dataset_path = kagglehub.dataset_download("cs49adityarajsharma/credit-scoring-data")

# Collect every .csv file under the downloaded folder. rglob() yields paths in
# no guaranteed order, so sort them to make the chosen file deterministic.
csv_files = sorted(Path(dataset_path).rglob("*.csv"))

# Fail fast when the download contains no CSV at all.
if not csv_files:
    raise FileNotFoundError("❌ No se encontró ningún archivo CSV en el dataset.")

# Load the first CSV (in sorted order) into the working DataFrame.
print(f"✅ Archivo encontrado: {csv_files[0]}")
df = pd.read_csv(csv_files[0])



# Initial exploration of the dataset. Each section prints a heading and then
# the corresponding view of `df`: sample rows, shape, dtypes / non-null counts,
# descriptive statistics and the list of column names.

# First five rows, rendered as a rich table by the notebook.
print("📌 Muestra del Dataset:")
display(df.head(5))

# Heading and (rows, columns) tuple, each on its own line.
print("\n📐 Dimensiones del Dataset (filas, columnas):", df.shape, sep="\n")

# info() writes its report directly to stdout.
print("\n🔎 Tipos de variables y valores nulos:")
df.info()

# Summary statistics of the numeric columns.
print("\n📊 Estadísticas descriptivas:")
display(df.describe())

# Column names as a plain Python list.
print("\n📋 Nombres de las columnas:", list(df.columns), sep="\n")

# Remove exact duplicate rows before encoding.
df = df.drop_duplicates()

# Numeric encodings for the categorical columns. The chosen numbers impose an
# ordering/weight on the categories --> first possible source of bias.
nivel_educacion = {'High School': 1, 'Bachelor': 2, 'Master': 3, 'PhD': 4}
posicion_laboral = {'Unemployed': 0, 'Employed': 1, 'Self-Employed': 2}
posicion_social = {'Single': 0, 'Married': 1}
genero = {'Male': 1, 'Female': 0}

# Column name -> encoding table applied to that column.
codificaciones = {
    'Education Level': nivel_educacion,
    'Employment Status': posicion_laboral,
    'Marital Status': posicion_social,
    'Gender': genero,
}

# Replace each categorical column with its numeric encoding.
for columna, codificacion in codificaciones.items():
    # Series.map() turns every value that is missing from the mapping into NaN
    # without raising, so report such categories instead of losing them silently.
    sin_codificar = set(df[columna].dropna().unique()) - set(codificacion)
    if sin_codificar:
        print(
            f"⚠️ Valores sin codificar en '{columna}' (se convertirán en NaN): "
            f"{sorted(map(str, sin_codificar))}"
        )
    df[columna] = df[columna].map(codificacion)

# Score every loan with a weighted sum of its fields (the author describes the
# weights as following the FICO recommendation) and store the result in a new
# 'Credit Score' column. The formula is evaluated on whole columns at once
# rather than row by row.
credit_scores = (
    (df['Payment History'] * 0.35) +
    (df['Credit Utilization Ratio'] * 0.30) +
    (df['Number of Credit Accounts'] * 0.15) +
    (df['Employment Status'] * 0.10) -
    (df['Loan Amount'] * 0.00010)
)

# BIAS CORRECTION
# Per-gender multipliers applied to the raw score. Both are currently 1, so
# the scores are left unchanged; tune them here if a correction is needed.
factor_mujer = 1    # Female
factor_hombre = 1   # Male

# 'Gender' is numerically encoded at this point, so the comparison must use
# the encoded value of 'Female' rather than the raw string. Any row that is
# not 'Female' (including missing values) receives the male factor.
es_mujer = df['Gender'] == genero['Female']
factor_de_sesgo = es_mujer.map({True: factor_mujer, False: factor_hombre})
credit_scores = credit_scores * factor_de_sesgo

# Add the credit scores as a new column to the DataFrame.
df['Credit Score'] = credit_scores

print("Tabla ponderada")
print(df.head())

# Group the customers into four clusters with KMeans, using only the Credit
# Score, and label each cluster with a risk level in the 'Riesgo' column.
from sklearn.cluster import KMeans

Puntuacion = df[['Credit Score']]
kmeans = KMeans(n_clusters=4, n_init=10, random_state=22)
kmeans.fit(Puntuacion)

# KMeans numbers its clusters arbitrarily, so a fixed "cluster id -> name"
# table can attach the wrong risk level to a cluster. Rank the clusters by
# their centroid instead: the lowest-score cluster is the riskiest.
# NOTE(review): this assumes a higher Credit Score means lower risk - confirm.
niveles_de_riesgo = ['Critico', 'Elevado', 'Moderado', 'Bajo']
clusters_por_puntuacion = kmeans.cluster_centers_.ravel().argsort()
riesgo_por_cluster = {
    int(cluster): nivel
    for cluster, nivel in zip(clusters_por_puntuacion, niveles_de_riesgo)
}

# Store the risk level as a categorical column, one label per row in the same
# order as kmeans.labels_.
df['Riesgo'] = pd.Categorical(
    [riesgo_por_cluster[int(cluster)] for cluster in kmeans.labels_],
    categories=niveles_de_riesgo,
)

# Scatter plot of every customer's Credit Score, coloured by risk level.
# Colours are bound to the risk labels explicitly: a plain colour sequence is
# handed out in order of appearance in the data, which would give the risk
# levels arbitrary colours.
colores_por_riesgo = {
    'Bajo': 'green',
    'Moderado': 'yellow',
    'Elevado': 'orange',
    'Critico': 'red',
}

fig = px.scatter(
    df,
    x=df.index,
    y='Credit Score',
    color='Riesgo',
    color_discrete_map=colores_por_riesgo,
    # Keep the legend ordered from lowest to highest risk.
    category_orders={'Riesgo': list(colores_por_riesgo)},
)
fig.update_layout(
    xaxis_title='Customer Index',
    yaxis_title='Credit Score',
    title='Agrupacion de clientes basada en su puntuacion de credito'
)
fig.show()

✅ Archivo encontrado: /kaggle/input/credit-scoring-data/credit_scoring.csv
📌 Muestra del Dataset:


Unnamed: 0,Age,Gender,Marital Status,Education Level,Employment Status,Credit Utilization Ratio,Payment History,Number of Credit Accounts,Loan Amount,Interest Rate,Loan Term,Type of Loan
0,60,Male,Married,Master,Employed,0.22,2685.0,2,4675000,2.65,48,Personal Loan
1,25,Male,Married,High School,Unemployed,0.2,2371.0,9,3619000,5.19,60,Auto Loan
2,30,Female,Single,Master,Employed,0.22,2771.0,6,957000,2.76,12,Auto Loan
3,58,Female,Married,PhD,Unemployed,0.12,1371.0,2,4731000,6.57,60,Auto Loan
4,32,Male,Married,Bachelor,Self-Employed,0.99,828.0,2,3289000,6.28,36,Personal Loan



📐 Dimensiones del Dataset (filas, columnas):
(1000, 12)

🔎 Tipos de variables y valores nulos:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        1000 non-null   int64  
 1   Gender                     1000 non-null   object 
 2   Marital Status             1000 non-null   object 
 3   Education Level            1000 non-null   object 
 4   Employment Status          1000 non-null   object 
 5   Credit Utilization Ratio   1000 non-null   float64
 6   Payment History            1000 non-null   float64
 7   Number of Credit Accounts  1000 non-null   int64  
 8   Loan Amount                1000 non-null   int64  
 9   Interest Rate              1000 non-null   float64
 10  Loan Term                  1000 non-null   int64  
 11  Type of Loan               1000 non-null   object 
dtypes: float64(3), int64(4), object(5)

Unnamed: 0,Age,Credit Utilization Ratio,Payment History,Number of Credit Accounts,Loan Amount,Interest Rate,Loan Term
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,42.702,0.50995,1452.814,5.58,2471401.0,10.6866,37.128
std,13.266771,0.291057,827.934146,2.933634,1387047.0,5.479058,17.436274
min,20.0,0.0,0.0,1.0,108000.0,1.01,12.0
25%,31.0,0.25,763.75,3.0,1298000.0,6.0225,24.0
50%,42.0,0.53,1428.0,6.0,2437500.0,10.705,36.0
75%,54.0,0.75,2142.0,8.0,3653250.0,15.44,48.0
max,65.0,1.0,2857.0,10.0,4996000.0,19.99,60.0



📋 Nombres de las columnas:
['Age', 'Gender', 'Marital Status', 'Education Level', 'Employment Status', 'Credit Utilization Ratio', 'Payment History', 'Number of Credit Accounts', 'Loan Amount', 'Interest Rate', 'Loan Term', 'Type of Loan']
Tabla ponderada
   Age  Gender  Marital Status  Education Level  Employment Status  \
0   60       1             1.0                3                  1   
1   25       1             1.0                1                  0   
2   30       0             0.0                3                  1   
3   58       0             1.0                4                  0   
4   32       1             1.0                2                  2   

   Credit Utilization Ratio  Payment History  Number of Credit Accounts  \
0                      0.22           2685.0                          2   
1                      0.20           2371.0                          9   
2                      0.22           2771.0                          6   
3                     