<a href="https://colab.research.google.com/github/CristValen/Acciones-RNR/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Rename the target column to "label" (the labelCol used by the classifier and
# evaluators below) and cast it to double.
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast("double"))

# One seed shared by every randomized step so reruns are comparable.
seed = 12345

# 70/30 train/test split.
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is fed to the model.
# NOTE(review): assumes all remaining columns are types VectorAssembler
# accepts (numeric/boolean/vector) — confirm against df_2's schema.
features = [column for column in df8.columns if column != 'label']

assembler = VectorAssembler(inputCols=features, outputCol="features")

# Random forest classifier reading the assembled feature vector.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=seed)

# Pipeline: assemble features, then fit the forest.
pipeline = Pipeline(stages=[assembler, rf])

# Hyper-parameter grid searched by cross-validation (number of trees only).
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 20]).build()

# Candidate models are ranked by area under the ROC curve.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")

# Seed the cross-validator too: without it the fold assignment is not tied to
# `seed`, so the selected model could change between runs even though the
# split and the forest are seeded.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    seed=seed,
)

model = cv.fit(train)

# Score the training set with the fitted cross-validation model.
predictions_train = model.transform(train)

# Area under the ROC curve on the training predictions.
evaluator_train_roc_auc = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
roc_auc_train = evaluator_train_roc_auc.evaluate(predictions_train)

# Accuracy on the training predictions.
evaluator_train_accuracy = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy_train = evaluator_train_accuracy.evaluate(predictions_train)

print(f"Entrenamiento ROC-AUC: {roc_auc_train:.3f}")
print(f"Entrenamiento Precisión: {accuracy_train:.3f}")

# Confusion matrix for the training data, built from (prediction, label) rows.
predictionAndLabels_train = predictions_train.select("prediction", "label").rdd
metrics_train = MulticlassMetrics(predictionAndLabels_train)
confusion_matrix_train = metrics_train.confusionMatrix().toArray()
print(f"Matriz de confusión de entrenamiento:\n{confusion_matrix_train}")

# Manual precision, recall and F1 for class 1. The indexing below treats rows
# as actual labels and columns as predicted labels.
TP_train, FP_train, FN_train = (
    confusion_matrix_train[1, 1],
    confusion_matrix_train[0, 1],
    confusion_matrix_train[1, 0],
)
precision_manual_train = TP_train / (TP_train + FP_train)
recall_manual_train = TP_train / (TP_train + FN_train)
f1_manual_train = (
    2 * precision_manual_train * recall_manual_train
    / (precision_manual_train + recall_manual_train)
)
print(f"Recuperación de entrenamiento (calculada manualmente): {recall_manual_train:.3f}")
print(f"Puntaje F1 de entrenamiento (calculado manualmente): {f1_manual_train:.3f}")

def calc_ks(data):
    """Return the Kolmogorov-Smirnov (KS) statistic of scored predictions.

    Rows are bucketed by the percentile rank of their score
    (``int(rank_pct * 10)``, so bucket ids run from 0 to 10). KS is the
    largest absolute gap between the cumulative share of bads
    (``label == 1``) and the cumulative share of goods (``label == 0``),
    accumulated over buckets in ascending score order.

    Args:
        data: Spark DataFrame with a ``label`` column and a ``probability``
            column. ``probability`` may be an ML vector column, in which
            case element 1 is used as the score (presumably the class-1
            probability — confirm against the model), or a plain numeric
            column, which is used directly.

    Returns:
        The KS value. It is NaN when ``data`` contains no bads or no goods,
        because the cumulative shares are then undefined.
    """
    from pyspark.ml.linalg import VectorUDT

    score_is_vector = isinstance(data.schema["probability"].dataType, VectorUDT)

    # Only these two columns are needed. Leaving the other columns (feature
    # and raw-prediction vectors) on the Spark side keeps the pandas frame
    # small and keeps non-orderable vector columns out of the aggregations.
    data_pd = data.select("label", "probability").toPandas()

    if score_is_vector:
        # Reduce each vector to a scalar score *before* grouping, so that
        # min/max operate on numbers rather than on vector objects.
        data_pd['probability'] = data_pd['probability'].apply(lambda vec: float(vec[1]))

    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    data_pd['bucket'] = (data_pd['probability'].rank(pct=True) * 10).astype(int)

    # One row per bucket, sorted by bucket id (groupby sorts its keys).
    kstable = (
        data_pd.groupby('bucket')
        .agg(
            min_probability=('probability', 'min'),
            max_probability=('probability', 'max'),
            bads=('bad', 'sum'),
            goods=('good', 'sum'),
        )
        .reset_index()
    )
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    kstable['ks'] = (kstable.bads / kstable.bads.sum()).cumsum() - \
                    (kstable.goods / kstable.goods.sum()).cumsum()
    ks_value = kstable.ks.abs().max()
    return ks_value

# KS statistic on the training predictions.
ks_value_train = calc_ks(predictions_train)
print(f"Entrenamiento KS: {ks_value_train:.3f}")

# Feature importances come from the last stage of the best pipeline (the
# fitted random forest); pair each with its column name and list them from
# most to least important.
best_rf_model = model.bestModel.stages[-1]
importances = best_rf_model.featureImportances
important_features = list(zip(importances, features))
important_features.sort(reverse=True)
print("Características más importantes:")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")



