<a href="https://colab.research.google.com/github/CristValen/Acciones-RNR/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Rename the target column to 'label' (the name every ML stage below refers to)
# and cast it to double so the classifier and evaluators get a numeric label.
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))

# One seed shared by every randomised step: the split, the forest and the CV folds
seed = 12345

# Split the data into training (70%) and test (30%) sets
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a model input.
# NOTE(review): assumes all remaining columns are types VectorAssembler can
# combine (numeric / boolean / vector) — confirm against df_2's schema.
features = [column for column in df8.columns if column != 'label']

assembler = VectorAssembler(inputCols=features, outputCol="features")

# Create the Random Forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=seed)

# Assemble the features and fit the forest as a single pipeline, so that
# cross-validation refits both stages on each fold
pipeline = Pipeline(stages=[assembler, rf])

# Parameter grid searched by cross-validation: only the number of trees varies
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 20]).build()

# Candidate models are ranked by area under the ROC curve
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")

# Pass the seed to the cross-validator as well: without it the fold assignment
# is random on every run, so the selected model (and every metric derived from
# it) is not reproducible even though the split and the forest are seeded.
# The number of folds is left at the library default.
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, seed=seed)

model = cv.fit(train)

# Score the training set with the best model found by cross-validation
predictions_train = model.transform(train)

evaluator_train_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
roc_auc_train = evaluator_train_roc_auc.evaluate(predictions_train)
evaluator_train_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_train = evaluator_train_accuracy.evaluate(predictions_train)

print(f"Training ROC-AUC: {roc_auc_train:.3f}")
print(f"Training Accuracy: {accuracy_train:.3f}")

# Calculate the confusion matrix for training data
predictionAndLabels_train = predictions_train.select("prediction", "label").rdd
metrics_train = MulticlassMetrics(predictionAndLabels_train)
confusion_matrix_train = metrics_train.confusionMatrix().toArray()
print(f"Training Confusion matrix:\n{confusion_matrix_train}")

# Manually calculate recall and F1 score for training data.
# The matrix is indexed [actual, predicted] with classes in ascending order.
# NOTE(review): assumes both classes 0.0 and 1.0 occur, so the matrix is 2x2.
TP_train = confusion_matrix_train[1, 1]
FP_train = confusion_matrix_train[0, 1]
FN_train = confusion_matrix_train[1, 0]

# Guard each ratio: when the model never predicts the positive class (or there
# are no positives) the denominator is zero; report 0.0 rather than NaN.
predicted_positive_train = TP_train + FP_train
actual_positive_train = TP_train + FN_train
precision_manual_train = TP_train / predicted_positive_train if predicted_positive_train else 0.0
recall_manual_train = TP_train / actual_positive_train if actual_positive_train else 0.0
precision_plus_recall_train = precision_manual_train + recall_manual_train
f1_manual_train = (
    2 * (precision_manual_train * recall_manual_train) / precision_plus_recall_train
    if precision_plus_recall_train
    else 0.0
)
print(f"Training Recall (manually calculated): {recall_manual_train:.3f}")
print(f"Training F1 (manually calculated): {f1_manual_train:.3f}")

def calc_ks(data):
    """Return the Kolmogorov-Smirnov statistic of 'score' against 'label'.

    Rows are bucketed by the percentile rank of their score; the KS value is
    the largest absolute gap between the cumulative share of bads (label == 1)
    and the cumulative share of goods (label == 0) across the buckets, taken
    in ascending score order.

    Args:
        data: Spark DataFrame with a 'label' column (0 = good, 1 = bad) and a
            numeric 'score' column. Any other columns are ignored.

    Returns:
        The KS statistic as a float, or NaN when the data does not contain
        both goods and bads (the statistic is undefined in that case).
    """
    # Collect only the two columns that are used. Vector columns such as
    # 'features' or 'probability' are costly to bring to the driver and are
    # not meaningful to aggregate with min/max/sum.
    data_pd = data.select('label', 'score').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    # rank(pct=True) lies in (0, 1], so buckets run from 0 to 10; bucket 10
    # holds only the row(s) with the highest score.
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)

    # Aggregate explicit columns so nothing else is ever reduced
    kstable = data_pd.groupby('bucket').agg(
        min_score=('score', 'min'),
        max_score=('score', 'max'),
        bads=('bad', 'sum'),
        goods=('good', 'sum'),
    ).reset_index()

    total_bads = kstable.bads.sum()
    total_goods = kstable.goods.sum()
    if total_bads == 0 or total_goods == 0:
        # With a single class, one cumulative share is 0/0 everywhere
        return float('nan')

    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    kstable['ks'] = (kstable.bads / total_bads).cumsum() - \
                    (kstable.goods / total_goods).cumsum()
    ks_value = kstable.ks.abs().max()
    return ks_value

# KS statistic on the training predictions.
# calc_ks reads a plain double 'score' column, so element 1 of the
# 'probability' vector is extracted first.
# NOTE(review): presumably index 1 is the probability of label 1.0 — confirm.
def _second_probability(vector):
    return float(vector[1])


score_udf = udf(_second_probability, DoubleType())
predictions_train = predictions_train.withColumn('score', score_udf('probability'))
ks_value_train = calc_ks(predictions_train)
print(f"Training KS: {ks_value_train:.3f}")

# Feature importances are read from the last pipeline stage of the best
# cross-validated model and listed from highest to lowest.
best_forest = model.bestModel.stages[-1]
importances = best_forest.featureImportances
important_features = sorted(zip(importances, features), reverse=True)
print("Most important features:")
for ranked_pair in important_features:
    importance, feature = ranked_pair
    print(f"{feature}: {importance:.3f}")
