<a href="https://colab.research.google.com/github/CristValen/Acciones-RNR/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Use the target column as the model label, stored as a double
df8 = (
    df_2
    .withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn("label", col("label").cast(DoubleType()))
)

# Fixed seed shared by the split and the forest
seed = 12345

# 70/30 random split into training and test sets
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a model input
features = df8.columns
features.remove('label')

# Pipeline stages: pack the inputs into one vector column, then the forest
assembler = VectorAssembler(inputCols=features, outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=seed)
pipeline = Pipeline(stages=[assembler, rf])

# Forest sizes tried by the cross-validator
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .build()
)

# Candidates are compared by area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)

# Cross-validated fit on the training split, then score that same split
model = cv.fit(train)
predictions_train = model.transform(train)

# ROC-AUC and accuracy on the training predictions
evaluator_train_roc_auc = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
roc_auc_train = evaluator_train_roc_auc.evaluate(predictions_train)
evaluator_train_accuracy = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy_train = evaluator_train_accuracy.evaluate(predictions_train)

print(f"Training ROC-AUC: {roc_auc_train:.3f}")
print(f"Training Accuracy: {accuracy_train:.3f}")

# Confusion matrix built from the (prediction, label) pairs
predictionAndLabels_train = predictions_train.select("prediction", "label").rdd
metrics_train = MulticlassMetrics(predictionAndLabels_train)
confusion_matrix_train = metrics_train.confusionMatrix().toArray()
print(f"Training Confusion matrix:\n{confusion_matrix_train}")

# Precision, recall and F1 derived by hand from the matrix cells
TP_train, FP_train, FN_train = (
    confusion_matrix_train[1, 1],
    confusion_matrix_train[0, 1],
    confusion_matrix_train[1, 0],
)
precision_manual_train = TP_train / (TP_train + FP_train)
recall_manual_train = TP_train / (TP_train + FN_train)
f1_manual_train = 2 * (precision_manual_train * recall_manual_train) / (precision_manual_train + recall_manual_train)
print(f"Training Recall (manually calculated): {recall_manual_train:.3f}")
print(f"Training F1 (manually calculated): {f1_manual_train:.3f}")

def calc_ks(data):
    """Return the Kolmogorov-Smirnov statistic of ``score`` against ``label``.

    Rows are bucketed by the percentile rank of ``score``; the statistic is
    the largest absolute gap between the cumulative share of bads
    (``label == 1``) and the cumulative share of goods (``label == 0``)
    across the buckets, taken in increasing bucket order.

    Args:
        data: DataFrame-like object exposing ``select(...)`` and
            ``toPandas()`` (e.g. a Spark DataFrame) with ``label`` and
            ``score`` columns.

    Returns:
        The KS value as a float (NaN when one of the two classes is absent,
        because its share is then 0/0).
    """
    # Only the two columns that are actually used are brought to the driver;
    # collecting (and then aggregating) vector columns such as `features` or
    # `probability` is wasteful and can make the group-wise min/max fail.
    data_pd = data.select('label', 'score').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    # Percentile rank scaled to 0..10 and truncated to an integer bucket id
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)

    # One row per bucket (sorted by bucket id), aggregating just what is needed
    kstable = (
        data_pd.groupby('bucket')
        .agg(
            min_score=('score', 'min'),
            max_score=('score', 'max'),
            bads=('bad', 'sum'),
            goods=('good', 'sum'),
        )
        .reset_index()
    )
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    kstable['ks'] = (
        (kstable.bads / kstable.bads.sum()).cumsum()
        - (kstable.goods / kstable.goods.sum()).cumsum()
    )
    ks_value = kstable.ks.abs().max()
    return ks_value

# UDF returning element 1 of the probability vector as a double
extract_probability = udf(lambda v: float(v[1]), DoubleType())

# Attach that value to the training predictions as the `score` column
predictions_train = predictions_train.withColumn(
    'score',
    extract_probability('probability'),
)

# KS statistic of the training scores
ks_value = calc_ks(predictions_train)
print(f"Training KS: {ks_value:.3f}")

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.sql.functions import col, udf, when, percent_rank
from pyspark.sql.types import DoubleType

# Use the target column as the model label, stored as a double
df8 = (
    df_2
    .withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn("label", col("label").cast(DoubleType()))
)

# Fixed seed shared by the split and the forest
seed = 12345

# 70/30 random split into training and test sets
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a model input
features = df8.columns
features.remove('label')

# Pipeline stages: pack the inputs into one vector column, then the forest
assembler = VectorAssembler(inputCols=features, outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=seed)
pipeline = Pipeline(stages=[assembler, rf])

# Forest sizes tried by the cross-validator
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .build()
)

# Candidates are compared by area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)

# Cross-validated fit on the training split
model = cv.fit(train)

# Score both splits with the fitted model
predictions_train = model.transform(train)
predictions_test = model.transform(test)

# Create a UDF to extract the score from the probability column
def extract_score(vector):
    """Return the element at index 1 of ``vector`` as a Python float."""
    second_component = vector[1]
    return float(second_component)

extract_score_udf = udf(extract_score, DoubleType())

# Add the `score` column (element 1 of `probability`) to both prediction sets
predictions_train = predictions_train.withColumn(
    'score', extract_score_udf('probability')
)
predictions_test = predictions_test.withColumn(
    'score', extract_score_udf('probability')
)

# Training confusion matrix from the (prediction, label) pairs
predictionAndLabels_train = predictions_train.select("prediction", "label").rdd
metrics_train = MulticlassMetrics(predictionAndLabels_train)
confusion_matrix_train = metrics_train.confusionMatrix().toArray()
print(f"Training Confusion matrix:\n{confusion_matrix_train}")

# Test confusion matrix, built the same way
predictionAndLabels_test = predictions_test.select("prediction", "label").rdd
metrics_test = MulticlassMetrics(predictionAndLabels_test)
confusion_matrix_test = metrics_test.confusionMatrix().toArray()
print(f"Test Confusion matrix:\n{confusion_matrix_test}")

# Training precision / recall / F1 derived by hand from the matrix cells
TP_train, FP_train, FN_train = (
    confusion_matrix_train[1, 1],
    confusion_matrix_train[0, 1],
    confusion_matrix_train[1, 0],
)

precision_train = TP_train / (TP_train + FP_train)
recall_train = TP_train / (TP_train + FN_train)
f1_score_train = 2 * (precision_train * recall_train) / (precision_train + recall_train)

print(f"Training Precision: {precision_train}")
print(f"Training Recall: {recall_train}")
print(f"Training F1 Score: {f1_score_train}")

# Test precision / recall / F1, same derivation
TP_test, FP_test, FN_test = (
    confusion_matrix_test[1, 1],
    confusion_matrix_test[0, 1],
    confusion_matrix_test[1, 0],
)

precision_test = TP_test / (TP_test + FP_test)
recall_test = TP_test / (TP_test + FN_test)
f1_score_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)

print(f"Test Precision: {precision_test}")
print(f"Test Recall: {recall_test}")
print(f"Test F1 Score: {f1_score_test}")

def calc_ks(data):
    """Return the Kolmogorov-Smirnov statistic of ``score`` against ``label``.

    Rows are bucketed by ``percent_rank`` of ``score`` (scaled to 0..10 and
    truncated); the statistic is the largest absolute gap between the
    cumulative share of bads (``label == 1``) and the cumulative share of
    goods (``label == 0``) over the buckets in increasing order.

    Args:
        data: Spark DataFrame with ``label`` and ``score`` columns.

    Returns:
        The KS value collected to the driver (``None`` if Spark yields null,
        e.g. for an empty input).
    """
    # Imported here, under the `F` namespace, because the unqualified
    # min/max/sum/abs used previously resolve to the Python builtins (which
    # fail on Columns), and Window / IntegerType were never imported.
    from pyspark.sql import Window
    from pyspark.sql import functions as F
    from pyspark.sql.types import IntegerType

    # NOTE: windows without partitionBy pull all rows into a single partition.
    score_order = Window.orderBy('score')
    bucket_order = Window.orderBy('bucket')

    scored = (
        data.withColumn('good', F.when(F.col('label') == 0, 1).otherwise(0))
        .withColumn('bad', F.when(F.col('label') == 1, 1).otherwise(0))
        .withColumn('bucket', (F.percent_rank().over(score_order) * 10).cast(IntegerType()))
    )

    # One row per bucket with its score range and class counts
    kstable = scored.groupBy('bucket').agg(
        F.min(F.col('score')).alias('min_score'),
        F.max(F.col('score')).alias('max_score'),
        F.sum(F.col('bad')).alias('bads'),
        F.sum(F.col('good')).alias('goods'),
    )
    kstable = kstable.withColumn('bad_rate', F.col('bads') / (F.col('bads') + F.col('goods')))

    # Overall class totals, fetched in a single job
    totals = kstable.agg(
        F.sum(F.col('bads')).alias('total_bads'),
        F.sum(F.col('goods')).alias('total_goods'),
    ).collect()[0]

    # Gap between the cumulative bad share and the cumulative good share
    kstable = kstable.withColumn(
        'ks',
        (F.sum(F.col('bads')).over(bucket_order) / totals['total_bads'])
        - (F.sum(F.col('goods')).over(bucket_order) / totals['total_goods']),
    )
    ks_value = kstable.select(F.max(F.abs(F.col('ks')))).collect()[0][0]
    return ks_value

# KS statistic for each split, then report both
ks_value_train = calc_ks(predictions_train)
ks_value_test = calc_ks(predictions_test)

print(f"Training KS: {ks_value_train}")
print(f"Test KS: {ks_value_test}")



In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from pyspark.sql import Row
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Seed shared by the train/test split, SMOTE and the decision tree
random_state = 0

# Collect the Spark DataFrame to the driver as a pandas DataFrame
data_pd = df_2.toPandas()

# Every column except the target is a feature.  The loop variable is not
# called `col`, to avoid confusion with pyspark.sql.functions.col.
feature_cols = [name for name in data_pd.columns if name != 'Malo_Dias_tot']

# Feature matrix and label vector as NumPy arrays
X = data_pd[feature_cols].values
y = data_pd['Malo_Dias_tot'].values

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# SMOTE oversampler and decision tree, chained so that oversampling happens
# inside fit() on the training data only
smote = SMOTE(random_state=random_state)
dt = DecisionTreeClassifier(random_state=random_state)
pipeline = Pipeline([('smote', smote), ('dt', dt)])

# Fit the pipeline to the training data
model = pipeline.fit(X_train, y_train)

# FIX: the metrics below read X_train_resampled / y_train_resampled, which
# this cell never defined (NameError on a fresh kernel, or silently stale
# values from another cell).  Build them here from this cell's training data.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Oversampled test data, kept because the original cell produced it.
# NOTE(review): nothing below uses it, and scoring on synthetic test rows
# would not reflect real-world performance — confirm whether it is needed.
X_test_resampled, y_test_resampled = smote.fit_resample(X_test, y_test)

# Predict on the oversampled training set
y_pred = model.predict(X_train_resampled)

# Confusion matrix; `labels` pins the 2x2 shape so the unpacking below also
# works if one class happens to be missing (assumes 0/1 labels)
tn, fp, fn, tp = confusion_matrix(y_train_resampled, y_pred, labels=[0, 1]).ravel()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

accuracy = accuracy_score(y_train_resampled, y_pred)
precision = precision_score(y_train_resampled, y_pred)
recall = recall_score(y_train_resampled, y_pred)
# FIX: stored under its own name; assigning the result to `f1_score` replaced
# the imported sklearn function and broke every later call to it.
f1 = f1_score(y_train_resampled, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Cast the true labels and the predicted classes to int
y_train_resampled = y_train_resampled.astype(int)
y_pred = y_pred.astype(int)

# One Row per sample.  NOTE(review): `score` holds the hard 0/1 prediction,
# not a probability, so a KS computed from it only sees two score values.
rows = [Row(label=int(label), score=int(score)) for label, score in zip(y_train_resampled, y_pred)]

# Convert the list of Row objects to a PySpark DataFrame
predictions_train = spark.createDataFrame(rows)

def calc_ks(data):
    """Return the Kolmogorov-Smirnov statistic of ``score`` against ``label``.

    Rows are bucketed by ``percent_rank`` of ``score`` (scaled to 0..10 and
    truncated); the statistic is the largest absolute gap between the
    cumulative share of bads (``label == 1``) and the cumulative share of
    goods (``label == 0``) over the buckets in increasing order.

    Args:
        data: Spark DataFrame with ``label`` and ``score`` columns.

    Returns:
        The KS value collected to the driver (``None`` if Spark yields null,
        e.g. for an empty input).
    """
    # Imported here, under the `F` namespace: this cell imports none of the
    # Spark helpers the function needs, and the unqualified min/max/sum/abs
    # used previously resolve to the Python builtins, which fail on Columns.
    from pyspark.sql import Window
    from pyspark.sql import functions as F
    from pyspark.sql.types import IntegerType

    # NOTE: windows without partitionBy pull all rows into a single partition.
    score_order = Window.orderBy('score')
    bucket_order = Window.orderBy('bucket')

    scored = (
        data.withColumn('good', F.when(F.col('label') == 0, 1).otherwise(0))
        .withColumn('bad', F.when(F.col('label') == 1, 1).otherwise(0))
        .withColumn('bucket', (F.percent_rank().over(score_order) * 10).cast(IntegerType()))
    )

    # One row per bucket with its score range and class counts
    kstable = scored.groupBy('bucket').agg(
        F.min(F.col('score')).alias('min_score'),
        F.max(F.col('score')).alias('max_score'),
        F.sum(F.col('bad')).alias('bads'),
        F.sum(F.col('good')).alias('goods'),
    )
    kstable = kstable.withColumn('bad_rate', F.col('bads') / (F.col('bads') + F.col('goods')))

    # Overall class totals, fetched in a single job
    totals = kstable.agg(
        F.sum(F.col('bads')).alias('total_bads'),
        F.sum(F.col('goods')).alias('total_goods'),
    ).collect()[0]

    # Gap between the cumulative bad share and the cumulative good share
    kstable = kstable.withColumn(
        'ks',
        (F.sum(F.col('bads')).over(bucket_order) / totals['total_bads'])
        - (F.sum(F.col('goods')).over(bucket_order) / totals['total_goods']),
    )
    ks_value = kstable.select(F.max(F.abs(F.col('ks')))).collect()[0][0]
    return ks_value

# KS statistic of the predictions built above, then report it
ks_value_train = calc_ks(predictions_train)

print(f"Training KS: {ks_value_train}")




In [None]:
### Version using a helper function (metrics wrapped in calculate_metrics)

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Use the target column as the model label, stored as a double
df8 = (
    df_2
    .withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn("label", col("label").cast(DoubleType()))
)

# Fixed seed shared by the split and the forest
seed = 12345

# 70/30 random split into training and test sets
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a model input
features = df8.columns
features.remove('label')

# Pipeline stages: pack the inputs into one vector column, then the forest
assembler = VectorAssembler(inputCols=features, outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=seed)
pipeline = Pipeline(stages=[assembler, rf])

# Forest sizes tried by the cross-validator
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .build()
)

# Candidates are compared by area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
)

# Cross-validated fit on the training split
model = cv.fit(train)

def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of predictions.

    Args:
        predictions: Spark DataFrame containing the ``label``, ``prediction``
            and ``rawPrediction`` columns.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``
        where ``confusion_matrix`` is the array returned by
        ``MulticlassMetrics.confusionMatrix().toArray()``.

    Raises:
        IndexError: if the confusion matrix is smaller than 2x2, i.e. only
            one class occurs among the labels and predictions.
    """
    def _ratio(numerator, denominator):
        # 0.0 instead of NaN (plus a NumPy RuntimeWarning) when the ratio is
        # undefined, e.g. precision when nothing was predicted positive.
        return numerator / denominator if denominator else 0.0

    # ROC-AUC from the raw predictions, accuracy from the predicted class
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix from the (prediction, label) pairs
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Rows are actual classes and columns predicted classes, in ascending
    # label order, so index 1 is the positive class.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

# --- Training split: score it and compute its metrics ---
predictions_train = model.transform(train)
(
    train_roc_auc,
    train_accuracy,
    train_precision,
    train_recall,
    train_f1Score,
    train_confusion_matrix,
) = calculate_metrics(predictions_train)

print(f"Training ROC-AUC: {train_roc_auc:.3f}")
print(f"Training Accuracy: {train_accuracy:.3f}")
print(f"Training Precision: {train_precision:.3f}")
print(f"Training Recall: {train_recall:.3f}")
print(f"Training F1: {train_f1Score:.3f}")
print(f"Training Confusion matrix:\n{train_confusion_matrix}")

# --- Test split: same procedure ---
predictions_test = model.transform(test)
(
    test_roc_auc,
    test_accuracy,
    test_precision,
    test_recall,
    test_f1Score,
    test_confusion_matrix,
) = calculate_metrics(predictions_test)

print(f"Test ROC-AUC: {test_roc_auc:.3f}")
print(f"Test Accuracy: {test_accuracy:.3f}")
print(f"Test Precision: {test_precision:.3f}")
print(f"Test Recall: {test_recall:.3f}")
print(f"Test F1: {test_f1Score:.3f}")
print(f"Test Confusion matrix:\n{test_confusion_matrix}")

# --- df_oot: same procedure ---
predictions_oot = model.transform(df_oot)
(
    oot_roc_auc,
    oot_accuracy,
    oot_precision,
    oot_recall,
    oot_f1Score,
    oot_confusion_matrix,
) = calculate_metrics(predictions_oot)

print(f"df_oot ROC-AUC: {oot_roc_auc:.3f}")
print(f"df_oot Accuracy: {oot_accuracy:.3f}")
print(f"df_oot Precision: {oot_precision:.3f}")
print(f"df_oot Recall: {oot_recall:.3f}")
print(f"df_oot F1: {oot_f1Score:.3f}")
print(f"df_oot Confusion matrix:\n{oot_confusion_matrix}")


In [None]:

# Names this cell needs; rand, StandardScaler and LinearSVC were not imported
# by any earlier cell.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql.functions import col, rand

# Seed for the random number generators
random_state = 0

# Expose the target as a double column named "label": calculate_metrics
# evaluates against a column called "label", and the other cells of this
# notebook prepare the data the same way.
labeled = (
    df_2
    .withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn('label', col('label').cast('double'))
)

# Number of examples per class, looked up by label value.  groupBy() returns
# its rows in no guaranteed order, so positional indexing could swap classes.
class_counts = {row['label']: row['count'] for row in labeled.groupBy('label').count().collect()}
num_positives = class_counts.get(1.0, 0)
num_negatives = class_counts.get(0.0, 0)

# Keep as many negatives as there are positives (1:1 ratio).  The former
# expression int(num_positives / num_negatives * num_negatives) was the same
# value algebraically, but float rounding could truncate it to one less and
# it divided by zero when there were no negatives.
num_to_keep = num_positives

# Random subset of the majority (negative) class.
# NOTE: rand(seed) only repeats if partitioning and row order repeat too.
majority_subset = labeled.filter(col('label') == 0).orderBy(rand(seed=random_state)).limit(num_to_keep)

# Combine the majority subset with the whole minority class
undersampled_data = majority_subset.union(labeled.filter(col('label') == 1))

# 70/30 split into training and test sets
train, test = undersampled_data.randomSplit([0.7, 0.3], seed=random_state)

# Every column except the label is a feature
feature_cols = [name for name in train.columns if name != 'label']

# Pack the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the features (zero mean, unit standard deviation)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear support vector machine, trained without cross-validation
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='label', maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM, then fit on the training data
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(train)


def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of predictions.

    Args:
        predictions: Spark DataFrame containing the ``label``, ``prediction``
            and ``rawPrediction`` columns.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``
        where ``confusion_matrix`` is the array returned by
        ``MulticlassMetrics.confusionMatrix().toArray()``.

    Raises:
        IndexError: if the confusion matrix is smaller than 2x2, i.e. only
            one class occurs among the labels and predictions.
    """
    def _ratio(numerator, denominator):
        # 0.0 instead of NaN (plus a NumPy RuntimeWarning) when the ratio is
        # undefined, e.g. precision when nothing was predicted positive.
        return numerator / denominator if denominator else 0.0

    # ROC-AUC from the raw predictions, accuracy from the predicted class
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix from the (prediction, label) pairs
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Rows are actual classes and columns predicted classes, in ascending
    # label order, so index 1 is the positive class.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

# NOTE(review): calculate_metrics reads a column named "label"; every
# DataFrame scored below must provide it.

# --- Training split: score it and compute its metrics ---
predictions_train = model.transform(train)
(
    train_roc_auc,
    train_accuracy,
    train_precision,
    train_recall,
    train_f1Score,
    train_confusion_matrix,
) = calculate_metrics(predictions_train)

print(f"Training ROC-AUC: {train_roc_auc:.3f}")
print(f"Training Accuracy: {train_accuracy:.3f}")
print(f"Training Precision: {train_precision:.3f}")
print(f"Training Recall: {train_recall:.3f}")
print(f"Training F1: {train_f1Score:.3f}")
print(f"Training Confusion matrix:\n{train_confusion_matrix}")

# --- Test split: same procedure ---
predictions_test = model.transform(test)
(
    test_roc_auc,
    test_accuracy,
    test_precision,
    test_recall,
    test_f1Score,
    test_confusion_matrix,
) = calculate_metrics(predictions_test)

print(f"Test ROC-AUC: {test_roc_auc:.3f}")
print(f"Test Accuracy: {test_accuracy:.3f}")
print(f"Test Precision: {test_precision:.3f}")
print(f"Test Recall: {test_recall:.3f}")
print(f"Test F1: {test_f1Score:.3f}")
print(f"Test Confusion matrix:\n{test_confusion_matrix}")

# --- df_oot: same procedure ---
predictions_oot = model.transform(df_oot)
(
    oot_roc_auc,
    oot_accuracy,
    oot_precision,
    oot_recall,
    oot_f1Score,
    oot_confusion_matrix,
) = calculate_metrics(predictions_oot)

print(f"df_oot ROC-AUC: {oot_roc_auc:.3f}")
print(f"df_oot Accuracy: {oot_accuracy:.3f}")
print(f"df_oot Precision: {oot_precision:.3f}")
print(f"df_oot Recall: {oot_recall:.3f}")
print(f"df_oot F1: {oot_f1Score:.3f}")
print(f"df_oot Confusion matrix:\n{oot_confusion_matrix}")

# NOTE: the results of this cell are not reproducible from run to run

In [None]:
# Attempt at a reproducible version of the undersampling + SVM experiment.
#
# orderBy(rand(seed)).limit(n) and randomSplit(seed=...) give the same rows
# only when the input partitioning and row order are the same on every run.
# Here both the sample and the split are driven by a hash of each row's
# content, which does not depend on how the data is partitioned.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import functions as F

random_state = 0

# Expose the target as a double column named "label": calculate_metrics
# evaluates against a column called "label", and the other cells of this
# notebook prepare the data the same way.
labeled = (
    df_2
    .withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn('label', F.col('label').cast('double'))
)

# Number of examples per class, looked up by label value.  groupBy() returns
# its rows in no guaranteed order, so positional indexing could swap classes.
class_counts = {row['label']: row['count'] for row in labeled.groupBy('label').count().collect()}
num_positives = class_counts.get(1.0, 0)
num_negatives = class_counts.get(0.0, 0)

# Keep as many negatives as there are positives (1:1 ratio).  The former
# expression int(num_positives / num_negatives * num_negatives) was the same
# value algebraically, but float rounding could truncate it to one less and
# it divided by zero when there were no negatives.
num_to_keep = num_positives

# Pseudo-random but repeatable ordering: hash of (seed, row content), with
# the row's own columns as tie-breaker.
# NOTE(review): assumes every column is of an orderable type (e.g. numeric).
row_columns = [F.col(name) for name in labeled.columns]
sample_key = F.hash(F.lit(random_state), *row_columns)
majority_subset = (
    labeled.filter(F.col('label') == 0)
    .orderBy(sample_key, *row_columns)
    .limit(num_to_keep)
)

# Combine the majority subset with the whole minority class
undersampled_data = majority_subset.union(labeled.filter(F.col('label') == 1))

# Repeatable ~70/30 split: a second hash (different salt) mapped to 0..99.
# The double modulo keeps the bucket non-negative for negative hash values.
# Rows with identical content always land on the same side.
split_key = F.hash(F.lit(random_state + 1), *row_columns)
split_bucket = ((split_key % 100) + 100) % 100
train = undersampled_data.filter(split_bucket < 70)
test = undersampled_data.filter(split_bucket >= 70)

# Every column except the label is a feature
feature_cols = [name for name in train.columns if name != 'label']

# Pack the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the features (zero mean, unit standard deviation)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear support vector machine, trained without cross-validation
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='label', maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM, then fit on the training data
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(train)


def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of predictions.

    Args:
        predictions: Spark DataFrame containing the ``label``, ``prediction``
            and ``rawPrediction`` columns.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``
        where ``confusion_matrix`` is the array returned by
        ``MulticlassMetrics.confusionMatrix().toArray()``.

    Raises:
        IndexError: if the confusion matrix is smaller than 2x2, i.e. only
            one class occurs among the labels and predictions.
    """
    def _ratio(numerator, denominator):
        # 0.0 instead of NaN (plus a NumPy RuntimeWarning) when the ratio is
        # undefined, e.g. precision when nothing was predicted positive.
        return numerator / denominator if denominator else 0.0

    # ROC-AUC from the raw predictions, accuracy from the predicted class
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix from the (prediction, label) pairs
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Rows are actual classes and columns predicted classes, in ascending
    # label order, so index 1 is the positive class.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

# NOTE(review): calculate_metrics reads a column named "label"; every
# DataFrame scored below must provide it.

# --- Training split: score it and compute its metrics ---
predictions_train = model.transform(train)
(
    train_roc_auc,
    train_accuracy,
    train_precision,
    train_recall,
    train_f1Score,
    train_confusion_matrix,
) = calculate_metrics(predictions_train)

print(f"Training ROC-AUC: {train_roc_auc:.3f}")
print(f"Training Accuracy: {train_accuracy:.3f}")
print(f"Training Precision: {train_precision:.3f}")
print(f"Training Recall: {train_recall:.3f}")
print(f"Training F1: {train_f1Score:.3f}")
print(f"Training Confusion matrix:\n{train_confusion_matrix}")

# --- Test split: same procedure ---
predictions_test = model.transform(test)
(
    test_roc_auc,
    test_accuracy,
    test_precision,
    test_recall,
    test_f1Score,
    test_confusion_matrix,
) = calculate_metrics(predictions_test)

print(f"Test ROC-AUC: {test_roc_auc:.3f}")
print(f"Test Accuracy: {test_accuracy:.3f}")
print(f"Test Precision: {test_precision:.3f}")
print(f"Test Recall: {test_recall:.3f}")
print(f"Test F1: {test_f1Score:.3f}")
print(f"Test Confusion matrix:\n{test_confusion_matrix}")

# --- df_oot: same procedure ---
predictions_oot = model.transform(df_oot)
(
    oot_roc_auc,
    oot_accuracy,
    oot_precision,
    oot_recall,
    oot_f1Score,
    oot_confusion_matrix,
) = calculate_metrics(predictions_oot)

print(f"df_oot ROC-AUC: {oot_roc_auc:.3f}")
print(f"df_oot Accuracy: {oot_accuracy:.3f}")
print(f"df_oot Precision: {oot_precision:.3f}")
print(f"df_oot Recall: {oot_recall:.3f}")
print(f"df_oot F1: {oot_f1Score:.3f}")
print(f"df_oot Confusion matrix:\n{oot_confusion_matrix}")


In [None]:
from imblearn.over_sampling import SMOTE
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics

# Use the target column as the model label, stored as a double
df8 = (
    df_2
    .withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn("label", col("label").cast(DoubleType()))
)

# Fixed seed shared by the split, SMOTE and the decision tree
seed = 12345

# 70/30 random split into training and test sets
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a feature
feature_cols = [name for name in df8.columns if name != 'label']

# Add the assembled `features` vector column to both splits
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train = assembler.transform(train)
test = assembler.transform(test)

# Bring both splits to the driver as pandas DataFrames
train_pd = train.toPandas()
test_pd = test.toPandas()

# SMOTE oversampler
smote = SMOTE(random_state=seed)

# Oversample each split independently.
# NOTE(review): the test split is oversampled as well, so the test metrics
# are computed partly on synthetic rows — confirm this is intended.
X_train_resampled, y_train_resampled = smote.fit_resample(
    train_pd[feature_cols], train_pd['label']
)
X_test_resampled, y_test_resampled = smote.fit_resample(
    test_pd[feature_cols], test_pd['label']
)

# Back to Spark: features and label side by side, then re-assemble the vector
train_resampled_pd = pd.concat(
    [
        pd.DataFrame(X_train_resampled, columns=feature_cols),
        pd.Series(y_train_resampled, name='label'),
    ],
    axis=1,
)
train_resampled = assembler.transform(spark.createDataFrame(train_resampled_pd))

test_resampled_pd = pd.concat(
    [
        pd.DataFrame(X_test_resampled, columns=feature_cols),
        pd.Series(y_test_resampled, name='label'),
    ],
    axis=1,
)
test_resampled = assembler.transform(spark.createDataFrame(test_resampled_pd))

# Decision tree (no cross-validation), fitted on the oversampled training data
dt = DecisionTreeClassifier(seed=seed)
model = dt.fit(train_resampled)

def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of predictions.

    Args:
        predictions: Spark DataFrame containing the ``label``, ``prediction``
            and ``rawPrediction`` columns.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``
        where ``confusion_matrix`` is the array returned by
        ``MulticlassMetrics.confusionMatrix().toArray()``.

    Raises:
        IndexError: if the confusion matrix is smaller than 2x2, i.e. only
            one class occurs among the labels and predictions.
    """
    def _ratio(numerator, denominator):
        # 0.0 instead of NaN (plus a NumPy RuntimeWarning) when the ratio is
        # undefined, e.g. precision when nothing was predicted positive.
        return numerator / denominator if denominator else 0.0

    # ROC-AUC from the raw predictions, accuracy from the predicted class
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix from the (prediction, label) pairs
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Rows are actual classes and columns predicted classes, in ascending
    # label order, so index 1 is the positive class.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

# Score the resampled training data and compute its metrics
predictions_train = model.transform(train_resampled)
train_roc_auc, train_accuracy, train_precision, train_recall, train_f1Score, train_confusion_matrix = calculate_metrics(predictions_train)

# Print training metrics
print(f"Training ROC-AUC: {train_roc_auc:.3f}")
print(f"Training Accuracy: {train_accuracy:.3f}")
print(f"Training Precision: {train_precision:.3f}")
print(f"Training Recall: {train_recall:.3f}")
print(f"Training F1: {train_f1Score:.3f}")
print(f"Training Confusion matrix:\n{train_confusion_matrix}")

# Score the resampled test data and compute its metrics
predictions_test = model.transform(test_resampled)
test_roc_auc, test_accuracy, test_precision, test_recall, test_f1Score, test_confusion_matrix = calculate_metrics(predictions_test)

# Print test metrics
print(f"Test ROC-AUC: {test_roc_auc:.3f}")
print(f"Test Accuracy: {test_accuracy:.3f}")
print(f"Test Precision: {test_precision:.3f}")
print(f"Test Recall: {test_recall:.3f}")
print(f"Test F1: {test_f1Score:.3f}")
print(f"Test Confusion matrix:\n{test_confusion_matrix}")

# Assemble the feature vector for df_oot2 WITHOUT overwriting df_oot2.
# Reassigning the global made this cell fail when run a second time (and
# made any other cell that assembles df_oot2 fail afterwards), because the
# assembler's output column "features" already existed.  The membership test
# also copes with a df_oot2 that an earlier run left already assembled.
if "features" in df_oot2.columns:
    df_oot2_assembled = df_oot2
else:
    df_oot2_assembled = assembler.transform(df_oot2)

# Score df_oot2 and compute its metrics
predictions_oot2 = model.transform(df_oot2_assembled)
oot2_roc_auc, oot2_accuracy, oot2_precision, oot2_recall, oot2_f1Score, oot2_confusion_matrix = calculate_metrics(predictions_oot2)

# Print df_oot2 metrics
print(f"df_oot2 ROC-AUC: {oot2_roc_auc:.3f}")
print(f"df_oot2 Accuracy: {oot2_accuracy:.3f}")
print(f"df_oot2 Precision: {oot2_precision:.3f}")
print(f"df_oot2 Recall: {oot2_recall:.3f}")
print(f"df_oot2 F1: {oot2_f1Score:.3f}")
print(f"df_oot2 Confusion matrix:\n{oot2_confusion_matrix}")

# Feature importances of the fitted tree, one value per assembled input
importances = model.featureImportances

# Pair each feature name with its importance (same order as feature_cols)
importance_dict = dict(zip(feature_cols, importances))

# (name, importance) pairs, most important first
sorted_importance_dict = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

# Keep the ten most important features
top_10_features = sorted_importance_dict[:10]

# Print the top 10 most important features and their importance values
print("Top 10 most important features:")
for feature, importance in top_10_features:
    print(f"{feature}: {importance:.3f}")






In [None]:
from imblearn.over_sampling import ADASYN
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics

# Rename the target column to 'label' and cast it to DoubleType
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))

# Fixed seed shared by the split, the oversampler and the classifier
seed = 12345

# 70/30 train/test split
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a feature
feature_cols = [c for c in df8.columns if c != 'label']

# Assemble the feature columns into the single vector column the classifier reads
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train = assembler.transform(train)
test = assembler.transform(test)

# Only the raw feature columns and the label are sent to pandas;
# the vector column is rebuilt after resampling, so collecting it would be wasted work.
train_pd = train.select(*feature_cols, 'label').toPandas()

# Oversample the minority class of the TRAINING split only
adasyn = ADASYN(random_state=seed)
X_train_resampled, y_train_resampled = adasyn.fit_resample(train_pd[feature_cols], train_pd['label'])

# Convert the resampled training data back to a Spark DataFrame and assemble it
train_resampled_pd = pd.concat([pd.DataFrame(X_train_resampled, columns=feature_cols), pd.Series(y_train_resampled, name='label')], axis=1)
train_resampled = spark.createDataFrame(train_resampled_pd)
train_resampled = assembler.transform(train_resampled)

# The hold-out split is deliberately NOT resampled: metrics computed on synthetic
# test rows do not reflect the real class balance and are optimistically biased.
# The name `test_resampled` is kept because the evaluation code further down this
# cell reads it.
test_resampled = test

# Train the Decision Tree model without cross-validation
dt = DecisionTreeClassifier(seed=seed)

# Fit the model to the resampled training data
model = dt.fit(train_resampled)

def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of model predictions.

    Args:
        predictions: DataFrame holding the "label", "prediction" and
            "rawPrediction" columns read below.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``.
        Precision, recall and F1 refer to class index 1 of the confusion
        matrix and are 0.0 (not NaN) when their denominator is zero.
    """
    # ROC-AUC from the raw scores, accuracy from the hard predictions
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix as a NumPy array
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as 0.0 instead of NaN
        return numerator / denominator if denominator else 0.0

    # The indexing treats rows as actual classes and columns as predicted classes.
    # NOTE(review): if only one class occurs the matrix is presumably smaller than
    # 2x2 and the indexing below raises IndexError - confirm against the data.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

def _print_metrics(split_name, metrics):
    """Print the six values returned by calculate_metrics, prefixed with the split name."""
    roc_auc, accuracy, precision, recall, f1_score, confusion_matrix = metrics
    print(f"{split_name} ROC-AUC: {roc_auc:.3f}")
    print(f"{split_name} Accuracy: {accuracy:.3f}")
    print(f"{split_name} Precision: {precision:.3f}")
    print(f"{split_name} Recall: {recall:.3f}")
    print(f"{split_name} F1: {f1_score:.3f}")
    print(f"{split_name} Confusion matrix:\n{confusion_matrix}")


# Score the training data the model was fitted on
predictions_train = model.transform(train_resampled)
train_metrics = calculate_metrics(predictions_train)
train_roc_auc, train_accuracy, train_precision, train_recall, train_f1Score, train_confusion_matrix = train_metrics
_print_metrics("Training", train_metrics)

# Score the test split prepared earlier in this cell (`test_resampled`)
predictions_test = model.transform(test_resampled)
test_metrics = calculate_metrics(predictions_test)
test_roc_auc, test_accuracy, test_precision, test_recall, test_f1Score, test_confusion_matrix = test_metrics
_print_metrics("Test", test_metrics)

# Assemble the feature vector for the out-of-time sample.
# df_oot2 itself is left untouched: several cells in this notebook assemble it,
# and VectorAssembler refuses to write an output column that already exists.
# Dropping a stale "features" column first (a no-op when absent) makes the cell re-runnable.
oot2_assembled = assembler.transform(df_oot2.drop("features"))

# Score the out-of-time sample and compute its metrics
predictions_oot2 = model.transform(oot2_assembled)
oot2_metrics = calculate_metrics(predictions_oot2)
oot2_roc_auc, oot2_accuracy, oot2_precision, oot2_recall, oot2_f1Score, oot2_confusion_matrix = oot2_metrics
_print_metrics("df_oot2", oot2_metrics)

# Pair every feature name with its importance in the trained model
importances = model.featureImportances
importance_dict = dict(zip(feature_cols, importances))

# Highest importance first
sorted_importance_dict = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

# Keep and print the ten most important features
top_10_features = sorted_importance_dict[:10]
print("Top 10 most important features:")
for feature, importance in top_10_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
from imblearn.under_sampling import TomekLinks
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics

# Rename the target column to 'label' and cast it to DoubleType
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))

# Fixed seed shared by the split and the classifier
# (TomekLinks is constructed without a seed below)
seed = 12345

# 70/30 train/test split
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a feature
feature_cols = [c for c in df8.columns if c != 'label']

# Assemble the feature columns into the single vector column the classifier reads
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train = assembler.transform(train)
test = assembler.transform(test)

# Only the raw feature columns and the label are sent to pandas;
# the vector column is rebuilt after resampling, so collecting it would be wasted work.
train_pd = train.select(*feature_cols, 'label').toPandas()

# Apply Tomek Links undersampling to the TRAINING split only
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(train_pd[feature_cols], train_pd['label'])

# Convert the resampled training data back to a Spark DataFrame and assemble it
train_resampled_pd = pd.concat([pd.DataFrame(X_train_resampled, columns=feature_cols), pd.Series(y_train_resampled, name='label')], axis=1)
train_resampled = spark.createDataFrame(train_resampled_pd)
train_resampled = assembler.transform(train_resampled)

# The hold-out split is deliberately NOT resampled: removing rows from the test
# set changes the population the metrics are measured on.
# The name `test_resampled` is kept because the evaluation code further down this
# cell reads it.
test_resampled = test

# Train the Decision Tree model without cross-validation
dt = DecisionTreeClassifier(seed=seed)

# Fit the model to the resampled training data
model = dt.fit(train_resampled)

def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of model predictions.

    Args:
        predictions: DataFrame holding the "label", "prediction" and
            "rawPrediction" columns read below.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``.
        Precision, recall and F1 refer to class index 1 of the confusion
        matrix and are 0.0 (not NaN) when their denominator is zero.
    """
    # ROC-AUC from the raw scores, accuracy from the hard predictions
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix as a NumPy array
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as 0.0 instead of NaN
        return numerator / denominator if denominator else 0.0

    # The indexing treats rows as actual classes and columns as predicted classes.
    # NOTE(review): if only one class occurs the matrix is presumably smaller than
    # 2x2 and the indexing below raises IndexError - confirm against the data.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

def _print_metrics(split_name, metrics):
    """Print the six values returned by calculate_metrics, prefixed with the split name."""
    roc_auc, accuracy, precision, recall, f1_score, confusion_matrix = metrics
    print(f"{split_name} ROC-AUC: {roc_auc:.3f}")
    print(f"{split_name} Accuracy: {accuracy:.3f}")
    print(f"{split_name} Precision: {precision:.3f}")
    print(f"{split_name} Recall: {recall:.3f}")
    print(f"{split_name} F1: {f1_score:.3f}")
    print(f"{split_name} Confusion matrix:\n{confusion_matrix}")


# Score the training data the model was fitted on
predictions_train = model.transform(train_resampled)
train_metrics = calculate_metrics(predictions_train)
train_roc_auc, train_accuracy, train_precision, train_recall, train_f1Score, train_confusion_matrix = train_metrics
_print_metrics("Training", train_metrics)

# Score the test split prepared earlier in this cell (`test_resampled`)
predictions_test = model.transform(test_resampled)
test_metrics = calculate_metrics(predictions_test)
test_roc_auc, test_accuracy, test_precision, test_recall, test_f1Score, test_confusion_matrix = test_metrics
_print_metrics("Test", test_metrics)

# Assemble the feature vector for the out-of-time sample.
# df_oot2 itself is left untouched: several cells in this notebook assemble it,
# and VectorAssembler refuses to write an output column that already exists.
# Dropping a stale "features" column first (a no-op when absent) makes the cell re-runnable.
oot2_assembled = assembler.transform(df_oot2.drop("features"))

# Score the out-of-time sample and compute its metrics
predictions_oot2 = model.transform(oot2_assembled)
oot2_metrics = calculate_metrics(predictions_oot2)
oot2_roc_auc, oot2_accuracy, oot2_precision, oot2_recall, oot2_f1Score, oot2_confusion_matrix = oot2_metrics
_print_metrics("df_oot2", oot2_metrics)

# Pair every feature name with its importance in the trained model
importances = model.featureImportances
importance_dict = dict(zip(feature_cols, importances))

# Highest importance first
sorted_importance_dict = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

# Keep and print the ten most important features
top_10_features = sorted_importance_dict[:10]
print("Top 10 most important features:")
for feature, importance in top_10_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
#smote sin cross
from imblearn.over_sampling import SMOTE
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics

# Rename the target column to 'label' and cast it to DoubleType
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))

# Fixed seed shared by the split, the oversampler and the classifier
seed = 12345

# 70/30 train/test split
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a feature
feature_cols = [c for c in df8.columns if c != 'label']

# Assemble the feature columns into the single vector column the classifier reads
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train = assembler.transform(train)
test = assembler.transform(test)

# Only the raw feature columns and the label are sent to pandas;
# the vector column is rebuilt after resampling, so collecting it would be wasted work.
train_pd = train.select(*feature_cols, 'label').toPandas()

# Oversample the minority class of the TRAINING split only
smote = SMOTE(random_state=seed)
X_train_resampled, y_train_resampled = smote.fit_resample(train_pd[feature_cols], train_pd['label'])

# Convert the resampled training data back to a Spark DataFrame and assemble it
train_resampled_pd = pd.concat([pd.DataFrame(X_train_resampled, columns=feature_cols), pd.Series(y_train_resampled, name='label')], axis=1)
train_resampled = spark.createDataFrame(train_resampled_pd)
train_resampled = assembler.transform(train_resampled)

# The hold-out split is deliberately NOT resampled: metrics computed on synthetic
# test rows do not reflect the real class balance and are optimistically biased.
# The name `test_resampled` is kept because the evaluation code further down this
# cell reads it.
test_resampled = test

# Train the GBT model without cross-validation
gbt = GBTClassifier(seed=seed)

# Fit the model to the resampled training data
model = gbt.fit(train_resampled)

def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of model predictions.

    Args:
        predictions: DataFrame holding the "label", "prediction" and
            "rawPrediction" columns read below.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``.
        Precision, recall and F1 refer to class index 1 of the confusion
        matrix and are 0.0 (not NaN) when their denominator is zero.
    """
    # ROC-AUC from the raw scores, accuracy from the hard predictions
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix as a NumPy array
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as 0.0 instead of NaN
        return numerator / denominator if denominator else 0.0

    # The indexing treats rows as actual classes and columns as predicted classes.
    # NOTE(review): if only one class occurs the matrix is presumably smaller than
    # 2x2 and the indexing below raises IndexError - confirm against the data.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

def _print_metrics(split_name, metrics):
    """Print the six values returned by calculate_metrics, prefixed with the split name."""
    roc_auc, accuracy, precision, recall, f1_score, confusion_matrix = metrics
    print(f"{split_name} ROC-AUC: {roc_auc:.3f}")
    print(f"{split_name} Accuracy: {accuracy:.3f}")
    print(f"{split_name} Precision: {precision:.3f}")
    print(f"{split_name} Recall: {recall:.3f}")
    print(f"{split_name} F1: {f1_score:.3f}")
    print(f"{split_name} Confusion matrix:\n{confusion_matrix}")


# Score the training data the model was fitted on
predictions_train = model.transform(train_resampled)
train_metrics = calculate_metrics(predictions_train)
train_roc_auc, train_accuracy, train_precision, train_recall, train_f1Score, train_confusion_matrix = train_metrics
_print_metrics("Training", train_metrics)

# Score the test split prepared earlier in this cell (`test_resampled`)
predictions_test = model.transform(test_resampled)
test_metrics = calculate_metrics(predictions_test)
test_roc_auc, test_accuracy, test_precision, test_recall, test_f1Score, test_confusion_matrix = test_metrics
_print_metrics("Test", test_metrics)

# Assemble the feature vector for the out-of-time sample.
# df_oot2 itself is left untouched: several cells in this notebook assemble it,
# and VectorAssembler refuses to write an output column that already exists.
# Dropping a stale "features" column first (a no-op when absent) makes the cell re-runnable.
oot2_assembled = assembler.transform(df_oot2.drop("features"))

# Score the out-of-time sample and compute its metrics
predictions_oot2 = model.transform(oot2_assembled)
oot2_metrics = calculate_metrics(predictions_oot2)
oot2_roc_auc, oot2_accuracy, oot2_precision, oot2_recall, oot2_f1Score, oot2_confusion_matrix = oot2_metrics
_print_metrics("df_oot2", oot2_metrics)

# Pair every feature name with its importance in the trained model
importances = model.featureImportances
importance_dict = dict(zip(feature_cols, importances))

# Highest importance first
sorted_importance_dict = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

# Keep and print the ten most important features
top_10_features = sorted_importance_dict[:10]
print("Top 10 most important features:")
for feature, importance in top_10_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
#smote con cross
from imblearn.over_sampling import SMOTE
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

# Rename the target column to 'label' and cast it to DoubleType
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))

# Fixed seed shared by the split, the oversampler, the classifier and the cross-validator
seed = 12345

# 70/30 train/test split
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is a feature
feature_cols = [c for c in df8.columns if c != 'label']

# Assemble the feature columns into the single vector column the classifier reads
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train = assembler.transform(train)
test = assembler.transform(test)

# Only the raw feature columns and the label are sent to pandas;
# the vector column is rebuilt after resampling, so collecting it would be wasted work.
train_pd = train.select(*feature_cols, 'label').toPandas()

# Oversample the minority class of the TRAINING split only
smote = SMOTE(random_state=seed)
X_train_resampled, y_train_resampled = smote.fit_resample(train_pd[feature_cols], train_pd['label'])

# Convert the resampled training data back to a Spark DataFrame and assemble it
train_resampled_pd = pd.concat([pd.DataFrame(X_train_resampled, columns=feature_cols), pd.Series(y_train_resampled, name='label')], axis=1)
train_resampled = spark.createDataFrame(train_resampled_pd)
train_resampled = assembler.transform(train_resampled)

# The hold-out split is deliberately NOT resampled: metrics computed on synthetic
# test rows do not reflect the real class balance and are optimistically biased.
# The name `test_resampled` is kept because the evaluation code further down this
# cell reads it.
test_resampled = test

# GBT classifier tuned with cross-validation
gbt = GBTClassifier(seed=seed)

# Hyperparameter grid: 3 depths x 2 iteration counts
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [2, 5, 10]) \
    .addGrid(gbt.maxIter, [10, 20]) \
    .build()

# Candidate models are ranked by area under the ROC curve
evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")

# 5-fold cross-validation over the grid.
# NOTE(review): the folds are cut from data that was already oversampled, so the
# validation folds contain synthetic rows and the CV score is likely optimistic.
cv = CrossValidator(estimator=gbt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator_roc_auc,
                    numFolds=5,
                    seed=seed)

# Fit the model to the resampled training data using cross-validation
model = cv.fit(train_resampled)

def calculate_metrics(predictions):
    """Compute binary-classification metrics for a DataFrame of model predictions.

    Args:
        predictions: DataFrame holding the "label", "prediction" and
            "rawPrediction" columns read below.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``.
        Precision, recall and F1 refer to class index 1 of the confusion
        matrix and are 0.0 (not NaN) when their denominator is zero.
    """
    # ROC-AUC from the raw scores, accuracy from the hard predictions
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix as a NumPy array
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    def _ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as 0.0 instead of NaN
        return numerator / denominator if denominator else 0.0

    # The indexing treats rows as actual classes and columns as predicted classes.
    # NOTE(review): if only one class occurs the matrix is presumably smaller than
    # 2x2 and the indexing below raises IndexError - confirm against the data.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _ratio(TP, TP + FP)
    recall_manual = _ratio(TP, TP + FN)
    f1_manual = _ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

def _print_metrics(split_name, metrics):
    """Print the six values returned by calculate_metrics, prefixed with the split name."""
    roc_auc, accuracy, precision, recall, f1_score, confusion_matrix = metrics
    print(f"{split_name} ROC-AUC: {roc_auc:.3f}")
    print(f"{split_name} Accuracy: {accuracy:.3f}")
    print(f"{split_name} Precision: {precision:.3f}")
    print(f"{split_name} Recall: {recall:.3f}")
    print(f"{split_name} F1: {f1_score:.3f}")
    print(f"{split_name} Confusion matrix:\n{confusion_matrix}")


# Score the training data the model was fitted on
predictions_train = model.transform(train_resampled)
train_metrics = calculate_metrics(predictions_train)
train_roc_auc, train_accuracy, train_precision, train_recall, train_f1Score, train_confusion_matrix = train_metrics
_print_metrics("Training", train_metrics)

# Score the test split prepared earlier in this cell (`test_resampled`)
predictions_test = model.transform(test_resampled)
test_metrics = calculate_metrics(predictions_test)
test_roc_auc, test_accuracy, test_precision, test_recall, test_f1Score, test_confusion_matrix = test_metrics
_print_metrics("Test", test_metrics)

# Assemble the feature vector for the out-of-time sample.
# df_oot2 itself is left untouched: several cells in this notebook assemble it,
# and VectorAssembler refuses to write an output column that already exists.
# Dropping a stale "features" column first (a no-op when absent) makes the cell re-runnable.
oot2_assembled = assembler.transform(df_oot2.drop("features"))

# Score the out-of-time sample and compute its metrics
predictions_oot2 = model.transform(oot2_assembled)
oot2_metrics = calculate_metrics(predictions_oot2)
oot2_roc_auc, oot2_accuracy, oot2_precision, oot2_recall, oot2_f1Score, oot2_confusion_matrix = oot2_metrics
_print_metrics("df_oot2", oot2_metrics)

# Importances come from the best model picked by the cross-validator
importances = model.bestModel.featureImportances
importance_dict = dict(zip(feature_cols, importances))

# Highest importance first
sorted_importance_dict = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

# Keep and print the ten most important features
top_10_features = sorted_importance_dict[:10]
print("Top 10 most important features:")
for feature, importance in top_10_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
import numpy as np
from scipy.stats import spearmanr

# Convert the Spark DataFrame to a pandas DataFrame
df6 = df5.toPandas()

# Spearman correlation matrix between all columns
corr_matrix = spearmanr(df6).correlation
if np.ndim(corr_matrix) == 0:
    # With exactly two columns spearmanr returns a single coefficient; expand it to 2x2
    corr_matrix = np.array([[1.0, corr_matrix], [corr_matrix, 1.0]])

# Boolean mask of the strict upper triangle (plain `bool`: the `np.bool` alias
# was removed from NumPy)
upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Absolute correlations in the upper triangle, zero elsewhere. The matrix stays
# 2-D so that column j still holds the correlations of column j with every
# earlier column i < j.
corr_matrix_upper = np.where(upper, np.abs(corr_matrix), 0.0)

# Correlation with the target must never cause a predictor to be dropped:
# blank the target's row so it cannot flag any later column.
corr_matrix_upper[np.asarray(df6.columns == 'Malo_Dias_tot'), :] = 0.0

# A column is redundant when |rho| >= 0.4 with at least one earlier column;
# the target itself is never dropped.
redundant = (corr_matrix_upper >= 0.4).any(axis=0)
to_drop = [column for column, flagged in zip(df6.columns, redundant) if flagged and column != 'Malo_Dias_tot']

# Remove those columns from df4 and store the result in df_2
df_2 = df4.drop(*to_drop)


In [None]:
import seaborn as sns

# Convert the Spark DataFrame to a pandas DataFrame
df6 = df5.toPandas()

# Spearman correlation matrix between all columns
corr_matrix = spearmanr(df6).correlation

# Boolean mask of the cells whose absolute correlation is at least 0.4
mask = np.abs(corr_matrix) >= 0.4

# Heatmap of the correlation matrix. Annotations are rounded to two decimals;
# an empty format string prints every coefficient at full precision.
# NOTE(review): seaborn blanks the cells where `mask` is True, so the strong
# correlations (|rho| >= 0.4, diagonal included) are the ones hidden here -
# confirm that this is the intended view and not the opposite.
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, mask=mask, annot=True, fmt='.2f', xticklabels=df6.columns, yticklabels=df6.columns)

# Show the figure
plt.show()

In [None]:
import numpy as np
from scipy.stats import spearmanr
from sklearn.feature_selection import mutual_info_regression

# Convert the Spark DataFrame to a pandas DataFrame
df6 = df5.toPandas()

# Spearman correlation matrix between all columns
corr_matrix = spearmanr(df6).correlation
if np.ndim(corr_matrix) == 0:
    # With exactly two columns spearmanr returns a single coefficient; expand it to 2x2
    corr_matrix = np.array([[1.0, corr_matrix], [corr_matrix, 1.0]])

# Boolean mask of the strict upper triangle (plain `bool`: the `np.bool` alias
# was removed from NumPy)
upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Absolute correlations in the upper triangle, zero elsewhere. The matrix stays
# 2-D so that column j still holds the correlations of column j with every
# earlier column i < j.
corr_matrix_upper = np.where(upper, np.abs(corr_matrix), 0.0)

# Correlation with the target must never cause a predictor to be dropped:
# blank the target's row so it cannot flag any later column.
corr_matrix_upper[np.asarray(df6.columns == 'Malo_Dias_tot'), :] = 0.0

# A column is redundant when |rho| >= 0.4 with at least one earlier column;
# the target itself is never dropped.
redundant = (corr_matrix_upper >= 0.4).any(axis=0)
to_drop = [column for column, flagged in zip(df6.columns, redundant) if flagged and column != 'Malo_Dias_tot']

# Remove those columns from df4 and store the result in df_2 (still a Spark DataFrame)
df_2 = df4.drop(*to_drop)

# scikit-learn needs in-memory data, so the mutual information is computed on a
# pandas copy; df_2 itself stays a Spark DataFrame for the modelling cells.
df_2_pd = df_2.toPandas()
n_columns = df_2_pd.shape[1]

# Pairwise mutual information, mirrored into a symmetric matrix
mi_matrix = np.zeros((n_columns, n_columns))
for i in range(n_columns):
    for j in range(i + 1, n_columns):
        mi_matrix[i, j] = mutual_info_regression(df_2_pd.iloc[:, i].values.reshape(-1, 1), df_2_pd.iloc[:, j])[0]
        mi_matrix[j, i] = mi_matrix[i, j]

# One score per column: its total mutual information with all other columns.
# Ranking the flattened upper triangle would yield pair positions, not column
# positions, and cannot be used to index the column list.
mi_total = mi_matrix.sum(axis=1)

# Columns ordered by that score, highest first.
# NOTE(review): the target column, if still present in df_2, is ranked like any
# other column - confirm whether it should be excluded here.
features = df_2_pd.columns[np.argsort(mi_total)[::-1]]

# Keep the ten highest-ranked columns
selected_features = features[:10]



In [None]:
Si deseas mostrar las métricas de f1, recall, accuracy, auc, las matrices de confusión y las 10 variables más importantes para 3 técnicas de machine learning distintas en un informe en RMarkdown y Shiny, puedes seguir los mismos pasos que te mencioné anteriormente, pero con algunas modificaciones en el código.

Aquí tienes un ejemplo de código RMarkdown que puedes utilizar como punto de partida:

```rmarkdown
---
title: "Informe de métricas y variables importantes"
output: html_document
runtime: shiny
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(magrittr)  # proporciona el operador %>% que usan los bloques siguientes
```

## Técnica 1

### Métricas

```{r}
# Definir las métricas para la técnica 1
f1_t1 <- 0.8
recall_t1 <- 0.9
accuracy_t1 <- 0.95
auc_t1 <- 0.98

# Mostrar las métricas en una tabla
data.frame(F1 = f1_t1, Recall = recall_t1, Accuracy = accuracy_t1, AUC = auc_t1) %>%
  knitr::kable()
```

### Matriz de confusión

```{r}
# Definir la matriz de confusión para la técnica 1
confusion_matrix_t1 <- matrix(c(100, 10, 5, 85), nrow = 2)

# Mostrar la matriz de confusión en una tabla
confusion_matrix_t1 %>%
  knitr::kable()
```

### Variables importantes

```{r}
# Definir las 10 variables más importantes para la técnica 1
important_variables_t1 <- c("var1", "var2", "var3", "var4", "var5", "var6", "var7", "var8", "var9", "var10")

# Mostrar las 10 variables más importantes en una tabla
important_variables_t1 %>%
  knitr::kable()
```

## Técnica 2

### Métricas

```{r}
# Definir las métricas para la técnica 2
f1_t2 <- 0.85
recall_t2 <- 0.92
accuracy_t2 <- 0.96
auc_t2 <- 0.99

# Mostrar las métricas en una tabla
data.frame(F1 = f1_t2, Recall = recall_t2, Accuracy = accuracy_t2, AUC = auc_t2) %>%
  knitr::kable()
```

### Matriz de confusión

```{r}
# Definir la matriz de confusión para la técnica 2
confusion_matrix_t2 <- matrix(c(105, 5, 7, 83), nrow = 2)

# Mostrar la matriz de confusión en una tabla
confusion_matrix_t2 %>%
  knitr::kable()
```

### Variables importantes

```{r}
# Definir las 10 variables más importantes para la técnica 2
important_variables_t2 <- c("var11", "var12", "var13", "var14", "var15", "var16", "var17", "var18", "var19", "var20")

# Mostrar las 10 variables más importantes en una tabla
important_variables_t2 %>%
  knitr::kable()
```

## Técnica 3

### Métricas

```{r}
# Definir las métricas para la técnica 3
f1_t3 <- 0.82
recall_t3 <- 0.93
accuracy_t3 <- 0.97
auc_t3 <- 0.95

# Mostrar las métricas en una tabla
data.frame(F1 = f1_t3, Recall = recall_t3, Accuracy = accuracy_t3, AUC = auc_t3) %>%
  knitr::kable()
```

### Matriz de confusión

```{r}
# Definir la matriz de confusión para la técnica 3
confusion_matrix_t3 <- matrix(c(102,8,6,84), nrow =2)

# Mostrar la matriz de confusión en una tabla
confusion_matrix_t3 %>%
knitr::kable()
```

### Variables importantes

```{r}
# Definir las 10 variables más importantes para la técnica 3
important_variables_t3 <- c("variable21", "variable22", "variable23", "variable24", "variable25", "variable26", "variable27", "variable28", "variable29", "variable30")

# Mostrar las 10 variables más importantes en una tabla
important_variables_t3 %>%
  knitr::kable()
```
```

Este código RMarkdown crea un informe que muestra las métricas, la matriz de confusión y las 10 variables más importantes para 3 técnicas de machine learning distintas en tablas.

Espero que esto te ayude. ¿Hay algo más en lo que pueda ayudarte? 😊

In [None]:

import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import spearmanr

import numpy as np

# Load the dataset into pandas
df6 = df5.toPandas()

# Split predictors and target
X = df6.drop('target', axis=1)
y = df6['target']

# Mutual information between every predictor and the target, scaled so the
# best predictor scores 1.0 (left unscaled when every score is zero, which
# would otherwise divide by zero)
mi = mutual_info_regression(X, y)
max_mi = np.max(mi)
if max_mi > 0:
    mi /= max_mi

# Keep the predictors whose scaled mutual information exceeds the threshold
threshold = 0.5
keep = mi > threshold
selected_columns = X.columns[keep]
# Scores aligned position-by-position with selected_columns; the correlation
# matrix below is indexed over the selected subset, not over all of X.
selected_mi = mi[keep]

# Among the selected predictors, break up pairs with |Spearman rho| >= 0.4
corr_threshold = 0.4
to_remove = set()
if len(selected_columns) >= 2:
    corr, _ = spearmanr(X[selected_columns])
    if np.ndim(corr) == 0:
        # With exactly two columns spearmanr returns a single coefficient; expand it to 2x2
        corr = np.array([[1.0, corr], [corr, 1.0]])
    for i in range(corr.shape[0]):
        if i in to_remove:
            continue
        for j in range(i + 1, corr.shape[1]):
            # A variable that is already eliminated must not eliminate others
            if j in to_remove:
                continue
            if abs(corr[i, j]) >= corr_threshold:
                # Of the two correlated variables, drop the one with less mutual information
                if selected_mi[i] > selected_mi[j]:
                    to_remove.add(j)
                else:
                    to_remove.add(i)
                    break
selected_columns = np.delete(selected_columns, sorted(to_remove))

# Show the selected variables
print('Variables seleccionadas:', selected_columns)


In [None]:
from imblearn.under_sampling import NearMiss
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# --- Data preparation -------------------------------------------------------
# Expose the target column under the name 'label', stored as a double.
df8 = (
    df_2
    .withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn("label", col("label").cast(DoubleType()))
)

# A fixed seed keeps the 70/30 train/test partition reproducible.
seed = 12345
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Every column except the label is used as a predictor.
features = [name for name in df8.columns if name != 'label']

# --- NearMiss undersampling (performed in pandas) ----------------------------
# The training split is pulled into pandas, resampled there, and the
# resampled predictors and label are then sent back to Spark.
train_pd = train.toPandas()
X = train_pd[features]
y = train_pd['label']

nm = NearMiss()
X_resampled, y_resampled = nm.fit_resample(X, y)

train_undersampled_pd = pd.concat([X_resampled, y_resampled], axis=1)
train_undersampled = spark.createDataFrame(train_undersampled_pd)

# --- Model pipeline: assemble -> standardise -> linear SVM --------------------
assembler = VectorAssembler(inputCols=features, outputCol='features')
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=True,
)
svm = LinearSVC(
    featuresCol='scaledFeatures',
    labelCol='label',
    maxIter=10,
    regParam=0.1,
)
pipeline = Pipeline(stages=[assembler, scaler, svm])

# Train on the undersampled training split only.
model = pipeline.fit(train_undersampled)


def calculate_metrics(predictions):
    """Compute binary-classification metrics for a scored DataFrame.

    Args:
        predictions: DataFrame containing the "label", "prediction" and
            "rawPrediction" columns read below.

    Returns:
        Tuple ``(roc_auc, accuracy, precision, recall, f1, confusion_matrix)``.
        Precision, recall and F1 refer to the class at index 1 of the
        confusion matrix. Each of them is 0.0 when its denominator is zero
        (no predicted positives, no actual positives, or both precision and
        recall equal to zero) instead of NaN.

    NOTE(review): the confusion matrix is indexed as 2x2; presumably both
    classes always occur in ``predictions`` - a single-class input would make
    the indexing below fail. Confirm against the data.
    """
    def _safe_ratio(numerator, denominator):
        # Avoid NaN (and numpy's RuntimeWarning) on an empty denominator.
        return numerator / denominator if denominator > 0 else 0.0

    # ROC-AUC from the raw scores, accuracy from the hard predictions
    evaluator_roc_auc = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    roc_auc = evaluator_roc_auc.evaluate(predictions)
    evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_accuracy.evaluate(predictions)

    # Confusion matrix from (prediction, label) pairs
    predictionAndLabels = predictions.select("prediction", "label").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Manually calculate precision, recall, and F1 score.
    # Spark's confusion matrix has actual classes in rows and predicted
    # classes in columns, ordered by ascending label.
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]
    precision_manual = _safe_ratio(TP, TP + FP)
    recall_manual = _safe_ratio(TP, TP + FN)
    f1_manual = _safe_ratio(2 * (precision_manual * recall_manual), precision_manual + recall_manual)

    return roc_auc, accuracy, precision_manual, recall_manual, f1_manual, confusion_matrix

# ---- Training split (the undersampled data the model was fitted on) ----
predictions_train = model.transform(train_undersampled)
(
    train_roc_auc,
    train_accuracy,
    train_precision,
    train_recall,
    train_f1Score,
    train_confusion_matrix,
) = calculate_metrics(predictions_train)

print(
    f"Training ROC-AUC: {train_roc_auc:.3f}",
    f"Training Accuracy: {train_accuracy:.3f}",
    f"Training Precision: {train_precision:.3f}",
    f"Training Recall: {train_recall:.3f}",
    f"Training F1: {train_f1Score:.3f}",
    f"Training Confusion matrix:\n{train_confusion_matrix}",
    sep="\n",
)

# ---- Held-out test split ----
predictions_test = model.transform(test)
(
    test_roc_auc,
    test_accuracy,
    test_precision,
    test_recall,
    test_f1Score,
    test_confusion_matrix,
) = calculate_metrics(predictions_test)

print(
    f"Test ROC-AUC: {test_roc_auc:.3f}",
    f"Test Accuracy: {test_accuracy:.3f}",
    f"Test Precision: {test_precision:.3f}",
    f"Test Recall: {test_recall:.3f}",
    f"Test F1: {test_f1Score:.3f}",
    f"Test Confusion matrix:\n{test_confusion_matrix}",
    sep="\n",
)

# ---- df_oot dataset ----
predictions_oot = model.transform(df_oot)
(
    oot_roc_auc,
    oot_accuracy,
    oot_precision,
    oot_recall,
    oot_f1Score,
    oot_confusion_matrix,
) = calculate_metrics(predictions_oot)

print(
    f"df_oot ROC-AUC: {oot_roc_auc:.3f}",
    f"df_oot Accuracy: {oot_accuracy:.3f}",
    f"df_oot Precision: {oot_precision:.3f}",
    f"df_oot Recall: {oot_recall:.3f}",
    f"df_oot F1: {oot_f1Score:.3f}",
    f"df_oot Confusion matrix:\n{oot_confusion_matrix}",
    sep="\n",
)


In [None]:
def get_top_features(model, feature_names, top_n=10):
    """Rank features by the magnitude of the final stage's coefficients.

    Args:
        model: Fitted pipeline whose last stage exposes ``coefficients``
            with a ``toArray()`` method (e.g. a LinearSVC model).
        feature_names: One name per coefficient; assumed to be in the same
            order as the coefficients - verify against the caller.
        top_n: Number of rows to return. Defaults to 10.

    Returns:
        A pandas DataFrame with columns ``feature``, ``importance`` (the
        signed coefficient) and ``abs_importance``, sorted by
        ``abs_importance`` in descending order and limited to ``top_n`` rows.
    """
    # Coefficients of the last pipeline stage as a flat array
    coefficients = model.stages[-1].coefficients.toArray()

    # One row per feature; pandas raises if the two lengths differ
    feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': coefficients})

    # The sign only gives the direction of the effect, so rank by magnitude
    feature_importances_df['abs_importance'] = feature_importances_df['importance'].abs()
    ranked_df = feature_importances_df.sort_values(by='abs_importance', ascending=False)

    return ranked_df.head(top_n)
