<a href="https://colab.research.google.com/github/CristValen/ML-Python/blob/main/python_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#svm

from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed passed to the train/test split, the SVC and the permutation-importance fallback
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM, choosing kernel and C by cross-validated grid search
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC(probability=True, random_state=random_state)
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_scaled, y_train)

# Predict labels for the test set
y_pred = clf.predict(X_test_scaled)

# Report the confusion matrix and the per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important variables of the selected model.
# coef_ only exists for a linear kernel, and the grid may select 'rbf', so fall back to
# permutation importance on the test split in that case instead of raising AttributeError.
best_model = clf.best_estimator_
if best_model.kernel == 'linear':
    # Rank by magnitude: a large negative coefficient matters as much as a large positive one.
    feature_importances = pd.DataFrame(best_model.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
else:
    permutation_result = permutation_importance(best_model, X_test_scaled, y_test, random_state=random_state)
    feature_importances = pd.DataFrame(permutation_result.importances_mean, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))




In [None]:
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed passed to the train/test split, SMOTE and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training split only; the test split is left untouched
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Train a linear-kernel SVM on the resampled training data
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = clf.predict(X_test_scaled)

# Report the confusion matrix and the per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important variables of the model.
# Rank by coefficient magnitude (the sign is kept in the output): a plain descending sort
# would hide features with large negative weights.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
print(feature_importances.head(10))

#


In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

#undersampling
# Seed passed to the train/test split, the under-sampler and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply RandomUnderSampler to the training split only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Fit the scaler on the (resampled) training data, then transform train and test with it
scaler = StandardScaler()
scaler.fit(X_train_resampled)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the SVM with cross-validated grid search.
# Only C is tuned: gamma has no effect on a linear kernel, so searching over it
# would just repeat identical fits.
param_grid = {'C': [0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(kernel='linear', probability=True, random_state=random_state), param_grid, refit=True)
grid.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = grid.predict(X_test)

# Report the confusion matrix and the per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, grid.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important variables, ranked by coefficient magnitude (sign kept in the output)
feature_importances = pd.DataFrame(grid.best_estimator_.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
print(feature_importances.head(10))




In [None]:
from imblearn.over_sampling import ADASYN
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed passed to the train/test split, ADASYN and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize first (scaler fitted on the training split only) so that ADASYN's
# nearest-neighbour search is not dominated by the features with the largest raw ranges.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply ADASYN to the scaled training split only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_scaled, y_train)

# Train the SVM with cross-validated grid search.
# Only C is tuned: gamma has no effect on a linear kernel, so searching over it
# would just repeat identical fits.
param_grid = {'C': [0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(kernel='linear', probability=True, random_state=random_state), param_grid, refit=True)
grid.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = grid.predict(X_test)

# Report the confusion matrix and the per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, grid.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important variables, ranked by coefficient magnitude (sign kept in the output)
feature_importances = pd.DataFrame(grid.best_estimator_.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
print(feature_importances.head(10))




In [None]:
from imblearn.under_sampling import TomekLinks
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed passed to the train/test split, the SVC and the permutation-importance fallback
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize first (scaler fitted on the training split only) so that the nearest-neighbour
# distances used to find Tomek links are not dominated by the largest-range features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply TomekLinks to the scaled training split only
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(X_train_scaled, y_train)

# Parameter grid for the search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# 5-fold cross-validated grid search over C and kernel
grid_search = GridSearchCV(svm.SVC(probability=True, random_state=random_state), param_grid, cv=5)

# Train the SVM with grid search
grid_search.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = grid_search.predict(X_test)

# Report the confusion matrix and the per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, grid_search.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important variables of the selected model.
# coef_ only exists for a linear kernel, and the grid may select 'rbf', so fall back to
# permutation importance on the test split in that case instead of raising AttributeError.
best_model = grid_search.best_estimator_
if best_model.kernel == 'linear':
    # Rank by magnitude: a large negative coefficient matters as much as a large positive one.
    feature_importances = pd.DataFrame(best_model.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
else:
    permutation_result = permutation_importance(best_model, X_test, y_test, random_state=random_state)
    feature_importances = pd.DataFrame(permutation_result.importances_mean, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))



In [None]:
# Without cross-validation
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed passed to the train/test split, the SVC and the permutation-importance estimate
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the variables; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM model without cross-validation (SVC's default kernel is used)
svc = svm.SVC(probability=True, random_state=random_state)
svc.fit(X_train_scaled, y_train)

# Predict values for the test set
y_pred = svc.predict(X_test_scaled)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, svc.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# coef_ is only defined for a linear kernel; this SVC uses the default (non-linear) kernel,
# so reading coef_ would raise AttributeError. Use permutation importance on the test
# split instead, and keep the coefficient path in case the kernel is switched to 'linear'.
if svc.kernel == 'linear':
    feature_importances = pd.DataFrame(svc.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
else:
    permutation_result = permutation_importance(svc, X_test_scaled, y_test, random_state=random_state)
    feature_importances = pd.DataFrame(permutation_result.importances_mean, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))




In [None]:
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed passed to the train/test split, SMOTE and the SVC
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the variables; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training set only; the test split is left untouched
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Train the linear-kernel SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = clf.predict(X_test_scaled)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# Rank by coefficient magnitude (the sign is kept in the output): a plain descending sort
# would hide features with large negative weights.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
print(feature_importances.head(10))


In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# Seed passed to the train/test split, the under-sampler and the SVC
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply RandomUnderSampler to the training set only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler to the (resampled) training data
scaler.fit(X_train_resampled)

# Transform the training and test data with the training-set statistics
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the linear-kernel SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = clf.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# Rank by coefficient magnitude (the sign is kept in the output): a plain descending sort
# would hide features with large negative weights.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
print(feature_importances.head(10))


In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np

# Seed passed to the train/test split, ADASYN and the SVC
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize first (scaler fitted on the training split only) so that ADASYN's
# nearest-neighbour search is not dominated by the features with the largest raw ranges.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply ADASYN to the scaled training set only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_scaled, y_train)

# Train the linear-kernel SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = clf.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# Rank by coefficient magnitude (the sign is kept in the output): a plain descending sort
# would hide features with large negative weights.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
print(feature_importances.head(10))


In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks
import pandas as pd
import numpy as np

# Seed passed to the train/test split and the SVC
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize first (scaler fitted on the training split only) so that the nearest-neighbour
# distances used to find Tomek links are not dominated by the largest-range features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply TomekLinks to the scaled training set only
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(X_train_scaled, y_train)

# Train the linear-kernel SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = clf.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# Rank by coefficient magnitude (the sign is kept in the output): a plain descending sort
# would hide features with large negative weights.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False, key=np.abs)
print(feature_importances.head(10))


In [None]:
#random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed handed to both the train/test split and the forest
random_state = 0

# Pull the PySpark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target column vs. predictors, then an 80/20 train/test split.
# NOTE(review): the SVM cells earlier in this notebook use 'Malos_Dias_tot' as the target
# name; confirm which spelling df_2 actually carries.
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Learn the scaling statistics from the training split, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a Random Forest with default hyper-parameters (no cross-validation)
clf = RandomForestClassifier(random_state=random_state).fit(X_train, y_train)

# Hard predictions on the held-out split
y_pred = clf.predict(X_test)

# Confusion matrix followed by the per-class report
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

# KS statistic = maximum distance between the TPR and FPR curves;
# column 1 of predict_proba serves as the score
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the largest forest importances
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X.columns,
).sort_values('importance', ascending=False)
print(feature_importances.head(10))



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Seed passed to the train/test split, SMOTE and the forest
random_state = 0

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
# NOTE(review): the SVM cells earlier in this notebook use 'Malos_Dias_tot' as the target
# name; confirm which spelling df_2 actually carries.
X = pandas_df.drop('Malo_Dias_tot', axis=1)
y = pandas_df['Malo_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize first (scaler fitted on the training split only), as the SVM + SMOTE cells do,
# so SMOTE's nearest-neighbour interpolation is not dominated by the largest-range features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply SMOTE to the scaled training set only
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Train the Random Forest model without cross-validation
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = clf.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model
feature_importances = pd.DataFrame(clf.feature_importances_, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# Seed handed to the train/test split, the under-sampler and the forest
random_state = 0

# Pull the PySpark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target column vs. predictors, then an 80/20 train/test split.
# NOTE(review): the SVM cells earlier in this notebook use 'Malos_Dias_tot' as the target
# name; confirm which spelling df_2 actually carries.
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Randomly under-sample the training split; the test split is not resampled
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Learn the scaling statistics from the resampled training data, then apply to both splits
scaler = StandardScaler().fit(X_train_resampled)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit a Random Forest with default hyper-parameters (no cross-validation)
clf = RandomForestClassifier(random_state=random_state).fit(X_train_resampled, y_train_resampled)

# Hard predictions on the held-out split
y_pred = clf.predict(X_test)

# Confusion matrix followed by the per-class report
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

# KS statistic = maximum distance between the TPR and FPR curves;
# column 1 of predict_proba serves as the score
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the largest forest importances
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X.columns,
).sort_values('importance', ascending=False)
print(feature_importances.head(10))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np

# Seed passed to the train/test split, ADASYN and the forest
random_state = 0

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
# NOTE(review): the SVM cells earlier in this notebook use 'Malos_Dias_tot' as the target
# name; confirm which spelling df_2 actually carries.
X = pandas_df.drop('Malo_Dias_tot', axis=1)
y = pandas_df['Malo_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize first (scaler fitted on the training split only) so that ADASYN's
# nearest-neighbour search is not dominated by the features with the largest raw ranges.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply ADASYN to the scaled training set only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_scaled, y_train)

# Train the Random Forest model without cross-validation
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = clf.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between the TPR and FPR curves.
# Column 1 of predict_proba is used as the positive-class score (assumes a binary target).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model
feature_importances = pd.DataFrame(clf.feature_importances_, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))


In [None]:
### decision tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed handed to both the train/test split and the tree
random_state = 0

# Pull the PySpark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target column vs. predictors, then an 80/20 train/test split.
# NOTE(review): the SVM cells earlier in this notebook use 'Malos_Dias_tot' as the target
# name; confirm which spelling df_2 actually carries.
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Learn the scaling statistics from the training split, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a Decision Tree with default hyper-parameters (no cross-validation)
clf = DecisionTreeClassifier(random_state=random_state).fit(X_train, y_train)

# Hard predictions on the held-out split
y_pred = clf.predict(X_test)

# Confusion matrix followed by the per-class report
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

# KS statistic = maximum distance between the TPR and FPR curves;
# column 1 of predict_proba serves as the score
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the largest tree importances
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X.columns,
).sort_values('importance', ascending=False)
print(feature_importances.head(10))


In [None]:
###pyspark svm
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Seed for the random train/test split
random_state = 0

# Label column, reused by the estimator, the evaluator and the metric code below.
# NOTE(review): the sklearn SVM cells earlier in this notebook use 'Malos_Dias_tot';
# confirm which spelling df_2 actually carries.
label_col = 'Malo_Dias_tot'

# Split the Spark DataFrame into train and test sets. Training and scoring run on Spark,
# so df_2 is not collected to pandas here.
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the label is used as a feature
feature_cols = [col for col in train.columns if col != label_col]

# Combine the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the assembled features (zero mean, unit standard deviation)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM without cross-validation. It is deliberately not named `svm`, so it does not
# shadow the sklearn `svm` module that other cells of this notebook rely on.
linear_svc = LinearSVC(featuresCol='scaledFeatures', labelCol=label_col, maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM, then fit on the training data
pipeline = Pipeline(stages=[assembler, scaler, linear_svc])
model = pipeline.fit(train)

# Predict values for the test set
predictions = model.transform(test)

# Confusion-matrix counts from a single aggregation instead of four separate count() jobs.
# Keys are (label, prediction); 1 and 1.0 hash and compare equal, so integer keys match
# Spark's double-valued predictions.
cell_counts = {
    (row[label_col], row['prediction']): row['count']
    for row in predictions.groupBy(label_col, 'prediction').count().collect()
}
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Guard every ratio: a model that never predicts the positive class (or an empty test
# split) would otherwise raise ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve
evaluator = BinaryClassificationEvaluator(labelCol=label_col, rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# Area under the precision-recall curve (this is not the KS statistic)
evaluator.setMetricName('areaUnderPR')
auc_pr = evaluator.evaluate(predictions)
print(f'AUC-PR: {auc_pr}')

# KS statistic: largest gap between the TPR and FPR curves, computed from the same ROC
# points that are plotted below. Element 1 of rawPrediction is used as the positive-class score.
pdf = predictions.select(label_col, 'rawPrediction').toPandas()
fpr, tpr, thresholds = roc_curve(pdf[label_col], pdf['rawPrediction'].apply(lambda x: x[1]))
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 most important variables, ranked by coefficient magnitude (sign kept in the output)
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False, key=abs)
print(importance_df.head(10))

# Display the ROC curve against the chance diagonal
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()





In [None]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Seed for the random train/test split and the undersampling draw
random_state = 0

# Label column, reused by the estimator, the evaluator and the metric code below.
# NOTE(review): the sklearn SVM cells earlier in this notebook use 'Malos_Dias_tot';
# confirm which spelling df_2 actually carries.
label_col = 'Malo_Dias_tot'

# Split the Spark DataFrame into train and test sets. Training and scoring run on Spark,
# so df_2 is not collected to pandas here.
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the label is used as a feature
feature_cols = [col for col in train.columns if col != label_col]

# Random undersampling of the training data: keep all non-majority rows and sample the
# majority class (without replacement) down to roughly the same number of rows.
# The majority label and its row count both come from one grouped count.
majority_row = train.groupBy(label_col).count().orderBy('count', ascending=False).first()
majority_class = majority_row[0]
majority_class_count = majority_row['count']
minority_rows = train.filter(train[label_col] != majority_class)
minority_class_count = minority_rows.count()
undersampled_train = train.filter(train[label_col] == majority_class).sample(False, float(minority_class_count) / majority_class_count, seed=random_state)
undersampled_train = undersampled_train.union(minority_rows)

# Combine the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the assembled features (zero mean, unit standard deviation)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM without cross-validation. It is deliberately not named `svm`, so it does not
# shadow the sklearn `svm` module that other cells of this notebook rely on.
linear_svc = LinearSVC(featuresCol='scaledFeatures', labelCol=label_col, maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM, then fit on the undersampled training data
pipeline = Pipeline(stages=[assembler, scaler, linear_svc])
model = pipeline.fit(undersampled_train)

# Predict values for the (untouched) test set
predictions = model.transform(test)

# Confusion-matrix counts from a single aggregation instead of four separate count() jobs.
# Keys are (label, prediction); 1 and 1.0 hash and compare equal, so integer keys match
# Spark's double-valued predictions.
cell_counts = {
    (row[label_col], row['prediction']): row['count']
    for row in predictions.groupBy(label_col, 'prediction').count().collect()
}
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Guard every ratio: a model that never predicts the positive class (or an empty test
# split) would otherwise raise ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve
evaluator = BinaryClassificationEvaluator(labelCol=label_col, rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# Area under the precision-recall curve (this is not the KS statistic)
evaluator.setMetricName('areaUnderPR')
auc_pr = evaluator.evaluate(predictions)
print(f'AUC-PR: {auc_pr}')

# KS statistic: largest gap between the TPR and FPR curves, computed from the same ROC
# points that are plotted below. Element 1 of rawPrediction is used as the positive-class score.
pdf = predictions.select(label_col, 'rawPrediction').toPandas()
fpr, tpr, thresholds = roc_curve(pdf[label_col], pdf['rawPrediction'].apply(lambda x: x[1]))
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 most important variables, ranked by coefficient magnitude (sign kept in the output)
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False, key=abs)
print(importance_df.head(10))

# Display the ROC curve against the chance diagonal
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()



In [None]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from imblearn.over_sampling import ADASYN

# Seed shared by the train/test split and ADASYN so the run is reproducible
random_state = 0

# NOTE: the previous version also collected all of df_2 to pandas at this point;
# the result was never used in this cell, so that driver-side copy was dropped.

# Split the dataset into train and test sets
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the target is used as a feature
feature_cols = [c for c in train.columns if c != 'Malo_Dias_tot']

# ADASYN oversampling is applied to the training split only; it runs on pandas
# data, so the split is collected and the balanced result sent back to Spark.
adasyn = ADASYN(random_state=random_state)
train_pd = train.toPandas()
X_resampled, y_resampled = adasyn.fit_resample(train_pd[feature_cols], train_pd['Malo_Dias_tot'])
oversampled_train = spark.createDataFrame(pd.concat([X_resampled, y_resampled], axis=1))

# Combine the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the assembled features (centered and scaled to unit standard deviation)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM with fixed hyper-parameters (no cross-validation in this cell)
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='Malo_Dias_tot', maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM and fit on the oversampled training data
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(oversampled_train)

# Score the untouched test split
predictions = model.transform(test)

# Confusion-matrix cells (class 1 is treated as the positive class)
tp = predictions.filter((predictions.Malo_Dias_tot == 1) & (predictions.prediction == 1)).count()
tn = predictions.filter((predictions.Malo_Dias_tot == 0) & (predictions.prediction == 0)).count()
fp = predictions.filter((predictions.Malo_Dias_tot == 0) & (predictions.prediction == 1)).count()
fn = predictions.filter((predictions.Malo_Dias_tot == 1) & (predictions.prediction == 0)).count()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is empty (e.g. the model never
# predicts class 1) instead of raising ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# Area under the precision-recall curve. This value used to be printed as "KS",
# which it is not; it is now reported under its own name.
evaluator.setMetricName('areaUnderPR')
auc_pr = evaluator.evaluate(predictions)
print(f'AUC-PR: {auc_pr}')

# ROC points from the second component of rawPrediction, collected to pandas
pdf = predictions.select('Malo_Dias_tot', 'rawPrediction').toPandas()
fpr, tpr, thresholds = roc_curve(pdf['Malo_Dias_tot'], pdf['rawPrediction'].apply(lambda x: x[1]))

# KS statistic: the largest gap between the true-positive and false-positive rates
ks = float((tpr - fpr).max())
print(f'KS: {ks}')

# Top 10 variables ranked by coefficient magnitude, so strongly negative weights
# are not pushed out of the list; the signed value is still what gets printed.
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance'])
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)
print(importance_df.head(10))

# Display the ROC curve against the chance diagonal
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

from imblearn.over_sampling import ADASYN
from pyspark.ml.linalg import Vectors

# Rename the target to 'label' and cast it to double for the Spark ML stages below
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# ADASYN runs on pandas data, so only the training split is collected
train_pd = train.toPandas()

# Feature columns: everything except the label (computed once, reused by the assembler)
features = [c for c in df8.columns if c != 'label']

# Oversample the training split with ADASYN
adasyn = ADASYN(random_state=12345)
X_resampled, y_resampled = adasyn.fit_resample(train_pd[features], train_pd['label'])

# Send the resampled training data back to Spark
train_resampled = spark.createDataFrame(
    pd.concat([pd.DataFrame(X_resampled, columns=features), pd.Series(y_resampled, name='label')], axis=1)
)

assembler = VectorAssembler(inputCols=features, outputCol="features")

# Random Forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=12345)

# Pipeline: assemble features, then classify
pipeline = Pipeline(stages=[assembler, rf])

# Hyper-parameter grid explored by cross-validation
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 20]).build()

# Cross-validation selects the model by ROC-AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, seed=0)

# Fit on the resampled training data and score the untouched test split
model = cv.fit(train_resampled)
predictions = model.transform(test)

# ROC-AUC on the test split
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# Confusion matrix and per-class metrics from (prediction, label) pairs
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

# Recall and F1 are reported for the positive class (label 1.0). The previous
# 'weightedRecall' metric is algebraically identical to accuracy, so the two
# printed lines could never differ; 'f1' was likewise the support-weighted average.
recall = metrics.recall(1.0)
f1 = metrics.fMeasure(1.0)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

print(f"Matriz de confusión:\n{confusion_matrix}")

# Feature importances of the best Random Forest, highest first
importances = model.bestModel.stages[-1].featureImportances
important_features = sorted(zip(importances.toArray(), features), reverse=True)
print("Características más importantes:")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")

# Calcular el estadístico KS
from pyspark.sql.functions import udf

def calc_ks(data):
    """Return a bucketed Kolmogorov-Smirnov statistic for binary predictions.

    Rows are ranked by ``probability`` and placed in bucket
    ``int(percentile_rank * 10)`` (ids 0..10). The statistic is the largest
    absolute gap, over buckets in ascending order, between the cumulative
    share of label-1 rows ("bads") and label-0 rows ("goods").

    Args:
        data: Spark DataFrame with a ``label`` column (0/1) and a scalar
            ``probability`` column.

    Returns:
        The maximum absolute cumulative gap as a float. If one of the two
        classes is absent the shares are undefined and the result is NaN.
    """
    # Collect only the two columns that are needed. Aggregating the full
    # predictions frame also tried to min/sum its vector columns, which
    # pandas >= 2.0 rejects with a TypeError and which wastes driver memory.
    data_pd = data.select('label', 'probability').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    data_pd['bucket'] = (data_pd['probability'].rank(pct=True) * 10).astype(int)

    # One aggregation pass; groupby returns the buckets in ascending order
    per_bucket = data_pd.groupby('bucket')[['bad', 'good']].sum()
    cum_bad_share = (per_bucket['bad'] / per_bucket['bad'].sum()).cumsum()
    cum_good_share = (per_bucket['good'] / per_bucket['good'].sum()).cumsum()
    ks_value = (cum_bad_share - cum_good_share).abs().max()
    return ks_value

# Replace the probability vector with its element at index 1 (the class-1
# probability, assuming a 0/1 label), then compute and report the KS statistic.
def _second_component(vector):
    return float(vector[1])

prob_udf = udf(_second_component, DoubleType())
predictions = predictions.withColumn('probability', prob_udf('probability'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")




In [None]:
from imblearn.over_sampling import ADASYN
from pyspark.ml.linalg import Vectors
import matplotlib.pyplot as plt
import numpy as np

# Rename the target to 'label' and cast it to double for the Spark ML stages below
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# ADASYN runs on pandas data, so only the training split is collected
train_pd = train.toPandas()

# Feature columns: everything except the label (computed once, reused by the assembler)
features = [c for c in df8.columns if c != 'label']

# Oversample the training split with ADASYN
adasyn = ADASYN(random_state=12345)
X_resampled, y_resampled = adasyn.fit_resample(train_pd[features], train_pd['label'])

# Send the resampled training data back to Spark
train_resampled = spark.createDataFrame(
    pd.concat([pd.DataFrame(X_resampled, columns=features), pd.Series(y_resampled, name='label')], axis=1)
)

assembler = VectorAssembler(inputCols=features, outputCol="features")

# Random Forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=12345)

# Pipeline: assemble features, then classify
pipeline = Pipeline(stages=[assembler, rf])

# Hyper-parameter grid explored by cross-validation
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 20]).build()

# Cross-validation selects the model by ROC-AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, seed=0)

# Fit on the resampled training data and score the untouched test split
model = cv.fit(train_resampled)
predictions = model.transform(test)

# ROC-AUC on the test split
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# Confusion matrix and per-class metrics from (prediction, label) pairs
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

# Recall and F1 are reported for the positive class (label 1.0). The previous
# 'weightedRecall' metric is algebraically identical to accuracy, so the two
# printed lines could never differ; 'f1' was likewise the support-weighted average.
recall = metrics.recall(1.0)
f1 = metrics.fMeasure(1.0)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

print(f"Matriz de confusión:\n{confusion_matrix}")

# Feature importances of the best Random Forest, highest first
importances = model.bestModel.stages[-1].featureImportances
important_features = sorted(zip(importances.toArray(), features), reverse=True)
print("Características más importantes:")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")

# Calcular el estadístico KS
from pyspark.sql.functions import udf

def calc_ks(data):
    """Return a bucketed Kolmogorov-Smirnov statistic for binary predictions.

    Rows are ranked by ``probability`` and placed in bucket
    ``int(percentile_rank * 10)`` (ids 0..10). The statistic is the largest
    absolute gap, over buckets in ascending order, between the cumulative
    share of label-1 rows ("bads") and label-0 rows ("goods").

    Args:
        data: Spark DataFrame with a ``label`` column (0/1) and a scalar
            ``probability`` column.

    Returns:
        The maximum absolute cumulative gap as a float. If one of the two
        classes is absent the shares are undefined and the result is NaN.
    """
    # Collect only the two columns that are needed. Aggregating the full
    # predictions frame also tried to min/sum its vector columns, which
    # pandas >= 2.0 rejects with a TypeError and which wastes driver memory.
    data_pd = data.select('label', 'probability').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    data_pd['bucket'] = (data_pd['probability'].rank(pct=True) * 10).astype(int)

    # One aggregation pass; groupby returns the buckets in ascending order
    per_bucket = data_pd.groupby('bucket')[['bad', 'good']].sum()
    cum_bad_share = (per_bucket['bad'] / per_bucket['bad'].sum()).cumsum()
    cum_good_share = (per_bucket['good'] / per_bucket['good'].sum()).cumsum()
    ks_value = (cum_bad_share - cum_good_share).abs().max()
    return ks_value

# Replace the probability vector with its element at index 1 (the class-1
# probability, assuming a 0/1 label), then compute and report the KS statistic.
def _second_component(vector):
    return float(vector[1])

prob_udf = udf(_second_component, DoubleType())
predictions = predictions.withColumn('probability', prob_udf('probability'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Rows per class before and after resampling, ordered by class value
original_counts = train_pd['label'].value_counts().sort_index()
resampled_counts = pd.Series(y_resampled).value_counts().sort_index()

# Grouped bar chart: one pair of bars per class, original on the left
x = np.arange(len(original_counts))
width = 0.35
half_width = width / 2
fig, ax = plt.subplots()
rects1 = ax.bar(x - half_width, original_counts, width, label='Original data')
rects2 = ax.bar(x + half_width, resampled_counts, width, label='ADASYN resampled data')
ax.set_xticks(x)
ax.set_xticklabels(original_counts.index)
ax.legend()
plt.show()


In [None]:
from pyspark.ml.classification import LinearSVC

# Rename the target to 'label' and cast it to double for the Spark ML stages below
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# Feature columns: everything except the label
features = [c for c in df8.columns if c != 'label']
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Linear Support Vector Machine
svm = LinearSVC(labelCol="label", featuresCol="features")

# Pipeline: assemble features, then classify
pipeline = Pipeline(stages=[assembler, svm])

# Regularization strengths explored by cross-validation
paramGrid = ParamGridBuilder().addGrid(svm.regParam, [0.1, 0.01]).build()

# Cross-validation selects the model by ROC-AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, seed=0)

# Fit on the training split and score the test split
model = cv.fit(train)
predictions = model.transform(test)

# ROC-AUC on the test split
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# Confusion matrix and per-class metrics from (prediction, label) pairs
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

# Recall and F1 are reported for the positive class (label 1.0). The previous
# 'weightedRecall' metric is algebraically identical to accuracy, so the two
# printed lines could never differ; 'f1' was likewise the support-weighted average.
recall = metrics.recall(1.0)
f1 = metrics.fMeasure(1.0)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

print(f"Matriz de confusión:\n{confusion_matrix}")

# Calcular el estadístico KS
from pyspark.sql.functions import udf

def calc_ks(data):
    """Return a bucketed Kolmogorov-Smirnov statistic for binary predictions.

    Rows are ranked by ``probability`` and placed in bucket
    ``int(percentile_rank * 10)`` (ids 0..10). The statistic is the largest
    absolute gap, over buckets in ascending order, between the cumulative
    share of label-1 rows ("bads") and label-0 rows ("goods").

    Args:
        data: Spark DataFrame with a ``label`` column (0/1) and a scalar
            ``probability`` column (any score that ranks the rows).

    Returns:
        The maximum absolute cumulative gap as a float. If one of the two
        classes is absent the shares are undefined and the result is NaN.
    """
    # Collect only the two columns that are needed. Aggregating the full
    # predictions frame also tried to min/sum its vector columns, which
    # pandas >= 2.0 rejects with a TypeError and which wastes driver memory.
    data_pd = data.select('label', 'probability').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    data_pd['bucket'] = (data_pd['probability'].rank(pct=True) * 10).astype(int)

    # One aggregation pass; groupby returns the buckets in ascending order
    per_bucket = data_pd.groupby('bucket')[['bad', 'good']].sum()
    cum_bad_share = (per_bucket['bad'] / per_bucket['bad'].sum()).cumsum()
    cum_good_share = (per_bucket['good'] / per_bucket['good'].sum()).cumsum()
    ks_value = (cum_bad_share - cum_good_share).abs().max()
    return ks_value

# LinearSVC output has 'rawPrediction' and 'prediction' but no 'probability'
# column, so reading 'probability' here failed at analysis time. The score is
# taken from rawPrediction[1] instead and stored under the 'probability' name
# that calc_ks reads; KS only depends on how the score ranks the rows.
prob_udf = udf(lambda v: float(v[1]), DoubleType())
predictions = predictions.withColumn('probability', prob_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")


In [None]:
###smote svm

from imblearn.over_sampling import SMOTE
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

import pandas as pd

# Rename the target to 'label' and cast it to double for the Spark ML stages below
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))

# Split BEFORE resampling. SMOTE used to run on the full dataset, so synthetic
# points interpolated from training rows ended up in the test split and the
# reported test metrics were optimistic.
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# Feature columns: everything except the label
features = [c for c in df8.columns if c != 'label']

# SMOTE runs on pandas data, so only the training split is collected
df8_pd = train.toPandas()
X = df8_pd[features]
y = df8_pd['label']

# Oversample the training split with SMOTE
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Send the resampled training data back to Spark. pd.concat handles both array
# and DataFrame/Series results; the earlier np.hstack/.reshape path failed when
# fit_resample returned a pandas Series, and relied on numpy being imported elsewhere.
columns = features + ['label']
train_resampled = spark.createDataFrame(
    pd.concat([pd.DataFrame(X_resampled, columns=features), pd.Series(y_resampled, name='label')], axis=1)[columns]
)

assembler = VectorAssembler(inputCols=features, outputCol="features")

# Linear Support Vector Machine
svm = LinearSVC(labelCol="label", featuresCol="features")

# Pipeline: assemble features, then classify
pipeline = Pipeline(stages=[assembler, svm])

# Regularization strengths explored by cross-validation
paramGrid = ParamGridBuilder().addGrid(svm.regParam, [0.1, 0.01]).build()

# Cross-validation selects the model by ROC-AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, seed=0)

# Fit on the resampled training data and score the untouched test split
model = cv.fit(train_resampled)
predictions = model.transform(test)

# ROC-AUC on the test split
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# Confusion matrix and per-class metrics from (prediction, label) pairs
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

# Recall and F1 are reported for the positive class (label 1.0). The previous
# 'weightedRecall' metric is algebraically identical to accuracy, so the two
# printed lines could never differ; 'f1' was likewise the support-weighted average.
recall = metrics.recall(1.0)
f1 = metrics.fMeasure(1.0)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

print(f"Matriz de confusión:\n{confusion_matrix}")

def calc_ks(data):
    """Return a bucketed Kolmogorov-Smirnov statistic for binary predictions.

    Rows are ranked by ``score`` and placed in bucket
    ``int(percentile_rank * 10)`` (ids 0..10). The statistic is the largest
    absolute gap, over buckets in ascending order, between the cumulative
    share of label-1 rows ("bads") and label-0 rows ("goods").

    Args:
        data: Spark DataFrame with a ``label`` column (0/1) and a scalar
            ``score`` column.

    Returns:
        The maximum absolute cumulative gap as a float. If one of the two
        classes is absent the shares are undefined and the result is NaN.
    """
    # Collect only the two columns that are needed. Aggregating the full
    # predictions frame also tried to min/sum its vector columns, which
    # pandas >= 2.0 rejects with a TypeError and which wastes driver memory.
    data_pd = data.select('label', 'score').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)

    # One aggregation pass; groupby returns the buckets in ascending order
    per_bucket = data_pd.groupby('bucket')[['bad', 'good']].sum()
    cum_bad_share = (per_bucket['bad'] / per_bucket['bad'].sum()).cumsum()
    cum_good_share = (per_bucket['good'] / per_bucket['good'].sum()).cumsum()
    ks_value = (cum_bad_share - cum_good_share).abs().max()
    return ks_value

# Add a scalar 'score' column holding element 0 of rawPrediction, then compute
# and report the KS statistic on it.
def _first_component(vector):
    return float(vector[0])

score_udf = udf(_first_component, DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Coefficients of the best SVM found by cross-validation
coefficients = model.bestModel.stages[-1].coefficients.toArray()

# Rank (coefficient, feature) pairs by coefficient magnitude and keep the top 10
ranked_pairs = sorted(zip(coefficients, features), key=lambda pair: abs(pair[0]), reverse=True)
important_features = ranked_pairs[:10]

# Print the 10 largest-magnitude coefficients with their sign
print("Las 10 características más importantes (en orden de importancia):")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")



In [None]:
## svm sin remuestreo

from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Rename the target to 'label' and cast it to double for the Spark ML stages below
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# Feature columns: everything except the label
features = [c for c in df8.columns if c != 'label']
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Linear Support Vector Machine
svm = LinearSVC(labelCol="label", featuresCol="features")

# Pipeline: assemble features, then classify
pipeline = Pipeline(stages=[assembler, svm])

# Regularization strengths explored by cross-validation
paramGrid = ParamGridBuilder().addGrid(svm.regParam, [0.1, 0.01]).build()

# Cross-validation selects the model by ROC-AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, seed=0)

# Fit on the training split and score the test split
model = cv.fit(train)
predictions = model.transform(test)

# ROC-AUC on the test split
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# Confusion matrix and per-class metrics from (prediction, label) pairs
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

# Recall and F1 are reported for the positive class (label 1.0). The previous
# 'weightedRecall' metric is algebraically identical to accuracy, so the two
# printed lines could never differ; 'f1' was likewise the support-weighted average.
recall = metrics.recall(1.0)
f1 = metrics.fMeasure(1.0)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

print(f"Matriz de confusión:\n{confusion_matrix}")

def calc_ks(data):
    """Return a bucketed Kolmogorov-Smirnov statistic for binary predictions.

    Rows are ranked by ``score`` and placed in bucket
    ``int(percentile_rank * 10)`` (ids 0..10). The statistic is the largest
    absolute gap, over buckets in ascending order, between the cumulative
    share of label-1 rows ("bads") and label-0 rows ("goods").

    Args:
        data: Spark DataFrame with a ``label`` column (0/1) and a scalar
            ``score`` column.

    Returns:
        The maximum absolute cumulative gap as a float. If one of the two
        classes is absent the shares are undefined and the result is NaN.
    """
    # Collect only the two columns that are needed. Aggregating the full
    # predictions frame also tried to min/sum its vector columns, which
    # pandas >= 2.0 rejects with a TypeError and which wastes driver memory.
    data_pd = data.select('label', 'score').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)

    # One aggregation pass; groupby returns the buckets in ascending order
    per_bucket = data_pd.groupby('bucket')[['bad', 'good']].sum()
    cum_bad_share = (per_bucket['bad'] / per_bucket['bad'].sum()).cumsum()
    cum_good_share = (per_bucket['good'] / per_bucket['good'].sum()).cumsum()
    ks_value = (cum_bad_share - cum_good_share).abs().max()
    return ks_value

# Add a scalar 'score' column holding element 0 of rawPrediction, then compute
# and report the KS statistic on it.
def _first_component(vector):
    return float(vector[0])

score_udf = udf(_first_component, DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Coefficients of the best SVM found by cross-validation
coefficients = model.bestModel.stages[-1].coefficients.toArray()

# Rank (coefficient, feature) pairs by coefficient magnitude and keep the top 10
ranked_pairs = sorted(zip(coefficients, features), key=lambda pair: abs(pair[0]), reverse=True)
important_features = ranked_pairs[:10]

# Print the 10 largest-magnitude coefficients with their sign
print("Las 10 características más importantes (en orden de importancia):")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
from imblearn.under_sampling import RandomUnderSampler
from pyspark.ml.classification import LinearSVC

# Rename the target to 'label' and cast it to double for the Spark ML stages below
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# The undersampler runs on pandas data, so only the training split is collected
train_pd = train.toPandas()

# Feature columns: everything except the label (computed once, reused by the assembler)
features = [c for c in df8.columns if c != 'label']

# Random undersampling of the training split; the fixed seed makes it repeatable
rus = RandomUnderSampler(random_state=12345)
X_resampled, y_resampled = rus.fit_resample(train_pd[features], train_pd['label'])

# Send the resampled training data back to Spark
train_resampled = spark.createDataFrame(
    pd.concat([pd.DataFrame(X_resampled, columns=features), pd.Series(y_resampled, name='label')], axis=1)
)

assembler = VectorAssembler(inputCols=features, outputCol="features")

# Linear Support Vector Machine
svm = LinearSVC(labelCol="label", featuresCol="features")

# Pipeline: assemble features, then classify
pipeline = Pipeline(stages=[assembler, svm])

# Regularization strengths explored by cross-validation
paramGrid = ParamGridBuilder().addGrid(svm.regParam, [0.1, 0.01]).build()

# Cross-validation selects the model by ROC-AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, seed=0)

# Fit on the resampled training data and score the untouched test split
model = cv.fit(train_resampled)
predictions = model.transform(test)

# ROC-AUC on the test split
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
roc_auc = evaluator.evaluate(predictions)

# Confusion matrix and per-class metrics from (prediction, label) pairs
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

# Recall and F1 are reported for the positive class (label 1.0). The previous
# 'weightedRecall' metric is algebraically identical to accuracy, so the two
# printed lines could never differ; 'f1' was likewise the support-weighted average.
recall = metrics.recall(1.0)
f1 = metrics.fMeasure(1.0)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

print(f"Matriz de confusión:\n{confusion_matrix}")

# Calcular el estadístico KS
from pyspark.sql.functions import udf

def calc_ks(data):
    """Return a bucketed Kolmogorov-Smirnov statistic for binary predictions.

    Rows are ranked by ``probability`` and placed in bucket
    ``int(percentile_rank * 10)`` (ids 0..10). The statistic is the largest
    absolute gap, over buckets in ascending order, between the cumulative
    share of label-1 rows ("bads") and label-0 rows ("goods").

    Args:
        data: Spark DataFrame with a ``label`` column (0/1) and a scalar
            ``probability`` column (any score that ranks the rows).

    Returns:
        The maximum absolute cumulative gap as a float. If one of the two
        classes is absent the shares are undefined and the result is NaN.
    """
    # Collect only the two columns that are needed. Aggregating the full
    # predictions frame also tried to min/sum its vector columns, which
    # pandas >= 2.0 rejects with a TypeError and which wastes driver memory.
    data_pd = data.select('label', 'probability').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    data_pd['bucket'] = (data_pd['probability'].rank(pct=True) * 10).astype(int)

    # One aggregation pass; groupby returns the buckets in ascending order
    per_bucket = data_pd.groupby('bucket')[['bad', 'good']].sum()
    cum_bad_share = (per_bucket['bad'] / per_bucket['bad'].sum()).cumsum()
    cum_good_share = (per_bucket['good'] / per_bucket['good'].sum()).cumsum()
    ks_value = (cum_bad_share - cum_good_share).abs().max()
    return ks_value

# LinearSVC output has 'rawPrediction' and 'prediction' but no 'probability'
# column, so reading 'probability' here failed at analysis time. The score is
# taken from rawPrediction[1] instead and stored under the 'probability' name
# that calc_ks reads; KS only depends on how the score ranks the rows.
prob_udf = udf(lambda v: float(v[1]), DoubleType())
predictions = predictions.withColumn('probability', prob_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Coefficients of the best SVM found by cross-validation
coefficients = model.bestModel.stages[-1].coefficients.toArray()

# Keep the 10 features with the largest coefficient magnitude
important_features = sorted(zip(coefficients, features), key=lambda x: abs(x[0]), reverse=True)[:10]

# Print the 10 largest-magnitude coefficients with their sign
print("Las 10 características más importantes (en orden de importancia):")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
from imblearn.over_sampling import ADASYN
from pyspark.ml.classification import LinearSVC

# Rename the target to 'label' and cast it to double in a single chain.
df8 = (
    df_2.withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn("label", col("label").cast(DoubleType()))
)
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# Bring the training split to pandas so imbalanced-learn can resample it.
train_pd = train.toPandas()

# Every column except the target is a feature.
features = list(train_pd.columns)
features.remove('label')

# Resample the training split only, using ADASYN.
adasyn = ADASYN(random_state=12345)
X_resampled, y_resampled = adasyn.fit_resample(train_pd[features], train_pd['label'])

# Rebuild a single pandas frame (features + label) and hand it back to Spark.
resampled_features_pd = pd.DataFrame(X_resampled, columns=features)
resampled_label_pd = pd.Series(y_resampled, name='label')
train_resampled = spark.createDataFrame(
    pd.concat([resampled_features_pd, resampled_label_pd], axis=1)
)

# Feature columns: every column of df8 except the target.
features = df8.columns
features.remove('label')
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Linear SVM estimator.
# NOTE: this rebinds the name `svm`, which the first cell of the notebook
# imports as the scikit-learn module.
svm = LinearSVC(labelCol="label", featuresCol="features")

# Pipeline: assemble the feature vector, then classify.
pipeline = Pipeline(stages=[assembler, svm])

# Cross-validation grid over the regularization strength.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(svm.regParam, [0.1, 0.01])
paramGrid = grid_builder.build()

# Candidate models are compared by area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

# Cross-validator tying together the pipeline, the grid and the evaluator.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    seed=0,
)

# Fit on the resampled training data.
model = cv.fit(train_resampled)

# Score the held-out test split with the cross-validated model.
predictions = model.transform(test)

# Area under the ROC curve from the raw scores.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
roc_auc = evaluator.evaluate(predictions)

# Weighted recall, F1 and accuracy all read the same label/prediction columns.
multiclass_cols = {"labelCol": "label", "predictionCol": "prediction"}
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall", **multiclass_cols)
recall = evaluator.evaluate(predictions)
evaluator = MulticlassClassificationEvaluator(metricName="f1", **multiclass_cols)
f1 = evaluator.evaluate(predictions)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", **multiclass_cols)
accuracy = evaluator.evaluate(predictions)

# Report the four summary metrics, one per line, three decimals each.
print(
    f"ROC-AUC: {roc_auc:.3f}",
    f"Recall: {recall:.3f}",
    f"F1: {f1:.3f}",
    f"Accuracy: {accuracy:.3f}",
    sep="\n",
)

# Confusion matrix from (prediction, label) pairs via the RDD-based metrics API.
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()
print(f"Matriz de confusión:\n{confusion_matrix}")

# Calcular el estadístico KS
from pyspark.sql.functions import udf

def calc_ks(data):
    """Return the Kolmogorov-Smirnov (KS) statistic of a scored DataFrame.

    Rows are bucketed by the percentile rank of ``probability``; the result
    is the largest absolute gap between the cumulative share of bads
    (``label == 1``) and the cumulative share of goods (``label == 0``)
    across the buckets.

    Args:
        data: DataFrame exposing ``select(...).toPandas()`` with a numeric
            ``probability`` column and a ``label`` column.

    Returns:
        The KS statistic (maximum absolute cumulative difference).
    """
    # Collect only the two columns the statistic reads, so vector columns
    # (features, rawPrediction, ...) are neither shipped to the driver nor
    # dragged through the groupby aggregations below.
    data_pd = data.select('label', 'probability').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    # rank(pct=True) lies in (0, 1]; truncating pct * 10 yields buckets
    # 0..10, with the top-ranked rows (pct == 1.0) landing in bucket 10.
    data_pd['bucket'] = (data_pd['probability'].rank(pct=True) * 10).astype(int)
    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['probability'].min().to_frame(name='min_prob')
    kstable['max_prob'] = grouped['probability'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
    cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
    kstable['ks'] = cum_bad_share - cum_good_share
    ks_value = kstable.ks.abs().max()
    return ks_value

# LinearSVC output carries 'rawPrediction' and 'prediction' but no
# 'probability' column, so the score is taken from rawPrediction instead.
# Element 1 is the raw margin for the positive class. The column keeps the
# name 'probability' because calc_ks reads that name; the value is an
# uncalibrated score, which is sufficient since KS depends only on ranking.
prob_udf = udf(lambda v: float(v[1]), DoubleType())
predictions = predictions.withColumn('probability', prob_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Coefficients of the SVM stage (last stage of the best pipeline).
coefficients = model.bestModel.stages[-1].coefficients.toArray()

# Top 10 features by absolute coefficient value.
important_features = sorted(zip(coefficients, features), key=lambda x: abs(x[0]), reverse=True)[:10]

# Print the 10 highest-magnitude features with their signed coefficient.
print("Las 10 características más importantes (en orden de importancia):")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
from imblearn.under_sampling import TomekLinks
from pyspark.ml.classification import LinearSVC

# Rename the target to 'label' and cast it to double in a single chain.
df8 = (
    df_2.withColumnRenamed('Malo_Dias_tot', 'label')
    .withColumn("label", col("label").cast(DoubleType()))
)
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# Bring the training split to pandas so imbalanced-learn can resample it.
train_pd = train.toPandas()

# Every column except the target is a feature.
features = list(train_pd.columns)
features.remove('label')

# Resample the training split only, using Tomek links.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(train_pd[features], train_pd['label'])

# Rebuild a single pandas frame (features + label) and hand it back to Spark.
resampled_features_pd = pd.DataFrame(X_resampled, columns=features)
resampled_label_pd = pd.Series(y_resampled, name='label')
train_resampled = spark.createDataFrame(
    pd.concat([resampled_features_pd, resampled_label_pd], axis=1)
)

# Feature columns: every column of df8 except the target.
features = df8.columns
features.remove('label')
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Linear SVM estimator.
# NOTE: this rebinds the name `svm`, which the first cell of the notebook
# imports as the scikit-learn module.
svm = LinearSVC(labelCol="label", featuresCol="features")

# Pipeline: assemble the feature vector, then classify.
pipeline = Pipeline(stages=[assembler, svm])

# Cross-validation grid over the regularization strength.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(svm.regParam, [0.1, 0.01])
paramGrid = grid_builder.build()

# Candidate models are compared by area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

# Cross-validator tying together the pipeline, the grid and the evaluator.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    seed=0,
)

# Fit on the resampled training data.
model = cv.fit(train_resampled)

# Score the held-out test split with the cross-validated model.
predictions = model.transform(test)

# Area under the ROC curve from the raw scores.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
roc_auc = evaluator.evaluate(predictions)

# Weighted recall, F1 and accuracy all read the same label/prediction columns.
multiclass_cols = {"labelCol": "label", "predictionCol": "prediction"}
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall", **multiclass_cols)
recall = evaluator.evaluate(predictions)
evaluator = MulticlassClassificationEvaluator(metricName="f1", **multiclass_cols)
f1 = evaluator.evaluate(predictions)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", **multiclass_cols)
accuracy = evaluator.evaluate(predictions)

# Report the four summary metrics, one per line, three decimals each.
print(
    f"ROC-AUC: {roc_auc:.3f}",
    f"Recall: {recall:.3f}",
    f"F1: {f1:.3f}",
    f"Accuracy: {accuracy:.3f}",
    sep="\n",
)

# Confusion matrix from (prediction, label) pairs via the RDD-based metrics API.
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()
print(f"Matriz de confusión:\n{confusion_matrix}")

# Calcular el estadístico KS
from pyspark.sql.functions import udf

def calc_ks(data):
    """Return the Kolmogorov-Smirnov (KS) statistic of a scored DataFrame.

    Rows are bucketed by the percentile rank of ``probability``; the result
    is the largest absolute gap between the cumulative share of bads
    (``label == 1``) and the cumulative share of goods (``label == 0``)
    across the buckets.

    Args:
        data: DataFrame exposing ``select(...).toPandas()`` with a numeric
            ``probability`` column and a ``label`` column.

    Returns:
        The KS statistic (maximum absolute cumulative difference).
    """
    # Collect only the two columns the statistic reads, so vector columns
    # (features, rawPrediction, ...) are neither shipped to the driver nor
    # dragged through the groupby aggregations below.
    data_pd = data.select('label', 'probability').toPandas()
    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    # rank(pct=True) lies in (0, 1]; truncating pct * 10 yields buckets
    # 0..10, with the top-ranked rows (pct == 1.0) landing in bucket 10.
    data_pd['bucket'] = (data_pd['probability'].rank(pct=True) * 10).astype(int)
    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['probability'].min().to_frame(name='min_prob')
    kstable['max_prob'] = grouped['probability'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
    cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
    kstable['ks'] = cum_bad_share - cum_good_share
    ks_value = kstable.ks.abs().max()
    return ks_value

# LinearSVC output carries 'rawPrediction' and 'prediction' but no
# 'probability' column, so the score is taken from rawPrediction instead.
# Element 1 is the raw margin for the positive class. The column keeps the
# name 'probability' because calc_ks reads that name; the value is an
# uncalibrated score, which is sufficient since KS depends only on ranking.
prob_udf = udf(lambda v: float(v[1]), DoubleType())
predictions = predictions.withColumn('probability', prob_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Coefficients of the SVM stage (last stage of the best pipeline).
coefficients = model.bestModel.stages[-1].coefficients.toArray()

# Top 10 features by absolute coefficient value.
important_features = sorted(zip(coefficients, features), key=lambda x: abs(x[0]), reverse=True)[:10]

# Print the 10 highest-magnitude features with their signed coefficient.
print("Las 10 características más importantes (en orden de importancia):")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Seed used for the random train/test split.
random_state = 0

# 80/20 random split of the Spark DataFrame.
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the target is used as a feature.
target_col = 'Malo_Dias_tot'
feature_cols = [name for name in train.columns if name != target_col]

# Stage 1: pack the feature columns into one vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Stage 2: standardize the assembled vector (centered, unit variance).
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=True,
)

# Stage 3: linear SVM on the scaled features, fixed hyper-parameters.
svm = LinearSVC(
    featuresCol='scaledFeatures',
    labelCol=target_col,
    maxIter=10,
    regParam=0.1,
)

# Chain the three stages, fit on the training split, score the test split.
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(train)
predictions = model.transform(test)

# Confusion-matrix cells counted on the scored test rows.
tp = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 0)].count()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (for example
# when the model never predicts the positive class) instead of raising
# ZeroDivisionError and aborting the cell.
total = tp + tn + fp + fn
predicted_positive = tp + fp
actual_positive = tp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

def calc_ks(data):
    """Return the Kolmogorov-Smirnov (KS) statistic of a scored DataFrame.

    Rows are bucketed by the percentile rank of ``score``; the result is the
    largest absolute gap between the cumulative share of bads
    (``Malo_Dias_tot == 1``) and the cumulative share of goods
    (``Malo_Dias_tot == 0``) across the buckets.

    Args:
        data: DataFrame exposing ``select(...).toPandas()`` with a numeric
            ``score`` column and a ``Malo_Dias_tot`` column.

    Returns:
        The KS statistic (maximum absolute cumulative difference).
    """
    # Collect only the two columns the statistic reads, so vector columns
    # (features, rawPrediction, ...) are neither shipped to the driver nor
    # dragged through the groupby aggregations below.
    data_pd = data.select('Malo_Dias_tot', 'score').toPandas()
    data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
    data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
    # rank(pct=True) lies in (0, 1]; truncating pct * 10 yields buckets
    # 0..10, with the top-ranked rows (pct == 1.0) landing in bucket 10.
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)
    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['score'].min().to_frame(name='min_score')
    kstable['max_score'] = grouped['score'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
    cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
    kstable['ks'] = cum_bad_share - cum_good_share
    ks_value = kstable.ks.abs().max()
    return ks_value

import pandas as pd  # used below; this cell does not import pandas itself

# Element 0 of rawPrediction is used as the ranking score for the KS statistic.
score_udf = udf(lambda v: float(v[0]), DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Top 10 variables of the linear model. Coefficients are signed, so they are
# ranked by absolute value: a strongly negative weight matters as much as a
# strongly positive one. The printed value keeps its sign.
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance'])
by_magnitude = importance_df['Importance'].abs().sort_values(ascending=False).index
importance_df = importance_df.loc[by_magnitude]
print(importance_df.head(10))


NameError: ignored

In [None]:
from imblearn.over_sampling import SMOTE
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

import pandas as pd  # needed for pd.concat below; this cell does not import it

# Seed shared by the split and by SMOTE.
random_state = 0

# Split BEFORE resampling. Oversampling the full dataset first would place
# synthetic rows (interpolated from rows that end up in training) into the
# test set and inflate every metric computed on it.
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Only the training split is converted to pandas and resampled.
pandas_df = train.toPandas()

# Separate the features and the label.
X = pandas_df.drop('Malo_Dias_tot', axis=1)
y = pandas_df['Malo_Dias_tot']

# Apply SMOTE to the training data.
smote = SMOTE(random_state=random_state)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert the resampled training data back to a PySpark DataFrame; the test
# split stays untouched.
resampled_data = spark.createDataFrame(pd.concat([X_resampled, y_resampled], axis=1))
train = resampled_data

# Every column except the target is used as a feature.
feature_cols = [name for name in train.columns if name != 'Malo_Dias_tot']

# Pack the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the assembled features.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM without cross-validation.
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='Malo_Dias_tot', maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM; fit on the resampled training data.
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(train)

# Score the untouched test split.
predictions = model.transform(test)

# Confusion-matrix cells counted on the scored test rows.
tp = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 0)].count()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (for example
# when the model never predicts the positive class) instead of raising
# ZeroDivisionError and aborting the cell.
total = tp + tn + fp + fn
predicted_positive = tp + fp
actual_positive = tp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

def calc_ks(data):
    """Return the Kolmogorov-Smirnov (KS) statistic of a scored DataFrame.

    Rows are bucketed by the percentile rank of ``score``; the result is the
    largest absolute gap between the cumulative share of bads
    (``Malo_Dias_tot == 1``) and the cumulative share of goods
    (``Malo_Dias_tot == 0``) across the buckets.

    Args:
        data: DataFrame exposing ``select(...).toPandas()`` with a numeric
            ``score`` column and a ``Malo_Dias_tot`` column.

    Returns:
        The KS statistic (maximum absolute cumulative difference).
    """
    # Collect only the two columns the statistic reads, so vector columns
    # (features, rawPrediction, ...) are neither shipped to the driver nor
    # dragged through the groupby aggregations below.
    data_pd = data.select('Malo_Dias_tot', 'score').toPandas()
    data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
    data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
    # rank(pct=True) lies in (0, 1]; truncating pct * 10 yields buckets
    # 0..10, with the top-ranked rows (pct == 1.0) landing in bucket 10.
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)
    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['score'].min().to_frame(name='min_score')
    kstable['max_score'] = grouped['score'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
    cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
    kstable['ks'] = cum_bad_share - cum_good_share
    ks_value = kstable.ks.abs().max()
    return ks_value

import pandas as pd  # used below; this cell does not import pandas itself

# Element 0 of rawPrediction is used as the ranking score for the KS statistic.
score_udf = udf(lambda v: float(v[0]), DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Top 10 variables of the linear model. Coefficients are signed, so they are
# ranked by absolute value: a strongly negative weight matters as much as a
# strongly positive one. The printed value keeps its sign.
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance'])
by_magnitude = importance_df['Importance'].abs().sort_values(ascending=False).index
importance_df = importance_df.loc[by_magnitude]
print(importance_df.head(10))


In [None]:
from imblearn.over_sampling import ADASYN
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

import pandas as pd  # needed for pd.concat below; this cell does not import it

# Seed shared by the split and by ADASYN.
random_state = 0

# Split BEFORE resampling. Oversampling the full dataset first would place
# synthetic rows (generated from rows that end up in training) into the
# test set and inflate every metric computed on it.
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Only the training split is converted to pandas and resampled.
pandas_df = train.toPandas()

# Separate the features and the label.
X = pandas_df.drop('Malo_Dias_tot', axis=1)
y = pandas_df['Malo_Dias_tot']

# Apply ADASYN to the training data.
adasyn = ADASYN(random_state=random_state)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# Convert the resampled training data back to a PySpark DataFrame; the test
# split stays untouched.
resampled_data = spark.createDataFrame(pd.concat([X_resampled, y_resampled], axis=1))
train = resampled_data

# Every column except the target is used as a feature.
feature_cols = [name for name in train.columns if name != 'Malo_Dias_tot']

# Pack the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the assembled features.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM without cross-validation.
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='Malo_Dias_tot', maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM; fit on the resampled training data.
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(train)

# Score the untouched test split.
predictions = model.transform(test)

# Confusion-matrix cells counted on the scored test rows.
tp = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 0)].count()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (for example
# when the model never predicts the positive class) instead of raising
# ZeroDivisionError and aborting the cell.
total = tp + tn + fp + fn
predicted_positive = tp + fp
actual_positive = tp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

def calc_ks(data):
    """Return the Kolmogorov-Smirnov (KS) statistic of a scored DataFrame.

    Rows are bucketed by the percentile rank of ``score``; the result is the
    largest absolute gap between the cumulative share of bads
    (``Malo_Dias_tot == 1``) and the cumulative share of goods
    (``Malo_Dias_tot == 0``) across the buckets.

    Args:
        data: DataFrame exposing ``select(...).toPandas()`` with a numeric
            ``score`` column and a ``Malo_Dias_tot`` column.

    Returns:
        The KS statistic (maximum absolute cumulative difference).
    """
    # Collect only the two columns the statistic reads, so vector columns
    # (features, rawPrediction, ...) are neither shipped to the driver nor
    # dragged through the groupby aggregations below.
    data_pd = data.select('Malo_Dias_tot', 'score').toPandas()
    data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
    data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
    # rank(pct=True) lies in (0, 1]; truncating pct * 10 yields buckets
    # 0..10, with the top-ranked rows (pct == 1.0) landing in bucket 10.
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)
    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['score'].min().to_frame(name='min_score')
    kstable['max_score'] = grouped['score'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
    cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
    kstable['ks'] = cum_bad_share - cum_good_share
    ks_value = kstable.ks.abs().max()
    return ks_value

import pandas as pd  # used below; this cell does not import pandas itself

# Element 0 of rawPrediction is used as the ranking score for the KS statistic.
score_udf = udf(lambda v: float(v[0]), DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Top 10 variables of the linear model. Coefficients are signed, so they are
# ranked by absolute value: a strongly negative weight matters as much as a
# strongly positive one. The printed value keeps its sign.
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance'])
by_magnitude = importance_df['Importance'].abs().sort_values(ascending=False).index
importance_df = importance_df.loc[by_magnitude]
print(importance_df.head(10))


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, rand, udf
from pyspark.sql.types import DoubleType

# Seed shared by the random subset selection and the split.
random_state = 0

# Count the rows of each class. The rows of a groupBy result come back in
# no guaranteed order, so the counts are looked up by label value rather
# than by position in the collected list.
class_counts = df_2.groupBy('Malo_Dias_tot').count().collect()
counts_by_label = {row['Malo_Dias_tot']: row['count'] for row in class_counts}
num_positives = counts_by_label.get(1, 0)
num_negatives = counts_by_label.get(0, 0)

# Keep as many negatives as there are positives (never more than exist).
# This replaces int(p / n * n), a float round trip that can truncate to a
# value below p.
num_to_keep = min(num_positives, num_negatives)

# Select a random subset of the majority class.
majority_subset = df_2.filter(col('Malo_Dias_tot') == 0).orderBy(rand(seed=random_state)).limit(num_to_keep)

# Combine the majority subset with the minority class to create the undersampled data.
undersampled_data = majority_subset.union(df_2.filter(col('Malo_Dias_tot') == 1))

# NOTE(review): the split happens after undersampling, so the test set is
# balanced too and does not reflect the original class ratio -- confirm
# this is intended.
train, test = undersampled_data.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the target is used as a feature.
feature_cols = [name for name in train.columns if name != 'Malo_Dias_tot']

# Pack the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the assembled features.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM without cross-validation.
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='Malo_Dias_tot', maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM; fit on the training split.
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(train)

# Score the test split.
predictions = model.transform(test)

# Confusion-matrix cells counted on the scored test rows.
tp = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 0)].count()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (for example
# when the model never predicts the positive class) instead of raising
# ZeroDivisionError and aborting the cell.
total = tp + tn + fp + fn
predicted_positive = tp + fp
actual_positive = tp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

def calc_ks(data):
    """Return the Kolmogorov-Smirnov (KS) statistic of a scored DataFrame.

    Rows are bucketed by the percentile rank of ``score``; the result is the
    largest absolute gap between the cumulative share of bads
    (``Malo_Dias_tot == 1``) and the cumulative share of goods
    (``Malo_Dias_tot == 0``) across the buckets.

    Args:
        data: DataFrame exposing ``select(...).toPandas()`` with a numeric
            ``score`` column and a ``Malo_Dias_tot`` column.

    Returns:
        The KS statistic (maximum absolute cumulative difference).
    """
    # Collect only the two columns the statistic reads, so vector columns
    # (features, rawPrediction, ...) are neither shipped to the driver nor
    # dragged through the groupby aggregations below.
    data_pd = data.select('Malo_Dias_tot', 'score').toPandas()
    data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
    data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
    # rank(pct=True) lies in (0, 1]; truncating pct * 10 yields buckets
    # 0..10, with the top-ranked rows (pct == 1.0) landing in bucket 10.
    data_pd['bucket'] = (data_pd['score'].rank(pct=True) * 10).astype(int)
    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['score'].min().to_frame(name='min_score')
    kstable['max_score'] = grouped['score'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
    cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
    kstable['ks'] = cum_bad_share - cum_good_share
    ks_value = kstable.ks.abs().max()
    return ks_value

import pandas as pd  # used below; this cell does not import pandas itself

# Element 0 of rawPrediction is used as the ranking score for the KS statistic.
score_udf = udf(lambda v: float(v[0]), DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")

# Top 10 variables of the linear model. Coefficients are signed, so they are
# ranked by absolute value: a strongly negative weight matters as much as a
# strongly positive one. The printed value keeps its sign.
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance'])
by_magnitude = importance_df['Importance'].abs().sort_values(ascending=False).index
importance_df = importance_df.loc[by_magnitude]
print(importance_df.head(10))



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Seed shared by the split and the tree.
random_state = 0

# 80/20 random split of the Spark DataFrame.
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the target is used as a feature.
target_col = 'Malo_Dias_tot'
feature_cols = [name for name in train.columns if name != target_col]

# Stage 1: pack the feature columns into one vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Stage 2: decision tree classifier, no cross-validation.
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol=target_col,
    seed=random_state,
)

# Chain both stages, fit on the training split, score the test split.
pipeline = Pipeline(stages=[assembler, dt])
model = pipeline.fit(train)
predictions = model.transform(test)

# Confusion-matrix cells counted on the scored test rows.
tp = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 0)].count()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (for example
# when the model never predicts the positive class) instead of raising
# ZeroDivisionError and aborting the cell.
total = tp + tn + fp + fn
predicted_positive = tp + fp
actual_positive = tp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# UDF returning element 1 of the 'probability' vector as a double.
probability_udf = udf(lambda vec: float(vec[1]), DoubleType())

# Store that value in a scalar 'score' column.
predictions = predictions.withColumn('score', probability_udf('probability'))

# KS statistic: bucket rows by the percentile rank of the score and take the
# largest absolute gap between the cumulative bad and good shares.
data_pd = predictions.select('Malo_Dias_tot', 'score').toPandas()
target = data_pd['Malo_Dias_tot']
data_pd['good'] = (target == 0).astype(int)
data_pd['bad'] = (target == 1).astype(int)
pct_rank = data_pd['score'].rank(pct=True)
data_pd['bucket'] = (pct_rank * 10).astype(int)
grouped = data_pd.groupby('bucket', as_index=True)
kstable = grouped['score'].min().to_frame(name='min_score')
kstable['max_score'] = grouped['score'].max()
kstable['bads'] = grouped['bad'].sum()
kstable['goods'] = grouped['good'].sum()
kstable = kstable.reset_index()
kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
kstable['ks'] = cum_bad_share - cum_good_share
ks_value = kstable.ks.abs().max()

print(f'KS Statistic: {ks_value:.3f}')

# Ten features with the highest importance reported by the fitted tree.
importances = model.stages[-1].featureImportances.toArray()
importance_rows = list(zip(feature_cols, importances))
importance_df = pd.DataFrame(importance_rows, columns=['Feature', 'Importance'])
importance_df = importance_df.sort_values('Importance', ascending=False)
print(importance_df.head(10))


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Seed shared by the split, SMOTE and the tree.
random_state = 0

# Split BEFORE resampling so the test set holds only real rows; oversampling
# the full dataset first would leak synthetic rows into the test set.
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Only the training split is converted to pandas and resampled.
data_pd = train.toPandas()

# Every column except the target is used as a feature.
feature_cols = [name for name in data_pd.columns if name != 'Malo_Dias_tot']

# Extract the feature matrix and label vector.
X = data_pd[feature_cols].values
y = data_pd['Malo_Dias_tot'].values

# Oversample the training data with SMOTE.
smote = SMOTE(random_state=random_state)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Rebuild ONE frame in which each resampled row keeps its own label. The
# previous crossJoin of a features frame with a labels frame paired every
# feature row with every label (N * N rows), destroying the row/label
# correspondence.
resampled_pd = pd.DataFrame(X_resampled, columns=feature_cols)
resampled_pd['Malo_Dias_tot'] = y_resampled
data_resampled = spark.createDataFrame(resampled_pd)
train = data_resampled

# Pack the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Decision tree without cross-validation.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='Malo_Dias_tot', seed=random_state)

# Chain assembler and tree; fit on the resampled training data.
pipeline = Pipeline(stages=[assembler, dt])
model = pipeline.fit(train)

# Score the untouched test split.
predictions = model.transform(test)

# Confusion-matrix cells counted on the scored test rows.
tp = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 1)].count()
tn = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 0)].count()
fp = predictions[(predictions.Malo_Dias_tot == 0) & (predictions.prediction == 1)].count()
fn = predictions[(predictions.Malo_Dias_tot == 1) & (predictions.prediction == 0)].count()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (for example
# when the model never predicts the positive class) instead of raising
# ZeroDivisionError and aborting the cell.
total = tp + tn + fp + fn
predicted_positive = tp + fp
actual_positive = tp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# UDF returning element 1 of the 'probability' vector as a double.
probability_udf = udf(lambda vec: float(vec[1]), DoubleType())

# Store that value in a scalar 'score' column.
predictions = predictions.withColumn('score', probability_udf('probability'))

# KS statistic: bucket rows by the percentile rank of the score and take the
# largest absolute gap between the cumulative bad and good shares.
data_pd = predictions.select('Malo_Dias_tot', 'score').toPandas()
target = data_pd['Malo_Dias_tot']
data_pd['good'] = (target == 0).astype(int)
data_pd['bad'] = (target == 1).astype(int)
pct_rank = data_pd['score'].rank(pct=True)
data_pd['bucket'] = (pct_rank * 10).astype(int)
grouped = data_pd.groupby('bucket', as_index=True)
kstable = grouped['score'].min().to_frame(name='min_score')
kstable['max_score'] = grouped['score'].max()
kstable['bads'] = grouped['bad'].sum()
kstable['goods'] = grouped['good'].sum()
kstable = kstable.reset_index()
kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
cum_bad_share = (kstable.bads / kstable.bads.sum()).cumsum()
cum_good_share = (kstable.goods / kstable.goods.sum()).cumsum()
kstable['ks'] = cum_bad_share - cum_good_share
ks_value = kstable.ks.abs().max()

print(f'KS Statistic: {ks_value:.3f}')

# Ten features with the highest importance reported by the fitted tree.
importances = model.stages[-1].featureImportances.toArray()
importance_rows = list(zip(feature_cols, importances))
importance_df = pd.DataFrame(importance_rows, columns=['Feature', 'Importance'])
importance_df = importance_df.sort_values('Importance', ascending=False)
print(importance_df.head(10))


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Seed shared by the split, the resampler and the tree so the run is repeatable
random_state = 0

# Convert the Spark DataFrame to a Pandas DataFrame
data_pd = df_2.toPandas()

# Every column except the target is used as a feature
feature_cols = [col for col in data_pd.columns if col != 'Malo_Dias_tot']

# Extract the feature matrix and label vector
X = data_pd[feature_cols].values
y = data_pd['Malo_Dias_tot'].values

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Chain SMOTE and an untuned Decision Tree (no cross-validation) in one pipeline
smote = SMOTE(random_state=random_state)
dt = DecisionTreeClassifier(random_state=random_state)
pipeline = Pipeline([('smote', smote), ('dt', dt)])

# Fit the pipeline to the training data
model = pipeline.fit(X_train, y_train)

# Hard class predictions and positive-class scores for the test set
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)[:, 1]

# Calculate metrics
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Stored as `f1` so the imported f1_score function is not overwritten by a float
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# AUC is a ranking metric, so it is computed from the scores, not the hard labels
auc = roc_auc_score(y_test, y_score)
print(f'AUC: {auc}')

# Calculate the KS statistic from a decile table of the scores
data_pd = pd.DataFrame({'Malo_Dias_tot': y_test, 'score': y_score})
data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
# Ranks run 1..n (ties share the average rank), so (rank - 1) * 10 // n is always in 0..9
data_pd['bucket'] = ((data_pd['score'].rank() - 1) * 10 // len(data_pd)).astype(int)
grouped = data_pd.groupby('bucket', as_index=True)
kstable = grouped['score'].min().to_frame(name='min_score')
kstable['max_score'] = grouped['score'].max()
kstable['bads'] = grouped['bad'].sum()
kstable['goods'] = grouped['good'].sum()
kstable = kstable.reset_index()
kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
# KS = largest gap between the cumulative share of bads and of goods across buckets
kstable['ks'] = (kstable.bads / kstable.bads.sum()).cumsum() - (kstable.goods / kstable.goods.sum()).cumsum()
ks_value = kstable.ks.abs().max()

print(f'KS Statistic: {ks_value:.3f}')

# Get the top 10 most important variables of the model
importances = model.named_steps['dt'].feature_importances_
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)
print(importance_df.head(10))


In [None]:
import pandas as pd
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Seed shared by the split, the resampler and the tree so the run is repeatable
random_state = 0

# Convert the Spark DataFrame to a Pandas DataFrame
data_pd = df_2.toPandas()

# Every column except the target is used as a feature
feature_cols = [col for col in data_pd.columns if col != 'Malo_Dias_tot']

# Extract the feature matrix and label vector
X = data_pd[feature_cols].values
y = data_pd['Malo_Dias_tot'].values

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Chain ADASYN and an untuned Decision Tree (no cross-validation) in one pipeline
adasyn = ADASYN(random_state=random_state)
dt = DecisionTreeClassifier(random_state=random_state)
pipeline = Pipeline([('adasyn', adasyn), ('dt', dt)])

# Fit the pipeline to the training data
model = pipeline.fit(X_train, y_train)

# Hard class predictions and positive-class scores for the test set
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)[:, 1]

# Calculate metrics
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Stored as `f1` so the imported f1_score function is not overwritten by a float
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# AUC is a ranking metric, so it is computed from the scores, not the hard labels
auc = roc_auc_score(y_test, y_score)
print(f'AUC: {auc}')

# Calculate the KS statistic from a decile table of the scores
data_pd = pd.DataFrame({'Malo_Dias_tot': y_test, 'score': y_score})
data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
# Ranks run 1..n (ties share the average rank), so (rank - 1) * 10 // n is always in 0..9
data_pd['bucket'] = ((data_pd['score'].rank() - 1) * 10 // len(data_pd)).astype(int)
grouped = data_pd.groupby('bucket', as_index=True)
kstable = grouped['score'].min().to_frame(name='min_score')
kstable['max_score'] = grouped['score'].max()
kstable['bads'] = grouped['bad'].sum()
kstable['goods'] = grouped['good'].sum()
kstable = kstable.reset_index()
kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
# KS = largest gap between the cumulative share of bads and of goods across buckets
kstable['ks'] = (kstable.bads / kstable.bads.sum()).cumsum() - (kstable.goods / kstable.goods.sum()).cumsum()
ks_value = kstable.ks.abs().max()

print(f'KS Statistic: {ks_value:.3f}')

# Get the top 10 most important variables of the model
importances = model.named_steps['dt'].feature_importances_
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)
print(importance_df.head(10))


In [None]:
import pandas as pd
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Seed shared by the split and the tree so the run is repeatable
random_state = 0

# Convert the Spark DataFrame to a Pandas DataFrame
data_pd = df_2.toPandas()

# Every column except the target is used as a feature
feature_cols = [col for col in data_pd.columns if col != 'Malo_Dias_tot']

# Extract the feature matrix and label vector
X = data_pd[feature_cols].values
y = data_pd['Malo_Dias_tot'].values

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Chain TomekLinks and an untuned Decision Tree (no cross-validation) in one pipeline
tl = TomekLinks()
dt = DecisionTreeClassifier(random_state=random_state)
pipeline = Pipeline([('tl', tl), ('dt', dt)])

# Fit the pipeline to the training data
model = pipeline.fit(X_train, y_train)

# Hard class predictions and positive-class scores for the test set
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)[:, 1]

# Calculate metrics
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Stored as `f1` so the imported f1_score function is not overwritten by a float
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# AUC is a ranking metric, so it is computed from the scores, not the hard labels
auc = roc_auc_score(y_test, y_score)
print(f'AUC: {auc}')

# Calculate the KS statistic from a decile table of the scores
data_pd = pd.DataFrame({'Malo_Dias_tot': y_test, 'score': y_score})
data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
# Ranks run 1..n (ties share the average rank), so (rank - 1) * 10 // n is always in 0..9
data_pd['bucket'] = ((data_pd['score'].rank() - 1) * 10 // len(data_pd)).astype(int)
grouped = data_pd.groupby('bucket', as_index=True)
kstable = grouped['score'].min().to_frame(name='min_score')
kstable['max_score'] = grouped['score'].max()
kstable['bads'] = grouped['bad'].sum()
kstable['goods'] = grouped['good'].sum()
kstable = kstable.reset_index()
kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
# KS = largest gap between the cumulative share of bads and of goods across buckets
kstable['ks'] = (kstable.bads / kstable.bads.sum()).cumsum() - (kstable.goods / kstable.goods.sum()).cumsum()
ks_value = kstable.ks.abs().max()

print(f'KS Statistic: {ks_value:.3f}')

# Get the top 10 most important variables of the model
importances = model.named_steps['dt'].feature_importances_
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)
print(importance_df.head(10))


In [None]:
# Imports are local to this cell so it does not depend on (or get broken by)
# names that other cells may have rebound, e.g. f1_score.
import numpy as np
from keras.layers import LSTM, Dense
from keras.models import Sequential
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Move the target column to the last position so features and label can be
# taken by position.
df_red = df_2.toPandas()
columnas = df_red.columns.tolist()
columnas.remove('Malo_Dias_tot')
columnas.append('Malo_Dias_tot')
df_red = df_red.reindex(columns=columnas)

# Every column but the last is a feature; the last one is the label
X = df_red.iloc[:, :-1].values
y = df_red.iloc[:, -1].values

# NOTE: random_state is not defined in this cell; it is expected to come from
# an earlier cell of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Fit the scaler on the training split only, then apply it to both splits
sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Add a trailing axis: the LSTM layer below is declared with
# input_shape=(n_features, 1)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

model = Sequential()
model.add(LSTM(30, input_shape=(X_train.shape[1], 1)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Errors on class 1 are weighted five times as much as errors on class 0
class_weight = {0: 0.5, 1: 2.5}

model.fit(X_train, y_train, epochs=20, class_weight=class_weight)

loss, accuracy = model.evaluate(X_test, y_test)

# Sigmoid outputs thresholded at 0.5 to obtain class labels
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test, y_pred_classes)
accuracy = accuracy_score(y_test, y_pred_classes)
confusion = confusion_matrix(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)














In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Move the target column to the last position so features and label can be
# taken by position.
df_red = df_2.toPandas()
columnas = df_red.columns.tolist()
columnas.remove('Malo_Dias_tot')
columnas.append('Malo_Dias_tot')
df_red = df_red.reindex(columns=columnas)

# Every column but the last is a feature; the last one is the label
feature_names = columnas[:-1]
X = df_red.iloc[:, :-1].values
y = df_red.iloc[:, -1].values

# NOTE: random_state and np are not defined in this cell; they are expected to
# come from an earlier cell of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Fit the scaler on the training split only, then apply it to both splits
sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Add a trailing axis: the LSTM layer below is declared with
# input_shape=(n_features, 1)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

model = Sequential()
model.add(LSTM(30, input_shape=(X_train.shape[1], 1)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Errors on class 1 are weighted five times as much as errors on class 0
class_weight = {0: 0.5, 1: 2.5}

model.fit(X_train, y_train, epochs=20, class_weight=class_weight)

loss, accuracy = model.evaluate(X_test, y_test)

# Sigmoid outputs (used as scores) and their 0.5-thresholded class labels
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test, y_pred_classes)
accuracy = accuracy_score(y_test, y_pred_classes)
confusion = confusion_matrix(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
roc_auc = roc_auc_score(y_test, y_pred)

# Perturbation sensitivity: shift one feature at a time by its own standard
# deviation and record the mean absolute change in the predicted score.
sensitivities = np.zeros(X_test.shape[1])
for i in range(X_test.shape[1]):
    X_test_perturbed = X_test.copy()
    X_test_perturbed[:, i] += np.std(X_test[:, i])
    y_pred_perturbed = model.predict(X_test_perturbed)
    sensitivities[i] = np.mean(np.abs(y_pred_perturbed - y_pred))
sorted_idx = np.argsort(sensitivities)[::-1]

# Print the feature ranking: rank position, feature name, sensitivity
print("Ranking de características:")
for rank, i in enumerate(sorted_idx, start=1):
    print(f"{rank}. Característica {feature_names[i]} ({sensitivities[i]:.3f})")

print(f'F1 Score: {f1:.3f}')
print(f'Accuracy: {accuracy:.3f}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Recall: {recall:.3f}')
print(f'ROC-AUC: {roc_auc:.3f}')


In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Move the target column to the last position so features and label can be
# taken by position.
df_red = df_2.toPandas()
columnas = df_red.columns.tolist()
columnas.remove('Malo_Dias_tot')
columnas.append('Malo_Dias_tot')
df_red = df_red.reindex(columns=columnas)

# Every column but the last is a feature; the last one is the label
feature_names = columnas[:-1]
X = df_red.iloc[:, :-1].values
y = df_red.iloc[:, -1].values

# NOTE: random_state, np and pd are not defined in this cell; they are expected
# to come from an earlier cell of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Fit the scaler on the training split only, then apply it to both splits
sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Add a trailing axis: the LSTM layer below is declared with
# input_shape=(n_features, 1)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

model = Sequential()
model.add(LSTM(30, input_shape=(X_train.shape[1], 1)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Errors on class 1 are weighted five times as much as errors on class 0
class_weight = {0: 0.5, 1: 2.5}

model.fit(X_train, y_train, epochs=20, class_weight=class_weight)

loss, accuracy = model.evaluate(X_test, y_test)

# Sigmoid outputs (used as scores) and their 0.5-thresholded class labels
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test, y_pred_classes)
accuracy = accuracy_score(y_test, y_pred_classes)
confusion = confusion_matrix(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
roc_auc = roc_auc_score(y_test, y_pred)

# Perturbation sensitivity: shift one feature at a time by its own standard
# deviation and record the mean absolute change in the predicted score.
sensitivities = np.zeros(X_test.shape[1])
for i in range(X_test.shape[1]):
    X_test_perturbed = X_test.copy()
    X_test_perturbed[:, i] += np.std(X_test[:, i])
    y_pred_perturbed = model.predict(X_test_perturbed)
    sensitivities[i] = np.mean(np.abs(y_pred_perturbed - y_pred))
sorted_idx = np.argsort(sensitivities)[::-1]

# Print the feature ranking: rank position, feature name, sensitivity
print("Ranking de características:")
for rank, i in enumerate(sorted_idx, start=1):
    print(f"{rank}. Característica {feature_names[i]} ({sensitivities[i]:.3f})")

# Calculate the KS statistic from a decile table of the scores.
# The scores already computed above are reused instead of predicting again.
data_pd = pd.DataFrame({'Malo_Dias_tot': y_test,
                        'score': y_pred.ravel()})
data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
# Ranks run 1..n (ties share the average rank), so (rank - 1) * 10 // n is always in 0..9
data_pd['bucket'] = ((data_pd['score'].rank() - 1) * 10 // len(data_pd)).astype(int)
grouped = data_pd.groupby('bucket', as_index=True)
kstable = grouped['score'].min().to_frame(name='min_score')
kstable['max_score'] = grouped['score'].max()
kstable['bads'] = grouped['bad'].sum()
kstable['goods'] = grouped['good'].sum()
kstable = kstable.reset_index()
kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
# KS = largest gap between the cumulative share of bads and of goods across buckets
kstable['ks'] = (kstable.bads / kstable.bads.sum()).cumsum() - \
                (kstable.goods / kstable.goods.sum()).cumsum()
ks_value = kstable.ks.abs().max()

print(f'KS Statistic: {ks_value:.3f}')
print(f'F1 Score: {f1:.3f}')
print(f'Accuracy: {accuracy:.3f}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Recall: {recall:.3f}')
print(f'ROC-AUC: {roc_auc:.3f}')


In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense

# Move the target column to the last position so features and label can be
# taken by position.
df_red = df_2.toPandas()
columnas = df_red.columns.tolist()
columnas.remove('Malo_Dias_tot')
columnas.append('Malo_Dias_tot')
df_red = df_red.reindex(columns=columnas)

# Every column but the last is a feature; the last one is the label
feature_names = columnas[:-1]
X = df_red.iloc[:, :-1].values
y = df_red.iloc[:, -1].values

# NOTE: random_state, np and pd are not defined in this cell; they are expected
# to come from an earlier cell of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Fit the scaler on the training split only, then apply it to both splits
sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

model = Sequential()
# NOTE(review): no activation is passed to this hidden layer, so it looks like
# a purely linear layer feeding the sigmoid output - confirm this is intended.
model.add(Dense(30, input_shape=(X_train.shape[1],)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Errors on class 1 are weighted five times as much as errors on class 0
class_weight = {0: 0.5, 1: 2.5}

model.fit(X_train, y_train, epochs=20, class_weight=class_weight)

loss, accuracy = model.evaluate(X_test, y_test)

# Sigmoid outputs (used as scores) and their 0.5-thresholded class labels
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test, y_pred_classes)
accuracy = accuracy_score(y_test, y_pred_classes)
confusion = confusion_matrix(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
roc_auc = roc_auc_score(y_test, y_pred)

# Perturbation sensitivity: shift one feature at a time by its own standard
# deviation and record the mean absolute change in the predicted score.
sensitivities = np.zeros(X_test.shape[1])
for i in range(X_test.shape[1]):
    X_test_perturbed = X_test.copy()
    X_test_perturbed[:, i] += np.std(X_test[:, i])
    y_pred_perturbed = model.predict(X_test_perturbed)
    sensitivities[i] = np.mean(np.abs(y_pred_perturbed - y_pred))
sorted_idx = np.argsort(sensitivities)[::-1]

# Print the feature ranking: rank position, feature name, sensitivity
print("Ranking de características:")
for rank, i in enumerate(sorted_idx, start=1):
    print(f"{rank}. Característica {feature_names[i]} ({sensitivities[i]:.3f})")

# Calculate the KS statistic from a decile table of the scores.
# The scores already computed above are reused instead of predicting again.
data_pd = pd.DataFrame({'Malo_Dias_tot': y_test,
                        'score': y_pred.ravel()})
data_pd['good'] = (data_pd['Malo_Dias_tot'] == 0).astype(int)
data_pd['bad'] = (data_pd['Malo_Dias_tot'] == 1).astype(int)
# Ranks run 1..n (ties share the average rank), so (rank - 1) * 10 // n is always in 0..9
data_pd['bucket'] = ((data_pd['score'].rank() - 1) * 10 // len(data_pd)).astype(int)
grouped = data_pd.groupby('bucket', as_index=True)
kstable = grouped['score'].min().to_frame(name='min_score')
kstable['max_score'] = grouped['score'].max()
kstable['bads'] = grouped['bad'].sum()
kstable['goods'] = grouped['good'].sum()
kstable = kstable.reset_index()
kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
# KS = largest gap between the cumulative share of bads and of goods across buckets
kstable['ks'] = (kstable.bads / kstable.bads.sum()).cumsum() - \
                (kstable.goods / kstable.goods.sum()).cumsum()
ks_value = kstable.ks.abs().max()

print(f'KS Statistic: {ks_value:.3f}')
print(f'F1 Score: {f1:.3f}')
print(f'Accuracy: {accuracy:.3f}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Recall: {recall:.3f}')
print(f'ROC-AUC: {roc_auc:.3f}')


In [None]:
import matplotlib.pyplot as plt
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN

# Load data
# df_2 = ...

# Convert to pandas under a separate name so df_2 itself is not rebound: other
# cells call Spark methods (toPandas, withColumnRenamed) on df_2. If df_2 is
# already a pandas DataFrame, a copy is used as-is.
df_resample = df_2.toPandas() if hasattr(df_2, 'toPandas') else df_2.copy()

# Features as a DataFrame, target as a 1-D Series so class counts can be taken
# with boolean sums.
X = df_resample.drop(columns='Malo_Dias_tot')
y = df_resample['Malo_Dias_tot']

# Apply the resampling techniques
tl = TomekLinks(sampling_strategy='auto')
X_tl, y_tl = tl.fit_resample(X, y)

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

adasyn = ADASYN(sampling_strategy='auto', random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)

# Plot the class counts of the original data and of every resampled variant
class_distributions = [
    ('Original', y),
    ('Tomek Links', y_tl),
    ('Random Undersampling', y_rus),
    ('SMOTE', y_smote),
    ('ADASYN', y_adasyn),
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
flat_axes = axes.flatten()

for ax, (title, labels) in zip(flat_axes, class_distributions):
    ax.bar(['0', '1'], [(labels == 0).sum(), (labels == 1).sum()])
    ax.set_title(f'{title} (Total: {len(labels)})')

# The grid has one more cell than there are distributions; hide the spare one
for ax in flat_axes[len(class_distributions):]:
    ax.set_visible(False)

plt.show()


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

# Rename the target to 'label' and cast it to double for the Spark ML estimators
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))
train, test = df8.randomSplit([0.7, 0.3], seed=12345)

# Define features and label: every column except the label is a feature
features = [name for name in df8.columns if name != 'label']
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Create the Random Forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=0)

# Create the pipeline
pipeline = Pipeline(stages=[assembler, rf])

# Define the parameter grid for cross-validation
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 20]).build()

# Area under ROC is used both to select the model and to score the test set
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")

# Create the cross-validator object
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator)

# Fit the model on the training data
model = cv.fit(train)

# Make predictions on the test data
predictions = model.transform(test)

# Calculate the ROC-AUC and accuracy metrics
roc_auc = evaluator.evaluate(predictions)
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Accuracy: {accuracy:.3f}")

# Calculate the confusion matrix. It is stored as `conf_matrix` so the sklearn
# confusion_matrix function imported by other cells is not overwritten.
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()
print(f"Matriz de confusión:\n{conf_matrix}")

# Calculate recall and F1 manually; each ratio falls back to 0.0 when its
# denominator is zero instead of producing NaN.
TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]
precision_manual = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall_manual = TP / (TP + FN) if (TP + FN) > 0 else 0.0
pr_sum = precision_manual + recall_manual
f1_manual = 2 * (precision_manual * recall_manual) / pr_sum if pr_sum > 0 else 0.0
print(f"Recall (calculado manualmente): {recall_manual:.3f}")
print(f"F1 (calculado manualmente): {f1_manual:.3f}")

# Get the most important variables, sorted from most to least important
importances = model.bestModel.stages[-1].featureImportances.toArray()
important_features = sorted(zip(importances, features), reverse=True)
print("Variables más importantes:")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")

def calc_ks(data):
    """Return the KS statistic of ``score`` against ``label`` from a decile table.

    Rows are ranked by ``score`` and split into ten equal-width rank buckets
    (0..9). The statistic is the largest absolute gap, over the buckets,
    between the cumulative share of bads (``label == 1``) and the cumulative
    share of goods (``label == 0``).

    Args:
        data: A Spark DataFrame (anything exposing ``select`` and
            ``toPandas``) or a pandas DataFrame, with ``label`` and ``score``
            columns. Other columns are ignored. The input is not modified.

    Returns:
        The KS value as a float. It is NaN when one of the two classes is
        absent, because that class's share is then 0/0.
    """
    columns = ['label', 'score']
    if hasattr(data, 'toPandas'):
        # Select before converting: only two columns are needed, and vector
        # columns (features, rawPrediction, ...) must not reach the groupby.
        data_pd = data.select(*columns).toPandas()
    else:
        data_pd = data[columns].copy()

    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    # Ranks run 1..n (ties share the average rank), so (rank - 1) * 10 // n is
    # always in 0..9.
    data_pd['bucket'] = ((data_pd['score'].rank() - 1) * 10 // len(data_pd)).astype(int)

    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['score'].min().to_frame(name='min_score')
    kstable['max_score'] = grouped['score'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    kstable['ks'] = (
        (kstable.bads / kstable.bads.sum()).cumsum()
        - (kstable.goods / kstable.goods.sum()).cumsum()
    )
    return kstable.ks.abs().max()

def _first_component(vec):
    """Return element 0 of a vector as a plain float."""
    return float(vec[0])


# The score is the first entry of the rawPrediction vector
# (presumably the class-0 raw score - confirm against the classifier).
score_udf = udf(_first_component, DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))

# KS of that score against the label on the scored rows
ks_value = calc_ks(predictions)
print(f"Estadístico KS: {ks_value:.3f}")



In [None]:
from imblearn.under_sampling import NearMiss
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import pandas as pd

# Rename the target to 'label' and cast it to double for the Spark ML estimators
df8 = df_2.withColumnRenamed('Malo_Dias_tot', 'label')
df8 = df8.withColumn("label", col("label").cast(DoubleType()))

# Set the seed for reproducibility
seed = 12345

# Split the data into training and test sets
train, test = df8.randomSplit([0.7, 0.3], seed=seed)

# Convert the training data to a Pandas DataFrame (NearMiss works on pandas)
train_pd = train.toPandas()

# Define features and label: every column except the label is a feature
features = [name for name in df8.columns if name != 'label']

# Separate the features and label
X = train_pd[features]
y = train_pd['label']

# Perform NearMiss undersampling on the training split only; the test split
# is left untouched.
nm = NearMiss()
X_resampled, y_resampled = nm.fit_resample(X, y)

# Convert the resampled data back to a PySpark DataFrame.
# NOTE: `spark` is not defined in this cell; it is expected to be the active
# SparkSession created elsewhere in the notebook.
train_undersampled_pd = pd.concat([X_resampled, y_resampled], axis=1)
train_undersampled = spark.createDataFrame(train_undersampled_pd)

assembler = VectorAssembler(inputCols=features, outputCol="features")

# Create the Random Forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=seed)

# Create the pipeline
pipeline = Pipeline(stages=[assembler, rf])

# Define the parameter grid for cross-validation
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 20]).build()

# Area under ROC is used both to select the model and to score the test set
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")

# Create the cross-validator object
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator)

# Fit the model on the undersampled training data
model = cv.fit(train_undersampled)

# Make predictions on the test data
predictions = model.transform(test)

# Calculate ROC-AUC and accuracy metrics
roc_auc = evaluator.evaluate(predictions)
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)

print(f"ROC-AUC: {roc_auc:.3f}")
print(f"Accuracy: {accuracy:.3f}")

# Calculate the confusion matrix. It is stored as `conf_matrix` so the sklearn
# confusion_matrix function imported by other cells is not overwritten.
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()
print(f"Confusion matrix:\n{conf_matrix}")

# Manually calculate recall and F1 score; each ratio falls back to 0.0 when
# its denominator is zero instead of producing NaN.
TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]
precision_manual = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall_manual = TP / (TP + FN) if (TP + FN) > 0 else 0.0
pr_sum = precision_manual + recall_manual
f1_manual = 2 * (precision_manual * recall_manual) / pr_sum if pr_sum > 0 else 0.0
print(f"Recall (manually calculated): {recall_manual:.3f}")
print(f"F1 (manually calculated): {f1_manual:.3f}")

# Get the most important features, sorted from most to least important
importances = model.bestModel.stages[-1].featureImportances.toArray()
important_features = sorted(zip(importances, features), reverse=True)
print("Most important features:")
for importance, feature in important_features:
    print(f"{feature}: {importance:.3f}")

def calc_ks(data):
    """Return the KS statistic of ``score`` against ``label`` from a decile table.

    Rows are ranked by ``score`` and split into ten equal-width rank buckets
    (0..9). The statistic is the largest absolute gap, over the buckets,
    between the cumulative share of bads (``label == 1``) and the cumulative
    share of goods (``label == 0``).

    Args:
        data: A Spark DataFrame (anything exposing ``select`` and
            ``toPandas``) or a pandas DataFrame, with ``label`` and ``score``
            columns. Other columns are ignored. The input is not modified.

    Returns:
        The KS value as a float. It is NaN when one of the two classes is
        absent, because that class's share is then 0/0.
    """
    columns = ['label', 'score']
    if hasattr(data, 'toPandas'):
        # Select before converting: only two columns are needed, and vector
        # columns (features, rawPrediction, ...) must not reach the groupby.
        data_pd = data.select(*columns).toPandas()
    else:
        data_pd = data[columns].copy()

    data_pd['good'] = (data_pd['label'] == 0).astype(int)
    data_pd['bad'] = (data_pd['label'] == 1).astype(int)
    # Ranks run 1..n (ties share the average rank), so (rank - 1) * 10 // n is
    # always in 0..9.
    data_pd['bucket'] = ((data_pd['score'].rank() - 1) * 10 // len(data_pd)).astype(int)

    grouped = data_pd.groupby('bucket', as_index=True)
    kstable = grouped['score'].min().to_frame(name='min_score')
    kstable['max_score'] = grouped['score'].max()
    kstable['bads'] = grouped['bad'].sum()
    kstable['goods'] = grouped['good'].sum()
    kstable = kstable.reset_index()
    kstable['bad_rate'] = kstable.bads / (kstable.bads + kstable.goods)
    kstable['ks'] = (
        (kstable.bads / kstable.bads.sum()).cumsum()
        - (kstable.goods / kstable.goods.sum()).cumsum()
    )
    return kstable.ks.abs().max()

def _first_component(vec):
    """Return element 0 of a vector as a plain float."""
    return float(vec[0])


# The score is the first entry of the rawPrediction vector
# (presumably the class-0 raw score - confirm against the classifier).
score_udf = udf(_first_component, DoubleType())
predictions = predictions.withColumn('score', score_udf('rawPrediction'))

# KS of that score against the label on the scored rows
ks_value = calc_ks(predictions)
print(f"KS statistic: {ks_value:.3f}")
