<a href="https://colab.research.google.com/github/CristValen/ML-Python/blob/main/python_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SVM baseline: grid-search the kernel and C, then evaluate on a held-out test set.

from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed shared by the split, the SVC and the permutation fallback below
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the variables (statistics are learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM model with cross-validated grid search
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC(probability=True, random_state=random_state)
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_scaled, y_train)

# Predict values for the test set
y_pred = clf.predict(X_test_scaled)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# coef_ only exists for a linear kernel; the grid above can also select 'rbf',
# in which case permutation importance on the test set is used instead.
best_model = clf.best_estimator_
if best_model.kernel == 'linear':
    importance_values = best_model.coef_[0]
else:
    from sklearn.inspection import permutation_importance
    importance_values = permutation_importance(
        best_model, X_test_scaled, y_test, n_repeats=5, random_state=random_state
    ).importances_mean
# NOTE(review): the ranking is by signed value, so strongly negative linear
# coefficients will not show up in the top 10 -- confirm this is intended.
feature_importances = pd.DataFrame(importance_values, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))

In [None]:
# Linear SVM trained on a SMOTE-resampled training set.
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed reused by the split, SMOTE and the SVC
random_state = 42

# Pull the Spark DataFrame into pandas and separate the target from the features
pandas_df = df_2.toPandas()
y = pandas_df['Malos_Dias_tot']
X = pandas_df.drop(columns='Malos_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Standardize with statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample the scaled training data with SMOTE
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Fit the linear SVM on the resampled data
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Score the untouched (scaled) test split
y_pred = clf.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
positive_scores = clf.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Rank features by their (signed) linear coefficient and show the top 10
feature_importances = (
    pd.Series(clf.coef_[0], index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))

#


In [None]:
# Linear SVM with random undersampling and a cross-validated search over C.
# All names used below are imported here so the cell does not depend on
# other cells having been executed first.
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# Seed shared by the split, the sampler and the SVC
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply RandomUnderSampler to the training set only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler to the resampled training data
scaler.fit(X_train_resampled)

# Transform the training and test data
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the SVM model with cross-validation.
# Only C is searched: gamma has no effect with kernel='linear', so including
# it only repeated identical fits.
param_grid = {'C': [0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(kernel='linear', probability=True, random_state=random_state), param_grid, refit=True)
grid.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = grid.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, grid.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 variables, ranked by signed linear coefficient
feature_importances = pd.DataFrame(grid.best_estimator_.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))




In [None]:
# Linear SVM with ADASYN oversampling and a cross-validated search over C.
# All names used below are imported here so the cell does not depend on
# other cells having been executed first.
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np

# Seed shared by the split, the sampler and the SVC
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply ADASYN to the training set only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler to the resampled training data
scaler.fit(X_train_resampled)

# Transform the training and test data
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the SVM model with cross-validation.
# Only C is searched: gamma has no effect with kernel='linear', so including
# it only repeated identical fits.
param_grid = {'C': [0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(kernel='linear', probability=True, random_state=random_state), param_grid, refit=True)
grid.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = grid.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, grid.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 variables, ranked by signed linear coefficient
feature_importances = pd.DataFrame(grid.best_estimator_.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))




In [None]:
# SVM with TomekLinks cleaning and a cross-validated search over C and kernel.
# All names used below are imported here so the cell does not depend on
# other cells having been executed first.
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks
import pandas as pd
import numpy as np

# Seed shared by the split, the SVC and the permutation fallback below
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply TomekLinks to the training set only
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(X_train, y_train)

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler to the resampled training data
scaler.fit(X_train_resampled)

# Transform the training and test data
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Define the parameters for the grid search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# Create the GridSearchCV object
grid_search = GridSearchCV(svm.SVC(probability=True, random_state=random_state), param_grid, cv=5)

# Train the SVM model with grid search
grid_search.fit(X_train_resampled, y_train_resampled)

# Predict values for the test set
y_pred = grid_search.predict(X_test)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, grid_search.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# coef_ only exists for a linear kernel; the grid above can also select 'rbf',
# in which case permutation importance on the test set is used instead.
best_model = grid_search.best_estimator_
if best_model.kernel == 'linear':
    importance_values = best_model.coef_[0]
else:
    from sklearn.inspection import permutation_importance
    importance_values = permutation_importance(
        best_model, X_test, y_test, n_repeats=5, random_state=random_state
    ).importances_mean
feature_importances = pd.DataFrame(importance_values, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))



In [None]:
# SVM without cross-validation (default SVC hyper-parameters)
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Seed shared by the split, the SVC and the permutation importance below
random_state = 42

# Convert the PySpark DataFrame to Pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the variables (statistics are learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM model without cross-validation
svc = svm.SVC(probability=True, random_state=random_state)
svc.fit(X_train_scaled, y_train)

# Predict values for the test set
y_pred = svc.predict(X_test_scaled)

# Calculate metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, svc.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Get the top 10 most important variables of the model.
# SVC defaults to the 'rbf' kernel, for which coef_ is not available, so the
# coefficient ranking is only used when the kernel is linear; otherwise
# permutation importance on the test set is computed.
if svc.kernel == 'linear':
    importance_values = svc.coef_[0]
else:
    from sklearn.inspection import permutation_importance
    importance_values = permutation_importance(
        svc, X_test_scaled, y_test, n_repeats=5, random_state=random_state
    ).importances_mean
feature_importances = pd.DataFrame(importance_values, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))




In [None]:
# Linear SVM (no cross-validation) fitted on SMOTE-resampled, standardized data.
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split, SMOTE and the classifier
random_state = 42

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malos_Dias_tot']
X = pandas_df.drop(columns='Malos_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Scaling statistics come from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE is applied to the scaled training data; the test split is left as is
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Fit the linear-kernel SVC
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features by signed linear coefficient
feature_importances = (
    pd.Series(clf.coef_[0], index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
# Linear SVM (no cross-validation) fitted on a randomly undersampled training set.
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split, the sampler and the classifier
random_state = 42

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malos_Dias_tot']
X = pandas_df.drop(columns='Malos_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Undersample the training split only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Learn scaling statistics on the resampled training data and apply them to
# both splits (X_test is overwritten with its scaled version)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the linear-kernel SVC
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features by signed linear coefficient
feature_importances = (
    pd.Series(clf.coef_[0], index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
# Linear SVM (no cross-validation) fitted on an ADASYN-resampled training set.
import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split, the sampler and the classifier
random_state = 42

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malos_Dias_tot']
X = pandas_df.drop(columns='Malos_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Resample the training split only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Learn scaling statistics on the resampled training data and apply them to
# both splits (X_test is overwritten with its scaled version)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the linear-kernel SVC
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features by signed linear coefficient
feature_importances = (
    pd.Series(clf.coef_[0], index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
# Linear SVM (no cross-validation) fitted on a TomekLinks-resampled training set.
import numpy as np
import pandas as pd
from imblearn.under_sampling import TomekLinks
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split and the classifier
random_state = 42

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malos_Dias_tot']
X = pandas_df.drop(columns='Malos_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Resample the training split only
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(X_train, y_train)

# Learn scaling statistics on the resampled training data and apply them to
# both splits (X_test is overwritten with its scaled version)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the linear-kernel SVC
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features by signed linear coefficient
feature_importances = (
    pd.Series(clf.coef_[0], index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
# Random forest baseline (default hyper-parameters, no resampling)

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split and the forest
random_state = 0

# Materialize the Spark DataFrame in pandas, then separate target and features.
# NOTE(review): the SVM cells of this notebook use 'Malos_Dias_tot' as the
# target column while this cell uses 'Malo_Dias_tot' -- confirm which name
# actually exists in df_2.
pandas_df = df_2.toPandas()
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Learn scaling statistics on the training split, then overwrite both splits
# with their scaled versions
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit the forest
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train, y_train)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features according to the forest's feature_importances_
feature_importances = (
    pd.Series(clf.feature_importances_, index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))



In [None]:
# Random forest fitted on a SMOTE-resampled training set.
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split, SMOTE and the forest
random_state = 0

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Resample the training split only
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Learn scaling statistics on the resampled training data and apply them to
# both splits (X_test is overwritten with its scaled version)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the forest
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features according to the forest's feature_importances_
feature_importances = (
    pd.Series(clf.feature_importances_, index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
# Random forest fitted on a randomly undersampled training set.
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split, the sampler and the forest
random_state = 0

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Undersample the training split only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Learn scaling statistics on the resampled training data and apply them to
# both splits (X_test is overwritten with its scaled version)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the forest
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features according to the forest's feature_importances_
feature_importances = (
    pd.Series(clf.feature_importances_, index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
# Random forest fitted on an ADASYN-resampled training set.
import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One seed for the split, the sampler and the forest
random_state = 0

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Resample the training split only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Learn scaling statistics on the resampled training data and apply them to
# both splits (X_test is overwritten with its scaled version)
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the forest
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features according to the forest's feature_importances_
feature_importances = (
    pd.Series(clf.feature_importances_, index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
### Decision tree baseline (default hyper-parameters, no resampling)

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# One seed for the split and the tree
random_state = 0

# Materialize the Spark DataFrame in pandas, then separate target and features
pandas_df = df_2.toPandas()
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Learn scaling statistics on the training split, then overwrite both splits
# with their scaled versions
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit the tree
clf = DecisionTreeClassifier(random_state=random_state)
clf.fit(X_train, y_train)

# Test-set predictions and the standard reports
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic = max(TPR - FPR) over the ROC thresholds
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Top 10 features according to the tree's feature_importances_
feature_importances = (
    pd.Series(clf.feature_importances_, index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
print(feature_importances.head(10))


In [None]:
### PySpark linear SVM: assemble -> scale -> LinearSVC, evaluated on a random 80/20 split
# NOTE: this cell rebinds `StandardScaler` (to the PySpark class) and `svm`
# (to a LinearSVC instance); earlier cells of this notebook use those names for
# the scikit-learn scaler and the sklearn.svm module, so re-import them before
# re-running those cells.
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Set the seed for the random number generator
random_state = 0

# Convert the PySpark DataFrame to Pandas (not used further in this cell)
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Define the feature columns (everything except the label)
feature_cols = [col for col in train.columns if col != 'Malo_Dias_tot']

# Create a VectorAssembler to combine the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Create a StandardScaler to standardize the features
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Support Vector Machine model (no cross-validation)
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='Malo_Dias_tot', maxIter=10, regParam=0.1)

# Create a pipeline to chain the assembler, scaler and SVM together
pipeline = Pipeline(stages=[assembler, scaler, svm])

# Fit the pipeline to the training data
model = pipeline.fit(train)

# Predict values for the test set
predictions = model.transform(test)

# Confusion-matrix cells, obtained with a single aggregation instead of four
# separate filtered counts. Keys are (label, prediction).
cell_counts = {
    (int(row['Malo_Dias_tot']), int(row['prediction'])): row['count']
    for row in predictions.groupBy('Malo_Dias_tot', 'prediction').count().collect()
}
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (e.g. the model
# never predicts the positive class) instead of raising ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Calculate AUC using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# Area under the precision-recall curve, reported under its own name
evaluator.setMetricName('areaUnderPR')
auc_pr = evaluator.evaluate(predictions)
print(f'AUC-PR: {auc_pr}')

# KS statistic: largest gap between TPR and FPR along the ROC curve, computed
# from the positive-class raw score (second component of rawPrediction).
pdf = predictions.select('Malo_Dias_tot', 'rawPrediction').toPandas()
fpr, tpr, thresholds = roc_curve(pdf['Malo_Dias_tot'], pdf['rawPrediction'].apply(lambda x: x[1]))
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Get the top 10 variables, ranked by signed coefficient of the fitted LinearSVC
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)
print(importance_df.head(10))

# Display the ROC curve against the diagonal reference line
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()





In [None]:
# PySpark linear SVM trained on a randomly undersampled training split.
# NOTE: this cell rebinds `StandardScaler` (to the PySpark class) and `svm`
# (to a LinearSVC instance); earlier cells of this notebook use those names for
# the scikit-learn scaler and the sklearn.svm module, so re-import them before
# re-running those cells.
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Set the seed for the random number generator
random_state = 0

# Convert the PySpark DataFrame to Pandas (not used further in this cell)
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Define the feature columns (everything except the label)
feature_cols = [col for col in train.columns if col != 'Malo_Dias_tot']

# Perform random undersampling to balance the classes in the training data.
# The most frequent label and its row count come from the same aggregation,
# so the majority class is not counted a second time.
majority_row = train.groupby('Malo_Dias_tot').count().orderBy('count', ascending=False).first()
majority_class = majority_row[0]
majority_class_count = majority_row[1]
minority_rows = train.filter(train.Malo_Dias_tot != majority_class)
minority_class_count = minority_rows.count()
# Sampling without replacement requires a fraction in [0, 1]
sampling_fraction = min(1.0, float(minority_class_count) / majority_class_count)
undersampled_train = train.filter(train.Malo_Dias_tot == majority_class).sample(False, sampling_fraction, seed=random_state)
undersampled_train = undersampled_train.union(minority_rows)

# Create a VectorAssembler to combine the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Create a StandardScaler to standardize the features
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Support Vector Machine model (no cross-validation)
svm = LinearSVC(featuresCol='scaledFeatures', labelCol='Malo_Dias_tot', maxIter=10, regParam=0.1)

# Create a pipeline to chain the assembler, scaler and SVM together
pipeline = Pipeline(stages=[assembler, scaler, svm])

# Fit the pipeline to the undersampled training data
model = pipeline.fit(undersampled_train)

# Predict values for the (untouched) test set
predictions = model.transform(test)

# Confusion-matrix cells, obtained with a single aggregation instead of four
# separate filtered counts. Keys are (label, prediction).
cell_counts = {
    (int(row['Malo_Dias_tot']), int(row['prediction'])): row['count']
    for row in predictions.groupBy('Malo_Dias_tot', 'prediction').count().collect()
}
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Each ratio falls back to 0.0 when its denominator is zero (e.g. the model
# never predicts the positive class) instead of raising ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Calculate AUC using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='Malo_Dias_tot', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# Area under the precision-recall curve, reported under its own name
evaluator.setMetricName('areaUnderPR')
auc_pr = evaluator.evaluate(predictions)
print(f'AUC-PR: {auc_pr}')

# KS statistic: largest gap between TPR and FPR along the ROC curve, computed
# from the positive-class raw score (second component of rawPrediction).
pdf = predictions.select('Malo_Dias_tot', 'rawPrediction').toPandas()
fpr, tpr, thresholds = roc_curve(pdf['Malo_Dias_tot'], pdf['rawPrediction'].apply(lambda x: x[1]))
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Get the top 10 variables, ranked by signed coefficient of the fitted LinearSVC
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)
print(importance_df.head(10))

# Display the ROC curve against the diagonal reference line
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()



In [None]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from imblearn.over_sampling import ADASYN

# Seed shared by the train/test split and ADASYN so reruns are reproducible
random_state = 0

# Binary target column, referenced through one name so the spelling lives in one place.
# NOTE(review): confirm this spelling against df_2's schema.
target_col = 'Malo_Dias_tot'

# Full pandas copy of the dataset; nothing below reads it, it is kept only
# because it is a notebook-level variable.
pandas_df = df_2.toPandas()

# Split the Spark DataFrame into train (80%) and test (20%)
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the target is a feature
feature_cols = [c for c in train.columns if c != target_col]

# ADASYN oversampling runs in pandas, so the training split is collected,
# resampled, and converted back to a Spark DataFrame.
adasyn = ADASYN(random_state=random_state)
train_pd = train.toPandas()
X_resampled, y_resampled = adasyn.fit_resample(train_pd[feature_cols], train_pd[target_col])
oversampled_train = spark.createDataFrame(pd.concat([X_resampled, y_resampled], axis=1))

# Combine the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the features (zero mean, unit standard deviation)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM with fixed hyper-parameters (no cross-validation)
svm = LinearSVC(featuresCol='scaledFeatures', labelCol=target_col, maxIter=10, regParam=0.1)

# Chain assembler, scaler and SVM, then fit on the oversampled training data
pipeline = Pipeline(stages=[assembler, scaler, svm])
model = pipeline.fit(oversampled_train)

# Score the untouched test set
predictions = model.transform(test)

# Confusion-matrix cells from a single aggregation instead of four separate
# filter+count jobs over the predictions.
cell_counts = {
    (int(row[target_col]), int(row['prediction'])): row['count']
    for row in predictions.groupBy(target_col, 'prediction').count().collect()
}
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Guard every ratio: a model that predicts a single class (or an empty test
# set) must report 0.0 rather than raise ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# Area under the ROC curve
evaluator = BinaryClassificationEvaluator(labelCol=target_col, rawPredictionCol='rawPrediction', metricName='areaUnderROC')
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc}')

# Area under the precision-recall curve, reported under its own name
pr_auc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'})
print(f'Area under PR: {pr_auc}')

# ROC points from the collected scores; element 1 of rawPrediction is used as
# the positive-class score.
pdf = predictions.select(target_col, 'rawPrediction').toPandas()
fpr, tpr, thresholds = roc_curve(pdf[target_col], pdf['rawPrediction'].apply(lambda x: x[1]))

# KS statistic: the largest gap between the true-positive and false-positive
# rates over all thresholds.
ks = float((tpr - fpr).max())
print(f'KS: {ks}')

# Top 10 variables ranked by coefficient magnitude, so that strongly negative
# weights are not pushed to the bottom of the list.
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance'])
importance_df = importance_df.reindex(importance_df['Importance'].abs().sort_values(ascending=False).index)
print(importance_df.head(10))

# Display the ROC curve against the chance diagonal
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()


In [None]:
from imblearn.over_sampling import SMOTE
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
import pandas as pd

# Imported here because this cell's own import list does not bring it in;
# it is needed for the KS statistic below.
from sklearn.metrics import roc_curve

# Name of the dependent variable
label = "label"

# Rename the Malos_Dias_tot column to label and cast it to double
df8 = df_2.withColumnRenamed("Malos_Dias_tot", label)
df8 = df8.withColumn(label, col(label).cast(DoubleType()))

# Split the data into training (70%) and test (30%) sets
train, test = df8.randomSplit([0.7, 0.3], seed=0)

# Independent variables: every column except the label
features = train.columns
features.remove(label)

# SMOTE runs in pandas, so the training split is collected first
train_pd = train.toPandas()

# Seeded so the synthetic samples, like the split above, are reproducible
smote = SMOTE(random_state=0)

# Oversample the training data only; the test set is left untouched
X_resampled, y_resampled = smote.fit_resample(train_pd[features], train_pd[label])

# Rebuild a single pandas DataFrame with features and label
train_resampled_pd = pd.DataFrame(X_resampled, columns=features)
train_resampled_pd[label] = y_resampled

# Convert the resampled data back to a PySpark DataFrame
train_resampled = spark.createDataFrame(train_resampled_pd)

# Assemble the feature vector
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Gradient-boosted trees classifier
gbt = GBTClassifier(labelCol=label, featuresCol="features")

# Pipeline: assembler followed by the classifier
pipeline = Pipeline(stages=[assembler, gbt])

# Hyper-parameter grid explored by cross-validation
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [2, 4, 6]) \
    .addGrid(gbt.maxBins, [20, 60]) \
    .addGrid(gbt.maxIter, [10, 20]) \
    .build()

# evaluator1 drives model selection and reports ROC-AUC; evaluator2 reports
# accuracy and weighted F1
evaluator1 = BinaryClassificationEvaluator(labelCol=label)
evaluator2 = MulticlassClassificationEvaluator(labelCol=label)

# 5-fold cross-validator.
# NOTE(review): the folds are drawn from the SMOTE-resampled data, so
# validation folds contain synthetic rows; the test metrics below are the
# ones computed on real data only.
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator1,
                    numFolds=5)

# Fit on the resampled training data
cvModel = cv.fit(train_resampled)

# Score the test data with the best model
predictions = cvModel.transform(test)

# Evaluation metrics computed by Spark
accuracy = evaluator2.evaluate(predictions, {evaluator2.metricName: "accuracy"})
f1 = evaluator2.evaluate(predictions, {evaluator2.metricName: "f1"})
roc_auc = evaluator1.evaluate(predictions)
confusion_matrix = predictions.groupBy(label).pivot("prediction").count().na.fill(0).orderBy(label).collect()

# Collect scores, hard predictions and labels for the pandas-side metrics
preds_and_labels = predictions.select(['probability', 'prediction', label])
preds_and_labels_pd = preds_and_labels.toPandas()

# The probability column holds ML vectors, which cannot be indexed with
# col(...)[1] on the Spark side; take element 1 (positive class) after
# collecting instead.
preds_and_labels_pd['prob_of_positive'] = preds_and_labels_pd['probability'].apply(lambda v: float(v[1]))

# Recall of the positive class. The evaluator's "weightedRecall" averages
# recall over both classes weighted by class size, which is the same number
# as accuracy.
is_positive = preds_and_labels_pd[label] == 1.0
true_positives = (is_positive & (preds_and_labels_pd['prediction'] == 1.0)).sum()
recall = float(true_positives) / float(is_positive.sum()) if is_positive.any() else 0.0

# KS statistic: the largest gap between the true-positive and false-positive
# rates over all score thresholds.
fpr, tpr, _ = roc_curve(preds_and_labels_pd[label], preds_and_labels_pd['prob_of_positive'])
ks_stat = tpr - fpr
ks_value = float(ks_stat.max())

print("Accuracy:", accuracy)
print("F1:", f1)
print("Recall:", recall)
print("ROC-AUC:", roc_auc)
print("Confusion Matrix:", confusion_matrix)
print("KS:", ks_value)

In [None]:
import numpy as np
import pandas as pd

def calc_iv(df, target):
    """
    Compute the Information Value (IV) of every column in a DataFrame.

    Each distinct value of a column is treated as one group. For every group
    the share of events and the share of non-events are computed, the Weight
    of Evidence is ``log(non_event_rate / event_rate)`` and the IV is the sum
    over groups of ``(non_event_rate - event_rate) * woe``.

    Parameters:
    df: pandas DataFrame holding the data.
    target: name of the target column. It is summed per group to count
        events, so it is expected to hold 0/1 values.

    Returns:
    iv_df: DataFrame with columns 'Variable' and 'IV', one row per column of
        ``df`` other than ``target``, in column order.

    Note:
    A group with no events or no non-events has an infinite WOE, so the IV of
    that variable is infinite. Columns with many distinct values should be
    binned before calling this function.
    """
    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0.
    records = []

    for column in df.columns:
        if column == target:
            continue

        # Per distinct value: number of rows and number of events
        freq_df = df.groupby(column)[target].agg(['count', 'sum'])
        events = freq_df['sum']
        non_events = freq_df['count'] - events

        # Share of all events / non-events that falls in each group
        event_rate = events / events.sum()
        non_event_rate = non_events / (freq_df['count'].sum() - events.sum())

        # Weight of Evidence per group, then IV for the variable
        woe = np.log(non_event_rate / event_rate)
        iv = ((non_event_rate - event_rate) * woe).sum()

        records.append({'Variable': column, 'IV': iv})

    return pd.DataFrame(records, columns=['Variable', 'IV'])

# Name of the column that holds the target labels
label_col = "label_column"

# Information Value of every other column in df, then show the table
iv_df = calc_iv(df, target=label_col)
print(iv_df)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from pyspark.sql.functions import col, when

# Imported here because this cell's own import list does not bring them in:
# StringType separates categorical from numeric columns, roc_curve is needed
# for the KS statistic.
from pyspark.sql.types import StringType
from sklearn.metrics import roc_curve

# Binary target column, referenced through one name so the spelling lives in one place.
# NOTE(review): confirm this spelling against df_2's schema.
target_col = 'Malo_dias_tot'

# Split the data into training (70%) and test (30%) sets
train, test = df_2.randomSplit([0.7, 0.3], seed=12345)

# Undersample class 0 down to the size of class 1 to treat class imbalance.
# The fraction is capped at 1.0 (sampleBy rejects larger fractions) and an
# empty class 0 no longer raises ZeroDivisionError.
positive_count = train.where(col(target_col) == 1).count()
negative_count = train.where(col(target_col) == 0).count()
ratio = min(1.0, positive_count / negative_count) if negative_count else 1.0
train = train.sampleBy(target_col, fractions={0: ratio, 1: 1.0}, seed=12345)

# Separate string-typed (categorical) feature columns from the rest
feature_fields = [f for f in train.schema.fields if f.name != target_col]
categoricalColumns = [f.name for f in feature_fields if isinstance(f.dataType, StringType)]
numericCols = [f.name for f in feature_fields if not isinstance(f.dataType, StringType)]

# Pipeline stages: one indexer per categorical column, the label indexer,
# the assembler and the classifier
stages = [StringIndexer(inputCol=c, outputCol=c + "Index") for c in categoricalColumns]

# Order label values alphabetically so the smaller value maps to 0.0 and the
# larger to 1.0. The default ordering is by frequency, which is arbitrary
# once the classes have been balanced and would make "label 1" ambiguous.
label_stringIdx = StringIndexer(inputCol=target_col, outputCol="label", stringOrderType="alphabetAsc")
stages += [label_stringIdx]

assemblerInputs = [c + "Index" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
stages += [rf]
pipeline = Pipeline(stages=stages)

# Parameter grid and evaluator for cross-validation
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 20]).build()
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

# 3-fold cross-validator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Fit the model with cross-validation
cvModel = crossval.fit(train)

# Score the test set
predictions = cvModel.transform(test)

# KS statistic: the largest gap between the true-positive and false-positive
# rates over all thresholds of the positive-class probability. It needs the
# scores, not the hard 0/1 predictions.
scores_pd = predictions.select("probability", "label").toPandas()
positive_scores = scores_pd["probability"].apply(lambda v: float(v[1]))
fpr, tpr, _ = roc_curve(scores_pd["label"], positive_scores)
print("KS:", float((tpr - fpr).max()))

# Confusion-matrix based metrics from (prediction, label) pairs of floats
predictionAndLabels = predictions.select("prediction", "label").rdd.map(
    lambda row: (float(row[0]), float(row[1]))
)
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion matrix:", metrics.confusionMatrix().toArray())
print("Accuracy:", metrics.accuracy)
print("Recall:", metrics.recall(1.0))
print("F1:", metrics.fMeasure(1.0))
evaluator.setMetricName("areaUnderROC")
print("ROC-AUC:", evaluator.evaluate(predictions))

# Show the variables ordered from most to least important
featureImportances = cvModel.bestModel.stages[-1].featureImportances.toArray()
for idx in featureImportances.argsort()[::-1]:
    print(assemblerInputs[idx], featureImportances[idx])
