<a href="https://colab.research.google.com/github/CristValen/ML-Python/blob/main/python_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#svm

from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# SVM baseline: grid search over kernel and C on standardized features,
# then report the confusion matrix, classification report, KS statistic and
# the ten most influential features.

# Seed shared by the train/test split, the SVC and the permutation fallback
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the features: statistics are learned on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM with cross-validated grid search over kernel and C
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC(probability=True, random_state=random_state)
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_scaled, y_train)

# Predict labels for the test set
y_pred = clf.predict(X_test_scaled)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features.
# `coef_` only exists for the linear kernel; if the search selected `rbf`,
# accessing it raises AttributeError, so fall back to permutation importance.
best_svc = clf.best_estimator_
if best_svc.kernel == 'linear':
    # Rank by magnitude: a large negative weight is as influential as a large
    # positive one. The signed coefficient is kept in the output.
    feature_importances = pd.DataFrame(best_svc.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
else:
    from sklearn.inspection import permutation_importance

    # Mean drop in test accuracy when each feature is shuffled
    perm = permutation_importance(best_svc, X_test_scaled, y_test, n_repeats=5, random_state=random_state)
    feature_importances = pd.DataFrame(perm.importances_mean, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))




In [None]:
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Linear SVM trained on a SMOTE-balanced training set; evaluated on the
# untouched test split.

# Seed shared by the split, SMOTE and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the features: statistics are learned on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample with SMOTE on the training set only (the test set stays as-is)
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Train the linear SVM
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = clf.predict(X_test_scaled)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude.
# Sorting on the signed value would hide features with strong negative
# weights; the sign is still shown in the 'importance' column.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
print(feature_importances.head(10))

#


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Linear SVM with random undersampling and a cross-validated search over C.

# RandomUnderSampler is not imported by any earlier cell, so import it here
# to keep this cell runnable in notebook order.
from imblearn.under_sampling import RandomUnderSampler

# Seed shared by the split, the sampler and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Undersample the training set only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Standardize using statistics learned from the resampled training data
scaler = StandardScaler()
scaler.fit(X_train_resampled)

# Apply the same transform to train and test
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Grid search over C only: `gamma` has no effect on a linear kernel, so
# searching it just repeated every fit three times with identical results.
param_grid = {'C': [0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(kernel='linear', probability=True, random_state=random_state), param_grid, refit=True)
grid.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = grid.predict(X_test)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, grid.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude (the sign
# is kept in the 'importance' column).
feature_importances = pd.DataFrame(grid.best_estimator_.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
print(feature_importances.head(10))




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Linear SVM with ADASYN oversampling and a cross-validated search over C.

# ADASYN is not imported by any earlier cell, so import it here to keep this
# cell runnable in notebook order.
from imblearn.over_sampling import ADASYN

# Seed shared by the split, ADASYN and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Oversample the training set only with ADASYN
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Standardize using statistics learned from the resampled training data
scaler = StandardScaler()
scaler.fit(X_train_resampled)

# Apply the same transform to train and test
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Grid search over C only: `gamma` has no effect on a linear kernel, so
# searching it just repeated every fit three times with identical results.
param_grid = {'C': [0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(kernel='linear', probability=True, random_state=random_state), param_grid, refit=True)
grid.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = grid.predict(X_test)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, grid.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude (the sign
# is kept in the 'importance' column).
feature_importances = pd.DataFrame(grid.best_estimator_.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
print(feature_importances.head(10))




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# SVM with Tomek-links cleaning and a 5-fold grid search over C and kernel.

# TomekLinks is not imported by any earlier cell, so import it here to keep
# this cell runnable in notebook order.
from imblearn.under_sampling import TomekLinks

# Seed shared by the split, the SVC and the permutation fallback
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Remove Tomek links from the training set only
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(X_train, y_train)

# Standardize using statistics learned from the resampled training data
scaler = StandardScaler()
scaler.fit(X_train_resampled)

# Apply the same transform to train and test
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Parameter grid for the search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# 5-fold cross-validated grid search
grid_search = GridSearchCV(svm.SVC(probability=True, random_state=random_state), param_grid, cv=5)

# Fit the search on the resampled, scaled training data
grid_search.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = grid_search.predict(X_test)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, grid_search.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features.
# `coef_` only exists for the linear kernel; if the search selected `rbf`,
# accessing it raises AttributeError, so fall back to permutation importance.
best_svc = grid_search.best_estimator_
if best_svc.kernel == 'linear':
    # Rank by magnitude; the signed coefficient is kept in the output.
    feature_importances = pd.DataFrame(best_svc.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
else:
    from sklearn.inspection import permutation_importance

    # X_test is already scaled at this point (overwritten above)
    perm = permutation_importance(best_svc, X_test, y_test, n_repeats=5, random_state=random_state)
    feature_importances = pd.DataFrame(perm.importances_mean, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))



In [None]:
# Without cross-validation
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# SVM with default hyper-parameters (RBF kernel), no cross-validation.

# permutation_importance replaces `coef_`, which an RBF SVC does not expose
from sklearn.inspection import permutation_importance

# Seed shared by the split, the SVC and the permutation importance
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the features: statistics are learned on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM without cross-validation (default kernel is 'rbf')
svc = svm.SVC(probability=True, random_state=random_state)
svc.fit(X_train_scaled, y_train)

# Predict labels for the test set
y_pred = svc.predict(X_test_scaled)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, svc.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features.
# `svc.coef_` is only available with kernel='linear' and raised
# AttributeError here, so measure importance as the mean drop in test
# accuracy when each feature is shuffled.
perm = permutation_importance(svc, X_test_scaled, y_test, n_repeats=5, random_state=random_state)
feature_importances = pd.DataFrame(perm.importances_mean, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances.head(10))




In [None]:
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Linear SVM on a SMOTE-balanced training set, no cross-validation.

# Seed shared by the split, SMOTE and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardize the features: statistics are learned on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training set only
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Train the linear SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = clf.predict(X_test_scaled)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude so that
# strong negative weights are not pushed to the bottom; the sign is kept in
# the 'importance' column.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
print(feature_importances.head(10))


In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# Linear SVM on a randomly undersampled training set, no cross-validation.

# Seed shared by the split, the sampler and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply RandomUnderSampler to the training set only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Standardize using statistics learned from the resampled training data
scaler = StandardScaler()
scaler.fit(X_train_resampled)

# Apply the same transform to train and test (X_test is overwritten)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the linear SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = clf.predict(X_test)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude so that
# strong negative weights are not pushed to the bottom; the sign is kept in
# the 'importance' column.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
print(feature_importances.head(10))


In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np

# Linear SVM on an ADASYN-oversampled training set, no cross-validation.

# Seed shared by the split, ADASYN and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply ADASYN to the training set only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Standardize using statistics learned from the resampled training data
scaler = StandardScaler()
scaler.fit(X_train_resampled)

# Apply the same transform to train and test (X_test is overwritten)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the linear SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = clf.predict(X_test)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude so that
# strong negative weights are not pushed to the bottom; the sign is kept in
# the 'importance' column.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
print(feature_importances.head(10))


In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks
import pandas as pd
import numpy as np

# Linear SVM on a Tomek-links-cleaned training set, no cross-validation.

# Seed shared by the split and the SVC
random_state = 42

# Convert the PySpark DataFrame to pandas
pandas_df = df_2.toPandas()

# Split the dataset into train and test sets
X = pandas_df.drop('Malos_Dias_tot', axis=1)
y = pandas_df['Malos_Dias_tot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Apply TomekLinks to the training set only
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(X_train, y_train)

# Standardize using statistics learned from the resampled training data
scaler = StandardScaler()
scaler.fit(X_train_resampled)

# Apply the same transform to train and test (X_test is overwritten)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Train the linear SVM without cross-validation
clf = svm.SVC(kernel='linear', probability=True, random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)

# Predict labels for the test set
y_pred = clf.predict(X_test)

# Classification metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest gap between TPR and FPR along the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
ks = np.max(tpr - fpr)
print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude so that
# strong negative weights are not pushed to the bottom; the sign is kept in
# the 'importance' column.
feature_importances = pd.DataFrame(clf.coef_[0], index=X.columns, columns=['importance']).sort_values('importance', key=lambda col: col.abs(), ascending=False)
print(feature_importances.head(10))


In [None]:
#random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Random Forest baseline: default hyper-parameters, no resampling.
# NOTE(review): the target here is 'Malo_Dias_tot' while the SVM cells of
# this notebook use 'Malos_Dias_tot' — confirm which column df_2 contains.
random_state = 0  # seed for both the split and the forest

# Bring the Spark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target vector and feature matrix, then an 80/20 split
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Standardize with statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the forest (no cross-validation) and label the test rows
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix and per-class report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest TPR - FPR gap along the ROC curve
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the highest impurity-based importance
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_}, index=X.columns
).sort_values(by='importance', ascending=False)
print(feature_importances.head(10))



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Random Forest trained on a SMOTE-oversampled training set.
random_state = 0  # seed for the split, SMOTE and the forest

# Bring the Spark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target vector and feature matrix, then an 80/20 split
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Oversample the training rows only
sm = SMOTE(random_state=random_state)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Standardize with statistics learned from the resampled training rows;
# the test rows reuse the same fitted scaler
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the forest (no cross-validation) and label the test rows
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)
y_pred = clf.predict(X_test)

# Confusion matrix and per-class report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest TPR - FPR gap along the ROC curve
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the highest impurity-based importance
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_}, index=X.columns
).sort_values(by='importance', ascending=False)
print(feature_importances.head(10))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np

# Random Forest trained on a randomly undersampled training set.
random_state = 0  # seed for the split, the sampler and the forest

# Bring the Spark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target vector and feature matrix, then an 80/20 split
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Undersample the training rows only
rus = RandomUnderSampler(random_state=random_state)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Standardize with statistics learned from the resampled training rows;
# the test rows reuse the same fitted scaler
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the forest (no cross-validation) and label the test rows
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)
y_pred = clf.predict(X_test)

# Confusion matrix and per-class report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest TPR - FPR gap along the ROC curve
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the highest impurity-based importance
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_}, index=X.columns
).sort_values(by='importance', ascending=False)
print(feature_importances.head(10))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np

# Random Forest trained on an ADASYN-oversampled training set.
random_state = 0  # seed for the split, ADASYN and the forest

# Bring the Spark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target vector and feature matrix, then an 80/20 split
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Oversample the training rows only
adasyn = ADASYN(random_state=random_state)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Standardize with statistics learned from the resampled training rows;
# the test rows reuse the same fitted scaler
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Fit the forest (no cross-validation) and label the test rows
clf = RandomForestClassifier(random_state=random_state)
clf.fit(X_train_resampled, y_train_resampled)
y_pred = clf.predict(X_test)

# Confusion matrix and per-class report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest TPR - FPR gap along the ROC curve
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the highest impurity-based importance
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_}, index=X.columns
).sort_values(by='importance', ascending=False)
print(feature_importances.head(10))


In [None]:
### decision tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Decision Tree baseline: default hyper-parameters, no resampling.
random_state = 0  # seed for both the split and the tree

# Bring the Spark DataFrame into pandas
pandas_df = df_2.toPandas()

# Target vector and feature matrix, then an 80/20 split
y = pandas_df['Malo_Dias_tot']
X = pandas_df.drop(columns='Malo_Dias_tot')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Standardize with statistics learned from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the tree (no cross-validation) and label the test rows
clf = DecisionTreeClassifier(random_state=random_state)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix and per-class report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KS statistic: largest TPR - FPR gap along the ROC curve
positive_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
ks = (tpr - fpr).max()
print(f'KS: {ks}')

# Ten features with the highest impurity-based importance
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_}, index=X.columns
).sort_values(by='importance', ascending=False)
print(feature_importances.head(10))


In [None]:
###pyspark svm

from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from sklearn.metrics import roc_curve

# PySpark LinearSVC: assemble and standardize features in a Pipeline, train,
# then report confusion-matrix metrics, the KS statistic and the ten
# coefficients with the largest magnitude.

# pandas is used for the final coefficient table; import it here so the cell
# does not depend on an earlier cell having been run.
import pandas as pd

# Seed for the random train/test split
random_state = 0

# Label column.
# NOTE(review): the SVM cells of this notebook use 'Malos_Dias_tot' — confirm
# which column df_2 contains.
label_col = 'Malo_Dias_tot'

# The original cell also ran `df_2.toPandas()` here; the result was never
# used, and it collected the whole dataset onto the driver, so it was removed.

# Split the dataset into train and test sets
train, test = df_2.randomSplit([0.8, 0.2], seed=random_state)

# Every column except the label is a feature
feature_cols = [col for col in train.columns if col != label_col]

# Combine the feature columns into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Standardize the assembled features (zero mean, unit variance)
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Linear SVM without cross-validation. Named `linear_svc` rather than `svm`
# so it does not shadow the `sklearn.svm` module used by other cells.
linear_svc = LinearSVC(featuresCol='scaledFeatures', labelCol=label_col, maxIter=10, regParam=0.1)

# Chain the assembler, scaler and SVM
pipeline = Pipeline(stages=[assembler, scaler, linear_svc])

# Fit on the training data and score the test data
model = pipeline.fit(train)
predictions = model.transform(test)

# Confusion matrix from a single aggregation instead of four separate
# filter-and-count jobs.
cell_counts = {
    (int(row[label_col]), int(row['prediction'])): row['count']
    for row in predictions.groupBy(label_col, 'prediction').count().collect()
}
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)

print(f'Confusion Matrix:\n[[{tn} {fp}]\n [{fn} {tp}]]')

# Guard every ratio: the model may predict no positives, or the test split
# may contain none, which previously raised ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')

# KS statistic: largest TPR - FPR gap, using the raw margin of class 1 as
# the ranking score
pdf = predictions.select(label_col, 'rawPrediction').toPandas()
fpr, tpr, thresholds = roc_curve(pdf[label_col], pdf['rawPrediction'].apply(lambda x: x[1]))
ks = max(tpr - fpr)

print(f'KS: {ks}')

# Top 10 most important features, ranked by coefficient magnitude so that
# strong negative weights are not pushed to the bottom; the sign is kept in
# the 'Importance' column.
importances = model.stages[-1].coefficients.toArray()
importance_df = pd.DataFrame(list(zip(feature_cols, importances)), columns=['Feature', 'Importance']).sort_values('Importance', key=lambda col: col.abs(), ascending=False)
print(importance_df.head(10))

