In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Database connection URL. Set the BANK_CHURNERS_DB_URL environment variable to
# supply real credentials without committing them to the notebook; the fallback
# is the original local-development connection string.
DB_URL = os.environ.get(
    'BANK_CHURNERS_DB_URL',
    'postgresql://postgres:postgres@localhost:5432/bank_churners',
)

# Load the whole table into a DataFrame. `engine` stays in the namespace
# exactly as before in case other cells use it.
engine = create_engine(DB_URL)
query = "SELECT * FROM new_bank_churners_data"
df_from_sql = pd.read_sql(query, engine)

In [3]:
# Separate the features (X) from the target variable (y).
TARGET_COLUMN = 'attrition_flag_existing_customer'
y = df_from_sql[TARGET_COLUMN]

# Remove the target AND every other column derived from the attrition flag.
# NOTE(review): the recorded run reports train and test accuracy of exactly 1.0,
# which strongly suggests a feature that encodes the label (for example the
# complementary one-hot column of the flag, or a precomputed classifier score
# for it). Confirm the table schema; if no such sibling column exists this
# selects only the target and behaves exactly like the original drop.
leaky_columns = [
    col for col in df_from_sql.columns
    if 'attrition_flag' in str(col).lower()
]
X = df_from_sql.drop(columns=leaky_columns)

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Standardize the features (used by the KNN sweep below).
# The scaler is fitted on the training split ONLY and then reused to transform
# the test split. Fitting a second scaler on the test data would standardize
# the two splits with different means/variances and let test-set statistics
# leak into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Hyperparameter candidates to iterate over.
# Random Forest: number of trees and maximum tree depth
# (None is passed through to scikit-learn as "no explicit depth cap").
n_estimators_options = [
    50,
    100,
    200,
]
max_depth_options = [
    None,
    10,
    20,
    30,
]
# KNN: number of neighbours.
k_options = [
    3,
    5,
    7,
    9,
]


In [9]:
# Accumulator for one record (dict) per evaluated model configuration.
# Re-run this cell before re-running the sweeps, otherwise records pile up.
results = list()

In [10]:

# Sweep the Random Forest over every (n_estimators, max_depth) combination,
# with n_estimators as the outer dimension, on the unscaled features.
rf_grid = [
    (n_trees, depth)
    for n_trees in n_estimators_options
    for depth in max_depth_options
]

for n_estimators, max_depth in rf_grid:
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    )
    clf.fit(X_train, y_train)

    # Accuracy on the data the model was fitted on, and on the held-out split.
    train_accuracy = accuracy_score(y_train, clf.predict(X_train))
    test_accuracy = accuracy_score(y_test, clf.predict(X_test))

    # Record this configuration; 'k' is not applicable to Random Forest.
    results.append(dict(
        model='RandomForest',
        n_estimators=n_estimators,
        max_depth=max_depth,
        k=None,
        train_accuracy=train_accuracy,
        test_accuracy=test_accuracy,
    ))

In [11]:
# Sweep KNN over each candidate number of neighbours, using the scaled features.
for k in k_options:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)

    # Accuracy on the data the model was fitted on, and on the held-out split.
    train_accuracy = accuracy_score(y_train, knn.predict(X_train_scaled))
    test_accuracy = accuracy_score(y_test, knn.predict(X_test_scaled))

    # Record this configuration; the Random Forest fields are not applicable.
    results.append(dict(
        model='KNN',
        n_estimators=None,
        max_depth=None,
        k=k,
        train_accuracy=train_accuracy,
        test_accuracy=test_accuracy,
    ))

In [12]:
# Convert the list of result records into a DataFrame.
results_raw = pd.DataFrame(results)

# The hyperparameter columns mix integers with None, so pandas infers float64
# (50 -> 50.0, None -> NaN). Cast them to the nullable integer dtype so the
# values stay integers and missing entries become <NA>. Only columns that are
# actually present are cast, so an empty `results` list does not raise here.
integer_columns = [
    col for col in ('n_estimators', 'max_depth', 'k')
    if col in results_raw.columns
]
results_df = results_raw.astype({col: 'Int64' for col in integer_columns})

In [13]:
# Write the comparison table to CSV without the DataFrame index column.
results_df.to_csv(
    path_or_buf='model_comparison_results.csv',
    index=False,
)

In [14]:
# Show the performance of the best model (highest test accuracy).
def _format_hyperparameter(value):
    """Render an integer hyperparameter for display.

    Missing values (None / NaN / <NA>) are shown as 'None'; anything else is
    shown as a plain integer, so a float-typed column prints 50 rather than 50.0.
    """
    return 'None' if pd.isna(value) else str(int(value))


# idxmax returns the first row that reaches the maximum, so ties resolve to the
# configuration evaluated earliest.
# NOTE(review): picking the winner by test accuracy makes the reported test
# score optimistic; consider a validation split or cross-validation.
best_model = results_df.loc[results_df['test_accuracy'].idxmax()]
print(f"Best Model: {best_model['model']} with parameters:")
if best_model['model'] == 'RandomForest':
    print(f"  n_estimators: {_format_hyperparameter(best_model['n_estimators'])}")
    print(f"  max_depth: {_format_hyperparameter(best_model['max_depth'])}")
else:
    print(f"  k: {_format_hyperparameter(best_model['k'])}")
print(f"Train Accuracy: {best_model['train_accuracy']}")
print(f"Test Accuracy: {best_model['test_accuracy']}")

Best Model: RandomForest with parameters:
  n_estimators: 50.0
  max_depth: nan
Train Accuracy: 1.0
Test Accuracy: 1.0
