In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load the data
archivo = 'Taller_2_Titulacion_DatosTaller.csv'
datos = pd.read_csv(archivo, encoding='latin-1', delimiter=';')

# Separate features and target variable
X = datos.iloc[:, :-1]
y = datos.iloc[:, -1]

# Drop the "Id" column
X = X.drop('Id', axis=1)

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Create preprocessing pipelines for numerical and categorical data
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Preprocess the data
X_processed = preprocessor.fit_transform(X)

# Convert target variable to numerical (binary encoding)
y = pd.get_dummies(y, drop_first=True).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Convert to float32
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

# Build the neural network model with adjusted architecture and hyperparameters
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with adjusted learning rate
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=200, batch_size=64, validation_split=0.2, callbacks=[early_stopping], verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f'Loss: {loss:.4f}')
print(f'Accuracy: {accuracy:.4f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.6539 - loss: 0.6230 - val_accuracy: 0.7391 - val_loss: 0.5029
Epoch 2/200
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7204 - loss: 0.5437 - val_accuracy: 0.7391 - val_loss: 0.4950
Epoch 3/200
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7567 - loss: 0.5127 - val_accuracy: 0.7329 - val_loss: 0.5014
Epoch 4/200
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7702 - loss: 0.4915 - val_accuracy: 0.7536 - val_loss: 0.4906
Epoch 5/200
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7749 - loss: 0.4504 - val_accuracy: 0.7453 - val_loss: 0.5000
Epoch 6/200
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7926 - loss: 0.4288 - val_accuracy: 0.7371 - val_loss: 0.5201
Epoch 7/200
[1m31/31[0m [

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np  # Importar numpy para usar .ravel()

# Define models and parameter grids
models = {
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20]
        }
    },
    'SVM': {
        'model': SVC(random_state=42),
        'params': {
            'kernel': ['linear', 'rbf'],
            'C': [1, 10, 100]
        }
    }
}

# Function to perform grid search and return best model
def find_best_model(model, params, X_train, y_train, X_test, y_test):
    # Convertir y_train y y_test a arrays 1D usando .ravel()
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return best_model, best_params, accuracy

# Perform grid search for each model
results = {}
for model_name, config in models.items():
    best_model, best_params, accuracy = find_best_model(config['model'], config['params'], X_train, y_train, X_test, y_test)
    
    results[model_name] = {
        'best_model': best_model,
        'best_params': best_params,
        'accuracy': accuracy
    }
    

# Select the model with the highest accuracy
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
best_model = results[best_model_name]['best_model']
best_params = results[best_model_name]['best_params']
best_accuracy = results[best_model_name]['accuracy']

print(f"Best model overall: {best_model_name}")
print(f"Best parameters: {best_params}")
print(f"Accuracy on test set: {best_accuracy:.4f}")


Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best model overall: RandomForest
Best parameters: {'max_depth': None, 'n_estimators': 200}
Accuracy on test set: 0.7318


In [7]:
# Assuming X_train, X_test, y_train, y_test are already prepared and model is trained

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define and train the best model
best_model = RandomForestClassifier(max_depth=None, n_estimators=200, random_state=42)
best_model.fit(X_train, y_train)

# Evaluate on test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Best model selected: RandomForest')
print(f'Best parameters: {{"max_depth": None, "n_estimators": 200}}')
print(f'Accuracy on test set: {accuracy:.4f}')


  return fit_method(estimator, *args, **kwargs)


Best model selected: RandomForest
Best parameters: {"max_depth": None, "n_estimators": 200}
Accuracy on test set: 0.7318


In [9]:
import pandas as pd

# Cargar el conjunto de datos de evaluación
archivo_evaluacion = 'Taller_2_Titulacion_Evaluación.csv'
datos_evaluacion = pd.read_csv(archivo_evaluacion, encoding='latin-1', delimiter=';')

# Guardar los Ids para el archivo de salida
ids_evaluacion = datos_evaluacion['Id']

# Preprocesar datos de evaluación similar al conjunto de entrenamiento
X_evaluacion = datos_evaluacion.drop('Id', axis=1)  # Eliminar columna 'Id'

# Aplicar el preprocesamiento ya definido anteriormente
X_evaluacion_processed = preprocessor.transform(X_evaluacion)

# Realizar predicciones utilizando el mejor modelo seleccionado (RandomForest)
predicciones_evaluacion = best_model.predict(X_evaluacion_processed)

# Convertir predicciones a formato 'SÍ' o 'NO'
predicciones_evaluacion_etiquetas = ['SÍ' if pred == 1 else 'NO' for pred in predicciones_evaluacion]

# Crear DataFrame para el archivo de salida
resultado_df = pd.DataFrame({'Id': ids_evaluacion, 'Predicción': predicciones_evaluacion_etiquetas})

# Guardar resultados en archivo de texto
archivo_salida = 'clasificación_título.txt'
resultado_df.to_csv(archivo_salida, sep=';', index=False)

print(f'Archivo "{archivo_salida}" generado exitosamente.')


Archivo "clasificación_título.txt" generado exitosamente.
