# Challenge 6

## 1. Introducción

Para superar la limitación de hierro, los microorganismos han desarrollado diversos mecanismos de adquisición, uno de los cuales es la producción de sideróforos. Los sideróforos son moléculas pequeñas de naturaleza quelante que tienen la capacidad de unirse al hierro y transportarlo al interior de la célula.

### Problemática

El hierro es un elemento esencial para la vida, pero su disponibilidad en la naturaleza es limitada. Los microorganismos han desarrollado mecanismos para adquirir hierro del entorno, uno de los cuales es la producción de sideróforos. Los sideróforos son moléculas pequeñas que quelan el hierro y lo transportan al interior de la célula.

La base de datos de sideróforos proporciona información valiosa sobre estos compuestos, incluyendo su estructura, propiedades y microorganismos asociados. Esta información puede ser utilizada para:

- Comprender mejor los mecanismos de adquisición de hierro en microorganismos.
- Desarrollar nuevas estrategias para combatir las enfermedades infecciosas.
- Diseñar sideróforos artificiales para aplicaciones biotecnológicas.


## 2. Metodología

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the two siderophore tables straight from the project's GitHub repository.
df_db = pd.read_csv("https://raw.githubusercontent.com/inefable12/siderophores_database/main/Siderophore_DB.csv")
df_db1 = pd.read_csv("https://raw.githubusercontent.com/inefable12/siderophores_database/main/Siderophore_DB1.csv")

# Restrict the first table to the seven functional-group columns.
df_db_grupos = df_db.loc[
    :,
    [
        'hydroxamate',
        'catecholate',
        'a-hydroxycarboxylate',
        'carboxylate',
        'phenolate',
        'citrate',
        'other',
    ],
]

# Flip the table so that each group is a row and each compound is a column.
df_db_grupos_transpuesto = df_db_grupos.T
df_db_grupos_transpuesto.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,211,212,213,214,215,216,217,218,219,220
hydroxamate,0,0,0,0,0,3,3,3,0,0,...,3,3,0,0,3,2,0,0,0,0
catecholate,1,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,1
a-hydroxycarboxylate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
carboxylate,1,1,1,1,1,0,0,0,1,1,...,0,0,0,1,0,0,2,2,2,2
phenolate,0,0,0,0,1,0,0,0,0,1,...,0,0,0,2,0,0,0,0,0,0


In [None]:
# Keep only the compounds of df_db that belong to exactly one functional group
# and label each of them with that group.
#
# In the transposed table every column is one compound and every row is one
# group, so a compound qualifies when all group rows but one are zero. The
# column labels and the expected zero count are taken from the frame itself:
# the previous hard-coded range(220) never inspected the last compound
# (label 220), and the literal 6 silently assumed seven group rows.
ceros_esperados = len(df_db_grupos_transpuesto.index) - 1
columnas = [
    i
    for i in df_db_grupos_transpuesto.columns
    if df_db_grupos_transpuesto[i].eq(0).sum() == ceros_esperados
]

# Back to one row per compound; idxmax(axis=1) returns the group column holding
# the largest value, which here is the only non-zero one.
df_db_grupos_un_valor = df_db_grupos_transpuesto[columnas].transpose().idxmax(axis=1)
df_db_grupos_un_valor

5      hydroxamate
6      hydroxamate
7      hydroxamate
11     catecholate
13     hydroxamate
          ...     
209    hydroxamate
211    hydroxamate
212    hydroxamate
215    hydroxamate
216    hydroxamate
Length: 112, dtype: object

In [None]:
# Select the same single-group compounds from df_db1 and drop the ID column.
#
# The rows are picked directly by label. Going through a transpose round-trip
# (transpose -> select columns -> transpose back) on a frame whose columns do
# not all share one dtype returns every column as object dtype, which loses the
# numeric dtypes of the feature columns.
df_db1_transpuesto = df_db1.transpose()  # kept so any later cell that still uses it keeps working
df_db1_grupos_un_valor = df_db1.loc[columnas].drop(columns=['ID'])
df_db1_grupos_un_valor.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,...,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_amide,fr_benzene,fr_ester,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond
5,14.637926,-6.027201,14.637926,1.541309,0.030202,782.889,724.425,782.406186,310,0.328347,...,4,2,0,6,6,0,1,0,0,0
6,14.204271,-5.464974,14.204271,1.515272,0.02944,726.825,672.393,726.379971,288,0.322416,...,3,2,1,6,5,0,1,0,0,0
7,14.235765,-5.588196,14.235765,1.524537,0.028291,740.852,684.404,740.395621,294,0.322629,...,3,3,0,6,5,0,1,0,0,0
11,13.582822,-4.208434,13.582822,0.133729,0.577591,346.343,328.199,346.12772,132,0.293318,...,2,2,0,2,2,1,0,1,2,2
13,13.840994,-5.437481,13.840994,1.185498,0.052742,636.791,584.375,636.384663,254,0.249829,...,3,2,1,6,5,1,0,0,0,0


In [None]:
# Put the class labels next to the feature table, aligned on the row index.
# The label Series is named before concatenating so it lands in a 'target' column.
df_db_db1 = pd.concat(
    [df_db_grupos_un_valor.rename('target'), df_db1_grupos_un_valor],
    axis=1,
)
df_db_db1.head()

Unnamed: 0,target,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_amide,fr_benzene,fr_ester,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond
5,hydroxamate,14.637926,-6.027201,14.637926,1.541309,0.030202,782.889,724.425,782.406186,310,...,4,2,0,6,6,0,1,0,0,0
6,hydroxamate,14.204271,-5.464974,14.204271,1.515272,0.02944,726.825,672.393,726.379971,288,...,3,2,1,6,5,0,1,0,0,0
7,hydroxamate,14.235765,-5.588196,14.235765,1.524537,0.028291,740.852,684.404,740.395621,294,...,3,3,0,6,5,0,1,0,0,0
11,catecholate,13.582822,-4.208434,13.582822,0.133729,0.577591,346.343,328.199,346.12772,132,...,2,2,0,2,2,1,0,1,2,2
13,hydroxamate,13.840994,-5.437481,13.840994,1.185498,0.052742,636.791,584.375,636.384663,254,...,3,2,1,6,5,1,0,0,0,0


### División entrenamiento-prueba

In [None]:
# Feature table: every column of the combined frame except the class label.
df_feat = df_db_db1.drop(columns=['target'])
df_feat.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,...,fr_NH0,fr_NH1,fr_NH2,fr_N_O,fr_amide,fr_benzene,fr_ester,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond
5,14.637926,-6.027201,14.637926,1.541309,0.030202,782.889,724.425,782.406186,310,0.328347,...,4,2,0,6,6,0,1,0,0,0
6,14.204271,-5.464974,14.204271,1.515272,0.02944,726.825,672.393,726.379971,288,0.322416,...,3,2,1,6,5,0,1,0,0,0
7,14.235765,-5.588196,14.235765,1.524537,0.028291,740.852,684.404,740.395621,294,0.322629,...,3,3,0,6,5,0,1,0,0,0
11,13.582822,-4.208434,13.582822,0.133729,0.577591,346.343,328.199,346.12772,132,0.293318,...,2,2,0,2,2,1,0,1,2,2
13,13.840994,-5.437481,13.840994,1.185498,0.052742,636.791,584.375,636.384663,254,0.249829,...,3,2,1,6,5,1,0,0,0,0


In [None]:
# Label vector: the 'target' column of the combined frame on its own.
df_target = df_db_db1.loc[:, 'target']
df_target.head()

5     hydroxamate
6     hydroxamate
7     hydroxamate
11    catecholate
13    hydroxamate
Name: target, dtype: object

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test split; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_feat, df_target, test_size=0.30, random_state=101)

In [None]:
# Preview the first labels of the training split.
y_train.head()

207    catecholate
124    hydroxamate
7      hydroxamate
205    catecholate
203    hydroxamate
Name: target, dtype: object

### Entrenamos el clasificador de vectores de soporte (SVC)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier created with scikit-learn's default settings.
# NOTE(review): the features are passed in unscaled, and the confusion matrix
# printed further down puts every test sample in a single class; standardizing
# the inputs before fitting is worth trying.
model = SVC()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train,y_train)

## 3. Resultados

### Predicciones y Evaluaciones

In [None]:
# Predict a class label for every sample of the held-out test split.
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Confusion matrix on the test split: rows are the true classes, columns the predicted ones.
print(confusion_matrix(y_test,predictions))

[[ 0  7  0]
 [ 0 26  0]
 [ 0  1  0]]


In [None]:
# Per-class precision, recall and F1, plus overall accuracy, on the test split.
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

 catecholate       0.00      0.00      0.00         7
 hydroxamate       0.76      1.00      0.87        26
   phenolate       0.00      0.00      0.00         1

    accuracy                           0.76        34
   macro avg       0.25      0.33      0.29        34
weighted avg       0.58      0.76      0.66        34



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Record the settings the estimator was created with, as a dict.
hyperparameters = model.get_params()
print(hyperparameters)

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Scoring helper used by the hyperparameter search
def evaluate_model(X_train, y_train, X_test, y_test, C, kernel, max_iter):
    """Fit an SVC with the given settings and return its accuracy.

    Args:
        X_train: Samples the classifier is fitted on.
        y_train: Labels matching ``X_train``.
        X_test: Samples the fitted classifier is scored on.
        y_test: Labels matching ``X_test``.
        C: Forwarded to ``SVC(C=...)``.
        kernel: Forwarded to ``SVC(kernel=...)``.
        max_iter: Forwarded to ``SVC(max_iter=...)``.

    Returns:
        The ``accuracy_score`` of the predictions for ``X_test`` against
        ``y_test``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    classifier = SVC(C=C, kernel=kernel, max_iter=max_iter).fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))


# Candidate values for each hyperparameter
C_values = np.logspace(-1, 3, 5)
kernel_values = ['linear', 'rbf', 'poly']
max_iter_values = [100, 200, 500, 1000, 2000]

# Carve a validation subset out of the training split. Candidates are compared
# on it, never on X_test/y_test: picking the winner by its test score would make
# the reported test accuracy an optimistic estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.30, random_state=101
)

# Exhaustive search; the strict '>' keeps the first combination that reaches
# the highest validation accuracy.
best_accuracy = 0
best_C = None
best_kernel = None
best_max_iter = None
for C in C_values:
    for kernel in kernel_values:
        for max_iter in max_iter_values:
            accuracy = evaluate_model(X_fit, y_fit, X_val, y_val, C, kernel, max_iter)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_C = C
                best_kernel = kernel
                best_max_iter = max_iter

# Report the winning combination and its validation accuracy
print("Mejor precisión:", best_accuracy)
print("Mejor valor de C:", best_C)
print("Mejor kernel:", best_kernel)
print("Mejor max_iter:", best_max_iter)

# Refit on the whole training split with the selected settings and score the
# test split exactly once. Skipped when no candidate scored above zero, since
# the best_* values are still None in that case.
if best_C is not None:
    test_accuracy = evaluate_model(
        X_train, y_train, X_test, y_test, best_C, best_kernel, best_max_iter
    )
    print("Precisión en prueba con esos hiperparámetros:", test_accuracy)



Mejor precisión: 0.8823529411764706
Mejor valor de C: 0.1
Mejor kernel: linear
Mejor max_iter: 500
