<a href="https://colab.research.google.com/github/DajeanArcila/biblioteca_pandas/blob/main/KNN_Biomedica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [83]:

# Seed NumPy's global RNG so the simulated dataset is the same on every run.
np.random.seed(0)

# Simulated gene-expression records. The four random draws below consume the
# global RNG stream in this exact order; reordering them changes the data.
NUM_GENES = 300
gene_ids = [f'Gene_{idx}' for idx in range(1, NUM_GENES + 1)]
expression_levels = np.random.randint(low=1, high=100, size=NUM_GENES)
tissue_types = np.random.choice(['Liver', 'Brain', 'Heart', 'Kidney'], size=NUM_GENES)
treatment_conditions = np.random.choice(['Treated', 'Untreated'], size=NUM_GENES)
disease_statuses = np.random.choice(['Healthy', 'Diseased'], size=NUM_GENES)

# Assemble the columns into a DataFrame (column order follows insertion order).
data = dict(
    Gene_ID=gene_ids,
    Expression_Level=expression_levels,
    Tissue_Type=tissue_types,
    Treatment_Condition=treatment_conditions,
    Disease_Status=disease_statuses,
)
df = pd.DataFrame(data)

# Persist the dataset to CSV without the row index.
df.to_csv('bioinformatics_dataset.csv', index=False)

print("Dataset creado y guardado como 'bioinformatics_dataset.csv'")


Dataset creado y guardado como 'bioinformatics_dataset.csv'


In [84]:
# Preview the first five rows of the simulated dataset.
df.head(n=5)

Unnamed: 0,Gene_ID,Expression_Level,Tissue_Type,Treatment_Condition,Disease_Status
0,Gene_1,45,Brain,Untreated,Healthy
1,Gene_2,48,Liver,Treated,Diseased
2,Gene_3,65,Liver,Untreated,Healthy
3,Gene_4,68,Brain,Treated,Diseased
4,Gene_5,68,Brain,Untreated,Diseased


In [86]:

# Encode each categorical column as integer codes. One encoder is kept per
# column so the original labels can be recovered later via inverse_transform.
le_tissue = LabelEncoder()
df['Tissue_Type'] = le_tissue.fit_transform(df['Tissue_Type'])

le_treatment = LabelEncoder()
df['Treatment_Condition'] = le_treatment.fit_transform(df['Treatment_Condition'])

le_disease = LabelEncoder()
df['Disease_Status'] = le_disease.fit_transform(df['Disease_Status'])

# NOTE: feature scaling (MinMaxScaler / StandardScaler) is deliberately not done
# in this cell. X_train / X_test do not exist until the train/test split, and a
# scaler should be fitted on the training split only:
#     scaler = StandardScaler()   # or MinMaxScaler()
#     X_train = scaler.fit_transform(X_train)
#     X_test = scaler.transform(X_test)
# The disabled code used to live here as bare string literals, the last of
# which was echoed as the cell's output.

In [87]:
# Target is the encoded disease status; features are every remaining column
# except the gene identifier.
y = df['Disease_Status']
X = df.drop(columns=['Gene_ID', 'Disease_Status'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [88]:

# Train a k-nearest-neighbours classifier (k = 5) on the training split.
# fit() returns the estimator itself, so ending the cell with `knn` displays
# the same fitted-estimator repr as before.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn

In [89]:
# Measure accuracy (fraction of correct predictions) on the held-out split.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.50


# Ejemplo de otra forma

Aquí usamos otro ejemplo que combina el LabelEncoder con la normalización de los datos (StandardScaler y MinMaxScaler).


In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np

# Simulate a customer dataset. The global RNG is re-seeded first, and the
# columns are drawn in the order listed so the generated values stay fixed.
np.random.seed(0)
n_customers = 1000

customer_columns = {
    'customer_id': np.arange(n_customers),
    'age': np.random.randint(18, 70, size=n_customers),
    'tenure': np.random.randint(0, 10, size=n_customers),
    'balance': np.random.uniform(0, 10000, size=n_customers),
    'products_number': np.random.randint(1, 4, size=n_customers),
    'credit_score': np.random.randint(300, 850, size=n_customers),
    'is_active': np.random.choice([0, 1], size=n_customers),
    'estimated_salary': np.random.uniform(20000, 100000, size=n_customers),
    'churn': np.random.choice([0, 1], size=n_customers),
}
df = pd.DataFrame(customer_columns)

# Turn one column into strings so there is a categorical feature to encode.
df = df.astype({'is_active': str})

# Show the first rows of the dataset.
print(df.head())



   customer_id  age  tenure      balance  products_number  credit_score  \
0            0   62       6  1560.982755                2           772   
1            1   65       8  6514.670608                2           810   
2            2   18       8  3370.810621                2           360   
3            3   21       6  4302.354780                2           822   
4            4   21       3  8493.363351                1           556   

  is_active  estimated_salary  churn  
0         1      22172.750700      0  
1         1      43019.754032      1  
2         1      34870.217291      0  
3         0      48467.944544      0  
4         0      42727.671432      1  


In [91]:

# Target vector (churn) and feature matrix (everything except the customer
# identifier and the target itself).
y = df['churn']
X = df.drop(columns=['customer_id', 'churn'])

# Encode the string-valued 'is_active' column as integer codes.
label_encoder = LabelEncoder()
X['is_active'] = label_encoder.fit(X['is_active']).transform(X['is_active'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# Helper that fits a KNN classifier and reports its test-set metrics.
def evaluate_model(X_train, X_test, y_train, y_test, n_neighbors=7):
    """Fit a k-nearest-neighbours classifier and print its test metrics.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Target labels for the training and test splits.
    n_neighbors : int, default 7
        Number of neighbours passed to ``KNeighborsClassifier``. The default
        keeps the value that used to be hard-coded, so existing calls behave
        exactly as before.

    Returns
    -------
    None
        The accuracy and the classification report are printed, not returned,
        so calling this as the last line of a cell produces no extra output.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')
    print(classification_report(y_test, y_pred))

# Baseline run on the raw, unscaled features.
print("Sin normalización:")
evaluate_model(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


Sin normalización:
Accuracy: 0.48
              precision    recall  f1-score   support

           0       0.40      0.52      0.45        82
           1       0.58      0.45      0.50       118

    accuracy                           0.48       200
   macro avg       0.49      0.49      0.48       200
weighted avg       0.50      0.48      0.48       200



In [92]:

# Repeat the evaluation with each scaler in turn. Every scaler is fitted on
# the training split only and then applied unchanged to the test split.
scaler_runs = (
    ("\nCon StandardScaler:", StandardScaler()),
    ("\nCon MinMaxScaler:", MinMaxScaler()),
)

for heading, scaler in scaler_runs:
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(heading)
    evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test)


Con StandardScaler:
Accuracy: 0.45
              precision    recall  f1-score   support

           0       0.36      0.44      0.40        82
           1       0.54      0.46      0.50       118

    accuracy                           0.45       200
   macro avg       0.45      0.45      0.45       200
weighted avg       0.47      0.45      0.45       200


Con MinMaxScaler:
Accuracy: 0.43
              precision    recall  f1-score   support

           0       0.35      0.46      0.40        82
           1       0.52      0.41      0.46       118

    accuracy                           0.43       200
   macro avg       0.44      0.44      0.43       200
weighted avg       0.45      0.43      0.43       200

