# Scripts del proyecto

## Script 1: Preparacion de datos para el entrenamiento y Validacion

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../data/raw/drug200.xls")

In [3]:
def label_encoder(datos_categoria):
    le = LabelEncoder()
    df[datos_categoria]=le.fit_transform(df[datos_categoria])

In [4]:
variables = ["Sex","BP","Cholesterol","Na_to_K","Drug"]

In [5]:
for i in variables:
    label_encoder(i)
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,167,0
1,47,1,1,0,89,3
2,47,1,1,0,43,3
3,28,0,2,0,10,4
4,61,0,1,0,133,0


In [6]:
x = df.drop("Drug",axis=1)
y = df.Drug
x.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,0,0,0,167
1,47,1,1,0,89
2,47,1,1,0,43
3,28,0,2,0,10
4,61,0,1,0,133


In [7]:
x_train,x_test,y_train,y_test = train_test_split(x ,y ,test_size= 0.2,random_state = 42,shuffle = True)

In [8]:
x_train.to_csv("../data/processed/drug_x_train.csv",index=False)
x_test.to_csv("../data/processed/drug_x_test.csv",index=False)
y_train.to_csv("../data/processed/drug_y_train.csv",index=False)
y_test.to_csv("../data/processed/drug_y_test.csv",index=False)

## Script 2: Código de Entrenamiento

In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import pickle

In [10]:
x_train = pd.read_csv("../data/processed/drug_x_train.csv")
y_train = pd.read_csv("../data/processed/drug_y_train.csv").squeeze()

In [11]:
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train,y_train)

In [12]:
# Guardamos el modelo entrenado para usarlo en produccion
filename = '../models/best_model.pkl'
pickle.dump(rfc, open(filename, 'wb'))

## Script 3: Código de Validación

In [29]:
import pandas as pd
import pickle
from sklearn.metrics import *

In [14]:
# Cargar la tabla transformada
x_test = pd.read_csv("../data/processed/drug_x_test.csv")
y_test = pd.read_csv("../data/processed/drug_y_test.csv")
#y_test = np.ravel(y_test)

In [15]:
# Leemos el modelo entrenado!
filename = '../models/best_model.pkl'
model = pickle.load(open(filename, 'rb'))

In [16]:
y_test_pred = model.predict(x_test)
y_test = np.ravel(y_test_pred)
conf_mz_test = confusion_matrix(y_test,y_test_pred)
conf_mz_test

array([[15,  0,  0,  0,  0],
       [ 0,  6,  0,  0,  0],
       [ 0,  0,  3,  0,  0],
       [ 0,  0,  0,  5,  0],
       [ 0,  0,  0,  0, 11]], dtype=int64)

In [17]:
y_test_pred

array([4, 0, 4, 3, 0, 0, 0, 4, 1, 4, 1, 4, 0, 1, 2, 0, 2, 4, 3, 0, 2, 4,
       4, 0, 0, 0, 3, 4, 0, 4, 0, 3, 3, 0, 1, 0, 4, 1, 0, 1], dtype=int64)

In [18]:
y_test

array([4, 0, 4, 3, 0, 0, 0, 4, 1, 4, 1, 4, 0, 1, 2, 0, 2, 4, 3, 0, 2, 4,
       4, 0, 0, 0, 3, 4, 0, 4, 0, 3, 3, 0, 1, 0, 4, 1, 0, 1], dtype=int64)

In [24]:
## Metricas de validación
def calc_metrics(y_test,y_pred_test):
    cm_test = confusion_matrix(y_test,y_pred_test)
    print("Matriz de confusion: ")
    print(cm_test)
    accuracy_test=accuracy_score(y_test,y_pred_test)
    print("Accuracy: ", accuracy_test)

In [20]:
def save_plot(title):
    plt.title(title)
    fig = plt.gcf()
    filename = title.replace(" ", "_").lower()
    fig.savefig('{}'.format(filename), dpi=500)
    plt.clf()

In [25]:
calc_metrics(y_test,y_test_pred)

Matriz de confusion: 
[[15  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 11]]
Accuracy:  1.0


In [34]:
scores=model.predict(x_test).reshape(-1,1)

In [38]:
df_score = pd.DataFrame(scores, columns=['PREDICT'])

In [41]:
df_score.to_csv('../data/scores/final_score.csv', index = False)

In [42]:
scores

array([[4],
       [0],
       [4],
       [3],
       [0],
       [0],
       [0],
       [4],
       [1],
       [4],
       [1],
       [4],
       [0],
       [1],
       [2],
       [0],
       [2],
       [4],
       [3],
       [0],
       [2],
       [4],
       [4],
       [0],
       [0],
       [0],
       [3],
       [4],
       [0],
       [4],
       [0],
       [3],
       [3],
       [0],
       [1],
       [0],
       [4],
       [1],
       [0],
       [1]], dtype=int64)