# Algunas buenas prácticas

In [None]:
import warnings
# Silence every warning from here on (no category or module filter is given).
warnings.filterwarnings('ignore')

## Generar/usar una base de datos

Usaremos la base de datos [Titanic](https://www.kaggle.com/c/titanic/data)

La base de datos que utilizarán cuenta con 9 variables independientes y una variable respuesta. Las variables son:

- *survival* : 0 si no sobrevivió, 1 si sobrevivió (variable respuesta).

- *pclass*: clase del ticket. 1 = primera clase, 2 = segunda clase, 3 = tercera clase.

- *sex*: Sexo del pasajero. Male = Masculino, Female = Femenino.

- *Age*: Edad en años del pasajero. 

- *sibsp*: Número de familiares (hermanos, pareja) en el Titanic

- *parch*: Número de padres o hijos en el Titanic.

- *ticket*: Número de ticket.

- *fare*: El costo del pasaje.

- *cabin*: El número de cabina.

- *Embarked*: Puerto donde embarcó. C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
import pandas as pd
# Load the two CSV files; both paths are relative to the working directory.
train_data = pd.read_csv('demo_12_dataset/train.csv')
test_data = pd.read_csv('demo_12_dataset/test.csv')

## Análisis exploratorio de la base de datos

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Column names, dtypes and non-null counts of the test frame.
test_data.info()

In [None]:
# Summary statistics of the numeric columns.
train_data.describe()

In [None]:
# Summary of the object-dtype columns only.
train_data.describe(include=['O'])

In [None]:
# Number of missing values per column in the training frame.
train_data.isna().sum()

Podemos hacer gráficos exploratorios para entender la base de datos

In [None]:
# Survival counts split by sex (left panel) and by ticket class (right panel).
fig, (ax_sex, ax_class) = plt.subplots(1, 2, figsize=(15, 5))
for column, axis in (('Sex', ax_sex), ('Pclass', ax_class)):
    sns.countplot(x=train_data[column], hue=train_data['Survived'],
                  palette=('red', 'green'), ax=axis)
plt.show()

In [None]:
# Pie charts of the ticket-class mix among passengers who died vs. survived.
var = 'Pclass'
# Count the classes once per outcome; the result is indexed by (Survived, class).
counts_by_outcome = train_data.groupby('Survived')[var].value_counts()

plt.figure(figsize=(15,5))
panels = (
    (1, 'Died', 0, ['green','blue','red']),
    (2, 'Survived', 1, ['red','green','blue']),
)
for position, panel_title, outcome, wedge_colors in panels:
    shares = counts_by_outcome[outcome]
    plt.subplot(1, 2, position)
    plt.title(panel_title)
    # Colours are assigned by wedge position, i.e. in the order value_counts
    # returns the classes for this outcome.
    plt.pie(shares, labels=shares.index, autopct='%1.1f%%', colors=wedge_colors)
plt.show()

Podemos hacer tablas para entender un poco más.

Calculemos el valor medio de la variable supervivencia en función de la clase

In [None]:
# Mean of Survived per ticket class, highest first.
(
    train_data[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Analicemos la supervivencia en función del sexo

In [None]:
# Mean of Survived per sex, highest first.
(
    train_data[["Sex", "Survived"]]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

O en función de la edad

In [None]:
# One Age histogram (30 bins) per value of Survived, laid out as columns.
g = sns.FacetGrid(train_data, col='Survived')
g.map(plt.hist, 'Age', bins=30)
plt.show()

## Selección de variables o features

Podemos tirar las variables PassengerId y Ticket. Cabin está muy incompleta así que la podemos tirar también.

In [None]:
# Columns currently present in the training frame.
train_data.columns

In [None]:
# Keep the test passenger ids aside: the column is dropped from test_data
# below, but the ids are needed again to build the submission table.
PassengerID_val = test_data.PassengerId

In [None]:
# Columns removed from both frames before modelling.
drop_vars = ['PassengerId', 'Ticket', 'Cabin']

train_data, test_data = [
    frame.drop(columns=drop_vars) for frame in (train_data, test_data)
]


¿Qué información hay en la variable Name?

In [None]:
# The title is the run of letters that follows a space and ends with a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being treated as an (invalid) Python string escape.
for dataset in (train_data, test_data):
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against sex on the training set.
pd.crosstab(train_data['Title'], train_data['Sex'])

In [None]:
# Collapse the listed titles into 'Rare' and fold Mlle/Ms into 'Miss' and Mme into 'Mrs'.
title_map = {
    rare: 'Rare'
    for rare in ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona')
}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in (train_data, test_data):
    dataset['Title'] = dataset['Title'].replace(title_map)

# Mean of Survived per consolidated title (training set).
train_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Remove the Name column from both frames, in place.
train_data.drop(columns='Name', inplace=True)
test_data.drop(columns='Name', inplace=True)

## Imputación de datos faltantes

In [None]:
from sklearn.impute import SimpleImputer

- [Documentación](https://scikit-learn.org/stable/modules/impute.html) de sklearn

In [None]:
# Missing values per column in the training frame, before imputation.
train_data.isna().sum()

In [None]:
# Missing values per column in the test frame, before imputation.
test_data.isna().sum()

In [None]:
# Fit each imputer on the training set only and reuse it for the test set, so
# both frames are filled with the same values and no test-set statistic
# (e.g. the test median age) leaks into the preprocessing.
embarked_imputer = SimpleImputer(strategy="constant", fill_value="S").fit(train_data[['Embarked']])
age_imputer = SimpleImputer(strategy="median").fit(train_data[['Age']])

for dataset in (train_data, test_data):
    # transform() returns an (n_rows, 1) array; ravel() flattens it so each
    # column holds plain scalars rather than one-element arrays.
    dataset['Embarked'] = embarked_imputer.transform(dataset[['Embarked']]).ravel()
    dataset['Age'] = age_imputer.transform(dataset[['Age']]).ravel()

# NOTE(review): Fare is not imputed here. If test_data has missing fares they
# will propagate as NaN into the fare bands -- confirm with the isna() checks.
    

In [None]:
# Missing values per column in the training frame, after imputation.
train_data.isna().sum()

## Transformación de variables

In [None]:
from sklearn.preprocessing import LabelEncoder

Las variables categóricas deben transformarse a numéricas

In [None]:
# Display the training frame as it stands before the categorical columns are encoded.
train_data

In [None]:
# IPython help syntax (not plain Python): shows the LabelEncoder documentation.
LabelEncoder?

In [None]:
# Learn the Title -> integer mapping from the training set only, then apply
# that same mapping to both frames so a given code always means the same title.
# Refitting inside the loop would let the codes differ between frames and
# would leave Te describing test_data instead of train_data.
# Note: transform() raises ValueError if test_data contains a title that is
# absent from train_data.
Te = LabelEncoder().fit(train_data['Title'])
for dataset in (train_data, test_data):
    dataset['Title'] = Te.transform(dataset['Title'])

In [None]:
# Map the integer codes 0-4 back to their title strings.
Te.inverse_transform([0,1,2,3,4])

In [None]:
# Fit the Sex encoder once, on the training set, and reuse it for both frames
# so the integer codes are guaranteed to mean the same category everywhere.
# transform() raises ValueError on a category not seen in train_data.
se = LabelEncoder().fit(train_data['Sex'])
for dataset in (train_data, test_data):
    dataset['Sex'] = se.transform(dataset['Sex'])

In [None]:
# Categories known to the Sex encoder; position in the array is the integer code.
se.classes_

In [None]:
# Fit the Embarked encoder once, on the training set, and reuse it for both
# frames so each port gets the same integer code in train and test.
# list(...) is kept so the column is handed to the encoder as a plain sequence.
# transform() raises ValueError on a port not seen in train_data.
Eenc = LabelEncoder().fit(list(train_data['Embarked']))
for dataset in (train_data, test_data):
    dataset['Embarked'] = Eenc.transform(list(dataset['Embarked']))
Eenc.classes_

In [None]:
# Display the training frame after Title, Sex and Embarked have been encoded.
train_data

Bandas para Age y Fare

In [None]:
# Derive the five equal-width age bins from the training set and reuse the
# same edges for the test set. Cutting each frame on its own min/max would
# produce bands with different boundaries in train and test.
_, age_edges = pd.cut(train_data['Age'], 5, retbins=True)
# Open the outer edges so ages outside the training range still fall into the
# first/last band instead of becoming NaN. Band membership of the training
# rows is unchanged; only the displayed outer limits become -inf / inf.
age_edges[0], age_edges[-1] = -np.inf, np.inf

for dataset in (train_data, test_data):
    dataset['AgeBand'] = pd.cut(dataset['Age'], bins=age_edges)
    dataset.drop('Age', axis=1, inplace=True)

# Mean of Survived per age band (training set), lowest band first.
train_data[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

In [None]:
# Replace each age interval by an integer code. The encoder is refit on each
# frame in turn, so after this cell Aenc holds the classes found in test_data.
Aenc = LabelEncoder()
train_data['AgeBand'] = Aenc.fit_transform(train_data['AgeBand'])
test_data['AgeBand'] = Aenc.fit_transform(test_data['AgeBand'])
Aenc.classes_

In [None]:
# Derive the four equal-width fare bins from the training set and reuse the
# same edges for the test set. Cutting each frame on its own min/max would
# produce bands with different boundaries in train and test.
_, fare_edges = pd.cut(train_data['Fare'], 4, retbins=True)
# Open the outer edges so fares outside the training range still fall into
# the first/last band instead of becoming NaN. Band membership of the training
# rows is unchanged; only the displayed outer limits become -inf / inf.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf

for dataset in (train_data, test_data):
    dataset['FareBand'] = pd.cut(dataset['Fare'], bins=fare_edges)
    dataset.drop('Fare', axis=1, inplace=True)

# Mean of Survived per fare band (training set), lowest band first.
train_data[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

In [None]:
# Replace each fare interval by an integer code. The encoder is refit on each
# frame in turn, so after this cell Fenc holds the classes found in test_data.
Fenc = LabelEncoder()
train_data['FareBand'] = Fenc.fit_transform(train_data['FareBand'])
test_data['FareBand'] = Fenc.fit_transform(test_data['FareBand'])
Fenc.classes_

## Train test split

- Train Dataset: Used to fit the machine learning model.

- Test Dataset: Used to evaluate the fit machine learning model.

- Estratificación (*stratified*): importante chequearla en datos desbalanceados.

In [None]:
# Separate the target column from the remaining feature columns.
y = train_data['Survived']
X = train_data.drop(columns='Survived')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows, with a fixed seed for a repeatable split.
# No `stratify` argument is passed, so class proportions are not enforced.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Shapes of the four pieces produced by the split.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

## Comparación de Modelos

- [Documentación](https://scikit-learn.org/stable/model_selection.html) de sklearn acerca de selección de modelo.

- [Documentación](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter) de sklearn acerca de métricas de bondad de modelo

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold, StratifiedKFold, LeaveOneOut, LeavePOut
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, roc_auc_score, mean_squared_error, r2_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC


# Seed shared by every estimator that accepts one, so the comparison is
# repeatable from run to run (same value as the train/test split).
SEED = 100

clfs =  [DecisionTreeClassifier(random_state=SEED),
        RandomForestClassifier(random_state=SEED),
        LogisticRegression(random_state=SEED),
        MLPClassifier(random_state=SEED),
        XGBClassifier(random_state=SEED),
        GaussianNB(),  # takes no random_state
        LinearSVC(random_state=SEED)]

# Display names, in the same order as `clfs`.
names = ['Arbol de decisión',
        'Random Forest',
        'Regresión Logística',
        'Perceptrón multicapa',
        'XGBoost',
        'Naive Bayes',
        'SVM']

# Fit every model on the training split and report accuracy on both splits.
# Only the held-out (test) accuracy is kept for the ranking table.
trained_models = []
accuracy_models = []
for clf, name in zip(clfs, names):
    print(name)
    clf.fit(x_train, y_train)

    train_predictions = clf.predict(x_train)
    train_accuracy = accuracy_score(y_train, train_predictions)
    # Plain f-string formatting: the model name can never be misread as a
    # %-format directive.
    print(f"Accuracy train {name}: {train_accuracy * 100.0:.2f}%")

    test_predictions = clf.predict(x_test)
    accuracy = accuracy_score(y_test, test_predictions)
    print(f"Accuracy test {name}: {accuracy * 100.0:.2f}%")

    trained_models.append(clf)
    accuracy_models.append(accuracy)


In [None]:
# Rank the models by the held-out accuracy collected in the comparison loop.
models = pd.DataFrame({'Model':names, 'Score':accuracy_models})
models.sort_values(by='Score', ascending=False)

## Selección de hiperparámetros

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the multilayer perceptron.
parameter_grid = dict(
    hidden_layer_sizes=(100, (10,10)),
    activation=('relu','tanh'),
    solver=('sgd', 'adam'),
    alpha=np.logspace(-5,-2,3),
    learning_rate=('constant', 'adaptive'),
)

# Exhaustive search over the grid, using GridSearchCV's default CV and scoring.
grid = GridSearchCV(estimator=MLPClassifier(max_iter=300), param_grid=parameter_grid)
grid.fit(x_train, y_train)

In [None]:
# Best hyper-parameter combination found by the MLP grid search.
grid.best_params_

Comparación con RandomForest

In [None]:
# Hyper-parameter grid explored for the random forest.
parameter_grid2 = dict(
    n_estimators=(10,100,200),
    criterion=('gini', 'entropy', 'log_loss'),
    max_depth=(None, 10, 20),
    max_features=('sqrt', 'log2'),
)

# Exhaustive search over the grid, using GridSearchCV's default CV and scoring.
grid2 = GridSearchCV(estimator=RandomForestClassifier(n_jobs=2), param_grid=parameter_grid2)
grid2.fit(x_train, y_train)

In [None]:
# Best hyper-parameter combination found by the random-forest grid search.
grid2.best_params_

In [None]:
FOLDS=5
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=10)

# Convert once, outside the loops: the fold indices are positional and are
# applied to plain arrays.
x_train_arr, y_train_arr = np.array(x_train), np.array(y_train)

# Compare the two tuned models with the same stratified folds of the training split.
for clfi in [MLPClassifier(**grid.best_params_, max_iter=300), RandomForestClassifier(**grid2.best_params_, n_jobs=2)]:
    print(clfi)
    fold_accuracies = []
    for fold, (train_idx, val_idx) in enumerate(cv.split(x_train_arr, y_train_arr)):
        # Fit on the training part of the fold, score on the validation part.
        clfi.fit(x_train_arr[train_idx], y_train_arr[train_idx])
        val_predictions = clfi.predict(x_train_arr[val_idx])

        accuracy = accuracy_score(y_train_arr[val_idx], val_predictions)
        fold_accuracies.append(accuracy)
        print(f"accuracy test fold {fold}: {accuracy * 100.0 :.2f}")
    avg_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print(f'Avg. accuracy = {avg_accuracy * 100}')

## Selección de modelo final y análisis de resultados

In [None]:
# Refit the tuned MLP on the whole training split and score it on the held-out split.
# max_iter is 400 here, whereas the grid search above ran with max_iter=300.
clf = MLPClassifier(**grid.best_params_, max_iter=400).fit(x_train, y_train)
test_predictions = clf.predict(x_test)
accuracy = accuracy_score(y_test, test_predictions)
accuracy * 100

In [None]:
# Text report of per-class metrics for the held-out predictions.
print(classification_report(y_test, test_predictions))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve

# Confusion matrix on the held-out split. Rows/columns follow the sorted class
# order (0, then 1), so the tick captions must be given in that same order;
# passing (1, 0) would swap the captions without moving the counts.
ConfusionMatrixDisplay.from_predictions(y_test, test_predictions, display_labels=(0, 1))
plt.show()

# ROC curve built from the predicted probability of class 1.
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(x_test)[:, 1])
plt.plot(fpr, tpr, label='ROC curve')
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.xlim([-0.02, 1])
plt.ylim([0, 1.02])
plt.legend(loc="lower right")
plt.show()

In [None]:
# AUC has to be computed from continuous scores: with hard 0/1 predictions the
# curve collapses to a single operating point. Use the class-1 probability,
# which is the same score the ROC curve is drawn from.
roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])

### Entrega

In [None]:
# Assemble the submission table: the saved test passenger ids next to the
# final model's predictions for the preprocessed test frame.
submission = pd.DataFrame({
        "PassengerId": PassengerID_val,
        "Survived": clf.predict(test_data)
    })
# Writing the table to disk is left disabled.
#submission.to_csv('submission.csv', index=False)
submission

Ejemplo basado en este [link](https://www.kaggle.com/code/startupsci/titanic-data-science-solutions)

[Doc](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py) sobre curvas Receiver Operating Characteristic (ROC)