In [6]:
import pandas as pd
# Common imports
import numpy as np
import os
import sklearn
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,\
                            roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

### CARGAR LOS DATOS

In [2]:
df = pd.read_csv(r'C:\Users\iauzm\OneDrive\Desktop\DATA SCIENCE\AUZMENDI\PROYECTO-ML\modelos\tabla_pred_2',index_col='Unnamed: 0')
print(df.shape)
df.head()

(1187, 19)


Unnamed: 0,pts,FGM,FG_PCT,FG3M,FG3_PCT,REB,STL,BLK,PLUS_MINUS,INS,OUT,bpi,off,def,pbpi,OVR,DEF,TIER,target
0,-1.0,-1.0,0.014,-1.0,0.012,-16.0,0.0,-3.0,-16.0,8,-5,0.123,0.57,-0.446,0.123,0,3,1,1
1,14.0,2.0,0.054,0.0,-0.047,-5.0,2.0,-10.0,7.0,-8,5,2.395,1.483,0.913,2.395,1,-2,0,0
2,-10.0,-1.0,-0.081,-2.0,-0.092,-5.0,4.0,2.0,-19.0,2,-10,-2.005,-1.804,-0.201,-2.005,-1,-2,0,0
3,33.0,14.0,0.112,-1.0,-0.039,14.0,2.0,2.0,23.0,-1,1,-1.794,-1.973,0.179,-1.794,-2,-5,-1,1
4,-17.0,-4.0,-0.087,-8.0,-0.227,17.0,0.0,3.0,5.0,-1,-1,0.119,1.088,-0.969,0.119,1,8,0,1


### DECISION TREE
Este modelo busca tomar un primer contacto con el dataset, la feature importance, etc. 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1),
                                                    df['target'],
                                                    test_size=0.2,
                                                    random_state=42)

In [12]:
# ENTRENAMIENTO DEL ARBOL
tree_clf = DecisionTreeClassifier(max_depth= 3,
                                  random_state=42)
tree_clf.fit(X_train, y_train)


# COMPROBACIÓN DE LAS FEATURE IMPORTANCES
print('-'*50)
print('\n---FEATURE IMPORTANCES---')
features_names = X_train.columns
feature_importances = tree_clf.feature_importances_
for i in range(len(features_names)):
    if feature_importances[i]>0.001:
        print(features_names[i],round(feature_importances[i],3))
 
#  METRICAS
print('-'*50)
print('\n---MÉTRICAS---')
print('\n---TRAIN---')
preds = tree_clf.predict(X_train)
print("Score del modelo (accuracy):", round(tree_clf.score(X_train, y_train), 3))
print("Accuracy score:", round(accuracy_score(preds, y_train), 3))
print("Recall score:", round(recall_score(preds, y_train), 3))
print("Precision score:", round(precision_score(preds, y_train), 3))
print("F1 score:", round(f1_score(preds, y_train), 3))
print("AUC:", round(roc_auc_score(preds, y_train), 3))
print('\n---TEST---')
preds = tree_clf.predict(X_test)
print("Score del modelo (accuracy):", round(tree_clf.score(X_test, y_test), 3))
print("Accuracy score:", round(accuracy_score(preds, y_test), 3))
print("Recall score:", round(recall_score(preds, y_test), 3))
print("Precision score:", round(precision_score(preds, y_test), 3))
print("F1 score:", round(f1_score(preds, y_test), 3))
print("AUC:", round(roc_auc_score(preds, y_test), 3))

--------------------------------------------------

---FEATURE IMPORTANCES---
FG_PCT 0.056
STL 0.052
PLUS_MINUS 0.026
bpi 0.193
off 0.567
def 0.035
TIER 0.072
--------------------------------------------------

---MÉTRICAS---

---TRAIN---
Score del modelo (accuracy): 0.682
Accuracy score: 0.682
Recall score: 0.662
Precision score: 0.887
F1 score: 0.758
AUC: 0.703

---TEST---
Score del modelo (accuracy): 0.622
Accuracy score: 0.622
Recall score: 0.589
Precision score: 0.869
F1 score: 0.702
AUC: 0.657


### VALORACIÓN DE LOS RESULTADOS
Algo ha mejorado las métricas. Sigue siendo un simple decision tree, sin haber trabajado las features...

Voy a empezar a hacer el estudio completo en modelo_3.ipynb

  
