In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Preprocessed report tables produced earlier in the pipeline.
df_reports = pd.read_pickle('data/preprocessed_reports.pkl')
df_report_pairs = pd.read_pickle('data/preprocessed_report_pairs.pkl')


def _load_pickle(path: str):
    """
    Loads a single pickled object from path
    The file handle is closed as soon as the object has been read
    :returns: the unpickled object
    """
    # NOTE(review): unpickling can execute arbitrary code; only load artefacts
    # that were produced by this project.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Previously fitted vectorizers, one per text level (1-4) and variant (L / S).
tfidf1L = _load_pickle('data/vectorizer/tfidf1L.pkl')
tfidf2L = _load_pickle('data/vectorizer/tfidf2L.pkl')
tfidf3L = _load_pickle('data/vectorizer/tfidf3L.pkl')
tfidf4L = _load_pickle('data/vectorizer/tfidf4L.pkl')
tfidf1S = _load_pickle('data/vectorizer/tfidf1S.pkl')
tfidf2S = _load_pickle('data/vectorizer/tfidf2S.pkl')
tfidf3S = _load_pickle('data/vectorizer/tfidf3S.pkl')
tfidf4S = _load_pickle('data/vectorizer/tfidf4S.pkl')

In [3]:
import os

# exist_ok=True makes the creation idempotent on re-runs and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs('data/svm/', exist_ok=True)
os.makedirs('data/rfc/', exist_ok=True)

# Preparación del dataset

In [4]:
# Display the columns available in the report-pairs frame before deriving new ones.
df_report_pairs.columns

Index(['type', 'id_M', 'id_D', 'statusM', 'statusD', 'componentM',
       'componentD', 'summaryM', 'summaryD', 'commentsM', 'commentsD',
       'text1M', 'text1D', 'text2M', 'text2D', 'text3M', 'text3D', 'text4M',
       'text4D', 'tokens1ML', 'tokens2ML', 'tokens3ML', 'tokens4ML',
       'tokens1DL', 'tokens2DL', 'tokens3DL', 'tokens4DL', 'tokens1MS',
       'tokens2MS', 'tokens3MS', 'tokens4MS', 'tokens1DS', 'tokens2DS',
       'tokens3DS', 'tokens4DS'],
      dtype='object')

In [5]:
# Conceptually, for the M and D report of every pair one would compute:
#   textP   = textM + textD
#   tokensP = tokenizer(textP)
# To save time, the tokens that were already obtained are reused instead of
# tokenizing the combined text again.
# NOTE(review): the two texts are joined with a plain `+`, i.e. without any
# separator between the M text and the D text -- confirm this is intended.
for text_level in (1, 2, 3, 4):
    text_m = df_report_pairs[f'text{text_level}M']
    text_d = df_report_pairs[f'text{text_level}D']
    df_report_pairs[f'text{text_level}P'] = text_m + text_d

In [6]:
# Combine the M and D token columns of every pair into a single P column.
# The outer loop runs over the variant suffix so the new columns are created
# in the same order as before: 1PL..4PL first, then 1PS..4PS.
# (presumably each cell holds a list of tokens, so `+` concatenates them -- verify)
for variant in ('L', 'S'):
    for token_level in (1, 2, 3, 4):
        tokens_m = df_report_pairs[f'tokens{token_level}M{variant}']
        tokens_d = df_report_pairs[f'tokens{token_level}D{variant}']
        df_report_pairs[f'tokens{token_level}P{variant}'] = tokens_m + tokens_d

In [7]:
def _dense_tfidf(vectorizer, token_column: str):
    """
    Joins the tokens of every row of df_report_pairs[token_column] with spaces
    Transforms the resulting documents with the given (already fitted) vectorizer
    :returns: the transformed matrix converted to a dense array via .toarray()
    """
    documents = df_report_pairs[token_column].apply(' '.join)
    return vectorizer.transform(documents).toarray()


data1L = _dense_tfidf(tfidf1L, 'tokens1PL')
data2L = _dense_tfidf(tfidf2L, 'tokens2PL')
data3L = _dense_tfidf(tfidf3L, 'tokens3PL')
data4L = _dense_tfidf(tfidf4L, 'tokens4PL')
data1S = _dense_tfidf(tfidf1S, 'tokens1PS')
data2S = _dense_tfidf(tfidf2S, 'tokens2PS')
data3S = _dense_tfidf(tfidf3S, 'tokens3PS')
data4S = _dense_tfidf(tfidf4S, 'tokens4PS')

In [8]:
# Show the shape of every feature matrix: the four L matrices, then the four S
# matrices. The literal '\n' argument keeps the original two-line layout.
shapes_L = [matrix.shape for matrix in (data1L, data2L, data3L, data4L)]
shapes_S = [matrix.shape for matrix in (data1S, data2S, data3S, data4S)]
print(*shapes_L, '\n', *shapes_S)

(5000, 5189) (5000, 5216) (5000, 11703) (5000, 6182) 
 (5000, 4266) (5000, 4286) (5000, 9599) (5000, 5012)


In [9]:
# Binary target: rows whose 'type' equals 'master' get 0, every other row gets 1.
is_not_master = df_report_pairs['type'] != 'master'
df_report_pairs['duplicate'] = is_not_master.astype('int64')
# Class balance of the target (displayed as the cell output).
df_report_pairs['duplicate'].value_counts()

0    4000
1    1000
Name: duplicate, dtype: int64

# Preparación de los conjuntos de entrenamiento y test

In [10]:
from sklearn.model_selection import train_test_split

y = df_report_pairs['duplicate'].values

# Split all eight feature matrices together with the target in ONE call.
# train_test_split applies the same row permutation to every array it is given,
# so each x*_train / x*_test is guaranteed to be row-aligned with the single
# y_train / y_test pair. Previously this alignment relied on repeating the same
# random_state in eight separate calls that each overwrote y_train and y_test.
(
    x1L_train, x1L_test,
    x2L_train, x2L_test,
    x3L_train, x3L_test,
    x4L_train, x4L_test,
    x1S_train, x1S_test,
    x2S_train, x2S_test,
    x3S_train, x3S_test,
    x4S_train, x4S_test,
    y_train, y_test,
) = train_test_split(
    data1L, data2L, data3L, data4L,
    data1S, data2S, data3S, data4S,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Sanity check: shapes of the train/test partitions for the 1L feature set and the target.
x1L_train.shape, x1L_test.shape, y_train.shape, y_test.shape

((4000, 5189), (1000, 5189), (4000,), (1000,))

In [12]:
# Sanity check: container types returned by the split.
type(x1L_train), type(x1L_test), type(y_train), type(y_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [13]:
# Sanity check: type of a single row of the features and of a single label.
type(x1L_train[0]), type(x1L_test[0]), type(y_train[0]), type(y_test[0])

(numpy.ndarray, numpy.ndarray, numpy.int64, numpy.int64)

# Máquinas de soporte vectorial

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def fit_svm(x_train, y_train, x_test, y_test) -> dict:
    """
    Trains a linear-kernel SVC (C=1.0, random_state=42) on x_train / y_train
    Predicts on x_test and scores the predictions against y_test
    No hyperparameter search is performed
    :returns: dict with keys
        - model: the fitted SVC
        - accuracy: value returned by accuracy_score
        - confusion_matrix: value returned by confusion_matrix
        - classification_report: text report (zero_division=0)
    """
    # SVC.fit returns the estimator itself, so construction and fitting can be chained.
    classifier = SVC(kernel='linear', C=1.0, random_state=42).fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    results = {'model': classifier}
    results['accuracy'] = accuracy_score(y_test, predictions)
    results['confusion_matrix'] = confusion_matrix(y_test, predictions)
    results['classification_report'] = classification_report(y_test, predictions, zero_division=0)
    return results

In [15]:
def show_model_results(title: str, results: dict):
    """
    Prints a plain-text summary of a results dict
    :param title: text shown in the header line of the summary
    :param results: dict with the keys 'accuracy', 'confusion_matrix' and
        'classification_report' (as returned by the fit_* functions)
    :returns: None, the summary is written to stdout
    """
    section_rule = "-------------------------------------------------------------------"
    inner_rule = "-------------------------------"

    # Header
    print(section_rule)
    print(f"Resultados del entrenamiento: {title}")
    print(section_rule)
    # Accuracy
    print(f"Accuracy: {results['accuracy']}")
    print(section_rule)
    # Confusion matrix
    print("Matriz de Confusión")
    print(inner_rule)
    print(results['confusion_matrix'])
    print(section_rule)
    # Classification report
    print("Reporte de Clasificación")
    print(inner_rule)
    print(results['classification_report'])
    # The closing rule is followed by one blank line.
    print(section_rule + "\n")

In [16]:
def _fit_show_save_svm(dataset: str, x_train, x_test) -> dict:
    """
    Fits the baseline SVM on one feature set, prints its metrics and pickles the results
    Uses the shared y_train / y_test labels of the notebook
    :param dataset: feature set label (e.g. '1L'); used in the printed title and the file name
    :returns: the results dict produced by fit_svm
    """
    results = fit_svm(x_train, y_train, x_test, y_test)
    show_model_results(f"SUPPORT VECTOR MACHINE - CONJUNTO {dataset}", results)
    # The context manager closes the file handle (and flushes the pickle) deterministically.
    with open(f'data/svm/svm{dataset}_results.pkl', 'wb') as results_file:
        pickle.dump(results, results_file)
    return results


svm1L_results = _fit_show_save_svm('1L', x1L_train, x1L_test)
svm2L_results = _fit_show_save_svm('2L', x2L_train, x2L_test)
svm3L_results = _fit_show_save_svm('3L', x3L_train, x3L_test)
svm4L_results = _fit_show_save_svm('4L', x4L_train, x4L_test)
svm1S_results = _fit_show_save_svm('1S', x1S_train, x1S_test)
svm2S_results = _fit_show_save_svm('2S', x2S_train, x2S_test)
svm3S_results = _fit_show_save_svm('3S', x3S_train, x3S_test)
svm4S_results = _fit_show_save_svm('4S', x4S_train, x4S_test)


-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 1L
-------------------------------------------------------------------
Accuracy: 0.806
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[770  14]
 [180  36]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.81      0.98      0.89       784
           1       0.72      0.17      0.27       216

    accuracy                           0.81      1000
   macro avg       0.77      0.57      0.58      1000
weighted avg       0.79      0.81      0.75      1000

-------------------------------------------------------------------

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACH

In [17]:
from sklearn.model_selection import GridSearchCV

def fit_and_tune_svm(x_train, y_train, x_test, y_test, custom_grid = None) -> dict:
    """
    Runs a 5-fold cross-validated grid search over SVC hyperparameters on x_train / y_train
    Takes the best estimator (refitted by the search) and evaluates it on x_test / y_test
    :param custom_grid: optional parameter grid for GridSearchCV; when it is
        None or empty, a default grid over C and kernel (random_state fixed to 42) is used
    :returns: dict with keys
        - model: the best SVC found by the search
        - accuracy: value returned by accuracy_score
        - confusion_matrix: value returned by confusion_matrix
        - classification_report: text report (zero_division=0)
    """
    if custom_grid:
        search_space = custom_grid
    else:
        search_space = {
            'C': [0.1, 1, 10, 100, 1000],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'random_state': [42]
        }

    tuner = GridSearchCV(SVC(), search_space, cv=5, n_jobs=-1, refit=True)
    tuner.fit(x_train, y_train)

    best_svm = tuner.best_estimator_
    predictions = best_svm.predict(x_test)

    results = {'model': best_svm}
    results['accuracy'] = accuracy_score(y_test, predictions)
    results['confusion_matrix'] = confusion_matrix(y_test, predictions)
    results['classification_report'] = classification_report(y_test, predictions, zero_division=0)
    return results

In [18]:
svm1L_tune_results = fit_and_tune_svm(x1L_train, y_train, x1L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm1L_results.pkl', 'wb') as results_file:
    pickle.dump(svm1L_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 1L",svm1L_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 1L
-------------------------------------------------------------------
Accuracy: 0.811
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[776   8]
 [181  35]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       784
           1       0.81      0.16      0.27       216

    accuracy                           0.81      1000
   macro avg       0.81      0.58      0.58      1000
weighted avg       0.81      0.81      0.76      1000

-------------------------------------------------------------------



In [19]:
svm2L_tune_results = fit_and_tune_svm(x2L_train, y_train, x2L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm2L_results.pkl', 'wb') as results_file:
    pickle.dump(svm2L_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 2L",svm2L_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 2L
-------------------------------------------------------------------
Accuracy: 0.845
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[755  29]
 [126  90]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       784
           1       0.76      0.42      0.54       216

    accuracy                           0.84      1000
   macro avg       0.81      0.69      0.72      1000
weighted avg       0.84      0.84      0.83      1000

-------------------------------------------------------------------



In [20]:
svm3L_tune_results = fit_and_tune_svm(x3L_train, y_train, x3L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm3L_results.pkl', 'wb') as results_file:
    pickle.dump(svm3L_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 3L",svm3L_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 3L
-------------------------------------------------------------------
Accuracy: 0.809
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[761  23]
 [168  48]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.82      0.97      0.89       784
           1       0.68      0.22      0.33       216

    accuracy                           0.81      1000
   macro avg       0.75      0.60      0.61      1000
weighted avg       0.79      0.81      0.77      1000

-------------------------------------------------------------------



In [21]:
svm4L_tune_results = fit_and_tune_svm(x4L_train, y_train, x4L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm4L_results.pkl', 'wb') as results_file:
    pickle.dump(svm4L_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 4L",svm4L_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 4L
-------------------------------------------------------------------
Accuracy: 0.828
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[771  13]
 [159  57]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.83      0.98      0.90       784
           1       0.81      0.26      0.40       216

    accuracy                           0.83      1000
   macro avg       0.82      0.62      0.65      1000
weighted avg       0.83      0.83      0.79      1000

-------------------------------------------------------------------



In [22]:
svm1S_tune_results = fit_and_tune_svm(x1S_train, y_train, x1S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm1S_results.pkl', 'wb') as results_file:
    pickle.dump(svm1S_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 1S",svm1S_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 1S
-------------------------------------------------------------------
Accuracy: 0.816
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[778   6]
 [178  38]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       784
           1       0.86      0.18      0.29       216

    accuracy                           0.82      1000
   macro avg       0.84      0.58      0.59      1000
weighted avg       0.82      0.82      0.76      1000

-------------------------------------------------------------------



In [23]:
svm2S_tune_results = fit_and_tune_svm(x2S_train, y_train, x2S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm2S_results.pkl', 'wb') as results_file:
    pickle.dump(svm2S_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 2S",svm2S_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 2S
-------------------------------------------------------------------
Accuracy: 0.846
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[757  27]
 [127  89]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       784
           1       0.77      0.41      0.54       216

    accuracy                           0.85      1000
   macro avg       0.81      0.69      0.72      1000
weighted avg       0.84      0.85      0.83      1000

-------------------------------------------------------------------



In [24]:
svm3S_tune_results = fit_and_tune_svm(x3S_train, y_train, x3S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm3S_results.pkl', 'wb') as results_file:
    pickle.dump(svm3S_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 3S",svm3S_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 3S
-------------------------------------------------------------------
Accuracy: 0.821
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[771  13]
 [166  50]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.82      0.98      0.90       784
           1       0.79      0.23      0.36       216

    accuracy                           0.82      1000
   macro avg       0.81      0.61      0.63      1000
weighted avg       0.82      0.82      0.78      1000

-------------------------------------------------------------------



In [25]:
svm4S_tune_results = fit_and_tune_svm(x4S_train, y_train, x4S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/svm/tuned_svm4S_results.pkl', 'wb') as results_file:
    pickle.dump(svm4S_tune_results, results_file)
show_model_results("SUPPORT VECTOR MACHINE - CONJUNTO 4S",svm4S_tune_results)

-------------------------------------------------------------------
Resultados del entrenamiento: SUPPORT VECTOR MACHINE - CONJUNTO 4S
-------------------------------------------------------------------
Accuracy: 0.841
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[752  32]
 [127  89]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.86      0.96      0.90       784
           1       0.74      0.41      0.53       216

    accuracy                           0.84      1000
   macro avg       0.80      0.69      0.72      1000
weighted avg       0.83      0.84      0.82      1000

-------------------------------------------------------------------



# Random Forest Classifier

In [26]:
# import random forest
from sklearn.ensemble import RandomForestClassifier

def fit_and_tune_random_forest(x_train, y_train, x_test, y_test, custom_grid=None) -> dict:
    """
    Searches for the best hyperparameters for a random forest classifier using x_train and y_train
    (5-fold cross-validated grid search), then evaluates the refitted best estimator on x_test / y_test
    Hyperparameters are tuned
    :param custom_grid: optional parameter grid for GridSearchCV, mirroring
        fit_and_tune_svm; when it is None or empty the default grid below is used,
        so existing four-argument calls behave exactly as before
    :returns: dict with keys
        - model: RandomForestClassifier (best estimator found by the search)
        - accuracy: float
        - confusion_matrix: np.array
        - classification_report: str
    """
    param_grid = custom_grid if custom_grid else {
        'n_estimators': [10, 50, 100, 200, 500],
        'max_depth': [None, 2, 5, 10],
        'criterion': ['gini', 'entropy'],
        # Single-value entry: fixes the seed of every candidate forest.
        'random_state': [42]
    }
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1, refit=True)
    grid_search.fit(x_train, y_train)
    # refit=True: best_estimator_ is the winning configuration refitted by the search.
    rf = grid_search.best_estimator_
    y_pred = rf.predict(x_test)
    return {
        'model': rf,
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred, zero_division=0)
    }

In [27]:
tuned_rfc1L_results = fit_and_tune_random_forest(x1L_train, y_train, x1L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc1L_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc1L_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 1L", tuned_rfc1L_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 1L
-------------------------------------------------------------------
Accuracy: 0.853
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[778   6]
 [141  75]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.85      0.99      0.91       784
           1       0.93      0.35      0.51       216

    accuracy                           0.85      1000
   macro avg       0.89      0.67      0.71      1000
weighted avg       0.86      0.85      0.83      1000

-------------------------------------------------------------------



In [28]:
tuned_rfc2L_results = fit_and_tune_random_forest(x2L_train, y_train, x2L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc2L_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc2L_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 2L", tuned_rfc2L_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 2L
-------------------------------------------------------------------
Accuracy: 0.878
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[772  12]
 [110 106]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       784
           1       0.90      0.49      0.63       216

    accuracy                           0.88      1000
   macro avg       0.89      0.74      0.78      1000
weighted avg       0.88      0.88      0.86      1000

-------------------------------------------------------------------



In [29]:
tuned_rfc3L_results = fit_and_tune_random_forest(x3L_train, y_train, x3L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc3L_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc3L_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 3L", tuned_rfc3L_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 3L
-------------------------------------------------------------------
Accuracy: 0.836
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[774  10]
 [154  62]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.83      0.99      0.90       784
           1       0.86      0.29      0.43       216

    accuracy                           0.84      1000
   macro avg       0.85      0.64      0.67      1000
weighted avg       0.84      0.84      0.80      1000

-------------------------------------------------------------------



In [30]:
tuned_rfc4L_results = fit_and_tune_random_forest(x4L_train, y_train, x4L_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc4L_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc4L_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 4L", tuned_rfc4L_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 4L
-------------------------------------------------------------------
Accuracy: 0.869
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[775   9]
 [122  94]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       784
           1       0.91      0.44      0.59       216

    accuracy                           0.87      1000
   macro avg       0.89      0.71      0.76      1000
weighted avg       0.87      0.87      0.85      1000

-------------------------------------------------------------------



In [31]:
tuned_rfc1S_results = fit_and_tune_random_forest(x1S_train, y_train, x1S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc1S_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc1S_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 1S", tuned_rfc1S_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 1S
-------------------------------------------------------------------
Accuracy: 0.849
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[779   5]
 [146  70]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       784
           1       0.93      0.32      0.48       216

    accuracy                           0.85      1000
   macro avg       0.89      0.66      0.70      1000
weighted avg       0.86      0.85      0.82      1000

-------------------------------------------------------------------



In [32]:
tuned_rfc2S_results = fit_and_tune_random_forest(x2S_train, y_train, x2S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc2S_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc2S_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 2S", tuned_rfc2S_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 2S
-------------------------------------------------------------------
Accuracy: 0.871
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[776   8]
 [121  95]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.87      0.99      0.92       784
           1       0.92      0.44      0.60       216

    accuracy                           0.87      1000
   macro avg       0.89      0.71      0.76      1000
weighted avg       0.88      0.87      0.85      1000

-------------------------------------------------------------------



In [33]:
tuned_rfc3S_results = fit_and_tune_random_forest(x3S_train, y_train, x3S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc3S_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc3S_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 3S", tuned_rfc3S_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 3S
-------------------------------------------------------------------
Accuracy: 0.832
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[772  12]
 [156  60]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.83      0.98      0.90       784
           1       0.83      0.28      0.42       216

    accuracy                           0.83      1000
   macro avg       0.83      0.63      0.66      1000
weighted avg       0.83      0.83      0.80      1000

-------------------------------------------------------------------



In [34]:
tuned_rfc4S_results = fit_and_tune_random_forest(x4S_train, y_train, x4S_test, y_test)
# Persist the results; the context manager closes the file handle deterministically.
with open('data/rfc/tuned_rfc4S_results.pkl', 'wb') as results_file:
    pickle.dump(tuned_rfc4S_results, results_file)
show_model_results("RANDOM FOREST CLASSIFIER - CONJUNTO 4S", tuned_rfc4S_results)

-------------------------------------------------------------------
Resultados del entrenamiento: RANDOM FOREST CLASSIFIER - CONJUNTO 4S
-------------------------------------------------------------------
Accuracy: 0.871
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[771  13]
 [116 100]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       784
           1       0.88      0.46      0.61       216

    accuracy                           0.87      1000
   macro avg       0.88      0.72      0.77      1000
weighted avg       0.87      0.87      0.85      1000

-------------------------------------------------------------------

