In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def _load_pickle(path):
    """Load one pickled object from ``path`` and close the file handle afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Preprocessed inputs for this notebook.
df_reports = pd.read_pickle('data/preprocessed_reports.pkl')
df_report_pairs = pd.read_pickle('data/preprocessed_report_pairs.pkl')

# One pickled vectorizer per text variant (1-4) and token family (L / S).
# Presumably already-fitted TfidfVectorizer objects, since only .transform() is called on them later.
tfidf1L = _load_pickle('data/vectorizer/tfidf1L.pkl')
tfidf2L = _load_pickle('data/vectorizer/tfidf2L.pkl')
tfidf3L = _load_pickle('data/vectorizer/tfidf3L.pkl')
tfidf4L = _load_pickle('data/vectorizer/tfidf4L.pkl')
tfidf1S = _load_pickle('data/vectorizer/tfidf1S.pkl')
tfidf2S = _load_pickle('data/vectorizer/tfidf2S.pkl')
tfidf3S = _load_pickle('data/vectorizer/tfidf3S.pkl')
tfidf4S = _load_pickle('data/vectorizer/tfidf4S.pkl')

# Preparación del dataset

In [3]:
# Display the column labels of the report-pairs frame.
df_report_pairs.columns

Index(['type', 'id_M', 'id_D', 'statusM', 'statusD', 'componentM',
       'componentD', 'summaryM', 'summaryD', 'commentsM', 'commentsD',
       'text1M', 'text1D', 'text2M', 'text2D', 'text3M', 'text3D', 'text4M',
       'text4D', 'tokens1ML', 'tokens2ML', 'tokens3ML', 'tokens4ML',
       'tokens1DL', 'tokens2DL', 'tokens3DL', 'tokens4DL', 'tokens1MS',
       'tokens2MS', 'tokens3MS', 'tokens4MS', 'tokens1DS', 'tokens2DS',
       'tokens3DS', 'tokens4DS'],
      dtype='object')

In [4]:
# Conceptually, the M and D reports of every pair would be combined as:
#   textP   = textM + textD
#   tokensP = tokenizer(textP)
# To save time, the token columns that already exist are reused instead of
# re-tokenizing the combined text.
#
# NOTE(review): the two texts are joined with no separator, so the last word of M
# and the first word of D may run together - confirm whether that matters.
for text_variant in range(1, 5):
    master_text = df_report_pairs[f'text{text_variant}M']
    duplicate_text = df_report_pairs[f'text{text_variant}D']
    df_report_pairs[f'text{text_variant}P'] = master_text + duplicate_text

In [5]:
# Build the combined token column for each pair: the M tokens followed by the D
# tokens, for every text variant (1-4) and both token families (suffixes L and S).
# The loops create the columns in the order 1PL..4PL, then 1PS..4PS.
for token_family in ('L', 'S'):
    for text_variant in range(1, 5):
        master_tokens = df_report_pairs[f'tokens{text_variant}M{token_family}']
        duplicate_tokens = df_report_pairs[f'tokens{text_variant}D{token_family}']
        df_report_pairs[f'tokens{text_variant}P{token_family}'] = master_tokens + duplicate_tokens

In [6]:
def _tfidf_matrix(vectorizer, token_column):
    """Join each row's tokens with single spaces, transform them with ``vectorizer`` and return a dense array."""
    documents = token_column.apply(' '.join)
    return vectorizer.transform(documents).toarray()


# One dense feature matrix per (text variant, token family) combination.
data1L = _tfidf_matrix(tfidf1L, df_report_pairs['tokens1PL'])
data2L = _tfidf_matrix(tfidf2L, df_report_pairs['tokens2PL'])
data3L = _tfidf_matrix(tfidf3L, df_report_pairs['tokens3PL'])
data4L = _tfidf_matrix(tfidf4L, df_report_pairs['tokens4PL'])
data1S = _tfidf_matrix(tfidf1S, df_report_pairs['tokens1PS'])
data2S = _tfidf_matrix(tfidf2S, df_report_pairs['tokens2PS'])
data3S = _tfidf_matrix(tfidf3S, df_report_pairs['tokens3PS'])
data4S = _tfidf_matrix(tfidf4S, df_report_pairs['tokens4PS'])

In [7]:
# Print the shapes of the four L matrices on one line and the four S matrices on the next.
shapes_L = [data1L.shape, data2L.shape, data3L.shape, data4L.shape]
shapes_S = [data1S.shape, data2S.shape, data3S.shape, data4S.shape]
print(*shapes_L, '\n', *shapes_S)

(500, 1986) (500, 2015) (500, 5219) (500, 2445) 
 (500, 1695) (500, 1714) (500, 4324) (500, 2056)


In [8]:
# Binary target: 0 where the pair type is exactly 'master', 1 for every other value.
is_master_pair = df_report_pairs['type'] == 'master'
df_report_pairs['duplicate'] = (~is_master_pair).astype('int64')
df_report_pairs['duplicate'].value_counts()

0    400
1    100
Name: duplicate, dtype: int64

# Preparación de los conjuntos de entrenamiento y test

In [9]:
from sklearn.model_selection import train_test_split

# Label vector shared by all eight feature matrices.
y = df_report_pairs['duplicate'].values

# Split every feature matrix and the labels in ONE call. train_test_split applies the
# same shuffled indices to all arrays it receives, so the rows of each x*_train /
# x*_test matrix line up with y_train / y_test. The earlier version called it eight
# times, overwrote y_train / y_test on each call, and relied on matching seeds for alignment.
#
# NOTE(review): the classes are imbalanced (see the value counts of 'duplicate');
# consider stratify=y. It is not applied here so the resulting split stays unchanged.
(
    x1L_train, x1L_test,
    x2L_train, x2L_test,
    x3L_train, x3L_test,
    x4L_train, x4L_test,
    x1S_train, x1S_test,
    x2S_train, x2S_test,
    x3S_train, x3S_test,
    x4S_train, x4S_test,
    y_train, y_test,
) = train_test_split(
    data1L, data2L, data3L, data4L,
    data1S, data2S, data3S, data4S,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Sanity check: shapes of the 1L train/test matrices and of the label vectors.
x1L_train.shape, x1L_test.shape, y_train.shape, y_test.shape

((400, 1986), (100, 1986), (400,), (100,))

In [11]:
# Sanity check: container types of the 1L train/test matrices and of the label vectors.
type(x1L_train), type(x1L_test), type(y_train), type(y_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [12]:
# Sanity check: type of the first element of each split (one feature row / one label).
type(x1L_train[0]), type(x1L_test[0]), type(y_train[0]), type(y_test[0])

(numpy.ndarray, numpy.ndarray, numpy.int64, numpy.int64)

# Máquinas de soporte vectorial

In [13]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def fit_svm(x_train, y_train, x_test, y_test) -> dict:
    """
    Train a linear-kernel SVC with fixed settings and score it on the test split.

    No hyperparameter search is performed: the classifier is always
    ``SVC(kernel='linear', C=1.0, random_state=42)``.

    :param x_train: training features passed to ``SVC.fit``
    :param y_train: training labels passed to ``SVC.fit``
    :param x_test: features the fitted classifier predicts on
    :param y_test: true labels the predictions are compared against
    :returns: dict with keys
        - model: the fitted SVC
        - accuracy: float
        - confusion_matrix: np.array
        - classification_report: str
    """
    # SVC.fit returns the estimator itself, so construction and training can be chained.
    classifier = SVC(kernel='linear', C=1.0, random_state=42).fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    results = {'model': classifier}
    results['accuracy'] = accuracy_score(y_test, predictions)
    results['confusion_matrix'] = confusion_matrix(y_test, predictions)
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    results['classification_report'] = classification_report(y_test, predictions, zero_division=0)
    return results

In [14]:
def show_model_results(title: str, svm_results: dict, model_name: str = None):
    """
    Print a formatted report (accuracy, confusion matrix, classification report)
    for one fitted model.

    The header used to say "Máquina de Soporte Vectorial (SVM)" unconditionally,
    which was wrong when the function was reused for other classifiers. The label
    is now resolved as follows:

    * ``model_name`` if given;
    * otherwise from the class name of ``svm_results['model']``
      (``SVC`` keeps the original SVM wording, ``RandomForestClassifier`` becomes
      "Random Forest", any other class is shown by its class name);
    * otherwise, when there is no ``'model'`` entry, the original SVM wording.

    :param title: text printed after "CONJUNTO: "
    :param svm_results: dict with keys ``accuracy``, ``confusion_matrix`` and
        ``classification_report``; ``model`` is optional and only used for the header
    :param model_name: optional explicit model label for the header line
    :returns: None (output is written with ``print``)
    """
    if model_name is None:
        known_labels = {
            'SVC': "Máquina de Soporte Vectorial (SVM)",
            'RandomForestClassifier': "Random Forest",
        }
        model = svm_results.get('model')
        if model is None:
            # No model stored: keep the historical header so existing output is unchanged.
            model_name = known_labels['SVC']
        else:
            class_name = type(model).__name__
            model_name = known_labels.get(class_name, class_name)

    long_rule = "-------------------------------------------------------------------"
    short_rule = "-------------------------------"

    print(long_rule)
    print(f"Resultados del entrenamiento con {model_name}")
    print(f"CONJUNTO: {title}")
    print(long_rule)
    print(f"Accuracy: {svm_results['accuracy']}")
    print(long_rule)
    print("Matriz de Confusión")
    print(short_rule)
    print(svm_results['confusion_matrix'])
    print(long_rule)
    print("Reporte de Clasificación")
    print(short_rule)
    print(svm_results['classification_report'])
    print(long_rule + "\n")

In [15]:
# Train the fixed-setting SVM on each of the eight feature sets, in the order
# 1L..4L then 1S..4S; every run uses the same y_train / y_test labels.
_svm_splits = [
    (x1L_train, x1L_test),
    (x2L_train, x2L_test),
    (x3L_train, x3L_test),
    (x4L_train, x4L_test),
    (x1S_train, x1S_test),
    (x2S_train, x2S_test),
    (x3S_train, x3S_test),
    (x4S_train, x4S_test),
]
(
    svm1L_results,
    svm2L_results,
    svm3L_results,
    svm4L_results,
    svm1S_results,
    svm2S_results,
    svm3S_results,
    svm4S_results,
) = [
    fit_svm(features_train, y_train, features_test, y_test)
    for features_train, features_test in _svm_splits
]

In [16]:
# Print the report of each fixed-setting SVM, in the order 1L..4L then 1S..4S.
for set_label, set_results in [
    ("1L", svm1L_results),
    ("2L", svm2L_results),
    ("3L", svm3L_results),
    ("4L", svm4L_results),
    ("1S", svm1S_results),
    ("2S", svm2S_results),
    ("3S", svm3S_results),
    ("4S", svm4S_results),
]:
    show_model_results(f"SUPPORT VECTOR MACHINE - CONJUNTO {set_label}", set_results)

-------------------------------------------------------------------
Resultados del entrenamiento con Máquina de Soporte Vectorial (SVM)
CONJUNTO: SUPPORT VECTOR MACHINE - CONJUNTO 1L
-------------------------------------------------------------------
Accuracy: 0.71
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[71  1]
 [28  0]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.72      0.99      0.83        72
           1       0.00      0.00      0.00        28

    accuracy                           0.71       100
   macro avg       0.36      0.49      0.42       100
weighted avg       0.52      0.71      0.60       100

-------------------------------------------------------------------

-------------------------------------------------------------------
Result

In [17]:
# import grid search
from sklearn.model_selection import GridSearchCV

def fit_and_tune_svm(x_train, y_train, x_test, y_test, custom_grid=None) -> dict:
    """
    Grid-search SVC hyperparameters with 5-fold cross-validation on the training
    split, then score the best estimator on the test split.

    :param x_train: training features used by the grid search
    :param y_train: training labels used by the grid search
    :param x_test: features the best estimator predicts on
    :param y_test: true labels the predictions are compared against
    :param custom_grid: optional ``param_grid`` for ``GridSearchCV``. Only ``None``
        selects the default grid below; any other value (including an empty dict)
        is passed through unchanged. The previous truthiness check silently
        replaced an explicit empty grid with the default one.
    :returns: dict with keys
        - model: the best SVC found, refitted on the whole training split
        - accuracy: float
        - confusion_matrix: np.array
        - classification_report: str
    """
    if custom_grid is None:
        param_grid = {
            'C': [0.1, 1, 10, 100, 1000],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'random_state': [42]
        }
    else:
        param_grid = custom_grid

    # refit=True retrains the best parameter combination on all of x_train, which
    # is what makes best_estimator_ available.
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1, refit=True)
    grid_search.fit(x_train, y_train)
    svm = grid_search.best_estimator_
    y_pred = svm.predict(x_test)
    return {
        'model': svm,
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        # zero_division=0 reports 0 instead of warning when a class is never predicted.
        'classification_report': classification_report(y_test, y_pred, zero_division=0)
    }

In [18]:
def _tune_svm_and_report(set_label, features_train, features_test):
    """Tune an SVM on one feature set, print its report immediately and return the results dict."""
    tuned = fit_and_tune_svm(features_train, y_train, features_test, y_test)
    show_model_results(f"SUPPORT VECTOR MACHINE - CONJUNTO {set_label}", tuned)
    return tuned


# Each feature set is tuned and reported before the next one starts.
svm1L_tune_results = _tune_svm_and_report("1L", x1L_train, x1L_test)
svm2L_tune_results = _tune_svm_and_report("2L", x2L_train, x2L_test)
svm3L_tune_results = _tune_svm_and_report("3L", x3L_train, x3L_test)
svm4L_tune_results = _tune_svm_and_report("4L", x4L_train, x4L_test)
svm1S_tune_results = _tune_svm_and_report("1S", x1S_train, x1S_test)
svm2S_tune_results = _tune_svm_and_report("2S", x2S_train, x2S_test)
svm3S_tune_results = _tune_svm_and_report("3S", x3S_train, x3S_test)
svm4S_tune_results = _tune_svm_and_report("4S", x4S_train, x4S_test)

-------------------------------------------------------------------
Resultados del entrenamiento con Máquina de Soporte Vectorial (SVM)
CONJUNTO: SUPPORT VECTOR MACHINE - CONJUNTO 1L
-------------------------------------------------------------------
Accuracy: 0.73
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[70  2]
 [25  3]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        72
           1       0.60      0.11      0.18        28

    accuracy                           0.73       100
   macro avg       0.67      0.54      0.51       100
weighted avg       0.70      0.73      0.65       100

-------------------------------------------------------------------

-------------------------------------------------------------------
Result

# Random Forest Classifier

In [19]:
# import random forest
from sklearn.ensemble import RandomForestClassifier

def fit_and_tune_random_forest(x_train, y_train, x_test, y_test, custom_grid=None) -> dict:
    """
    Grid-search RandomForestClassifier hyperparameters with 5-fold cross-validation
    on the training split, then score the best estimator on the test split.

    :param x_train: training features used by the grid search
    :param y_train: training labels used by the grid search
    :param x_test: features the best estimator predicts on
    :param y_test: true labels the predictions are compared against
    :param custom_grid: optional ``param_grid`` for ``GridSearchCV``, mirroring
        ``fit_and_tune_svm``. ``None`` (the default) selects the grid below, so
        existing four-argument calls behave as before.
    :returns: dict with keys
        - model: the best RandomForestClassifier found, refitted on the whole training split
        - accuracy: float
        - confusion_matrix: np.array
        - classification_report: str
    """
    if custom_grid is None:
        param_grid = {
            'n_estimators': [10, 50, 100, 200, 500],
            'max_depth': [None, 2, 5, 10],
            'criterion': ['gini', 'entropy'],
            'random_state': [42]
        }
    else:
        param_grid = custom_grid

    # refit=True retrains the best parameter combination on all of x_train, which
    # is what makes best_estimator_ available.
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1, refit=True)
    grid_search.fit(x_train, y_train)
    rf = grid_search.best_estimator_
    y_pred = rf.predict(x_test)
    return {
        'model': rf,
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        # zero_division=0 reports 0 instead of warning when a class is never predicted.
        'classification_report': classification_report(y_test, y_pred, zero_division=0)
    }

In [20]:
def _tune_rf_and_report(set_label, features_train, features_test):
    """Tune a random forest on one feature set, print its report immediately and return the results dict."""
    tuned = fit_and_tune_random_forest(features_train, y_train, features_test, y_test)
    show_model_results(f"RANDOM FOREST CLASSIFIER - CONJUNTO {set_label}", tuned)
    return tuned


# Each feature set is tuned and reported before the next one starts.
rfc1L = _tune_rf_and_report("1L", x1L_train, x1L_test)
rfc2L = _tune_rf_and_report("2L", x2L_train, x2L_test)
rfc3L = _tune_rf_and_report("3L", x3L_train, x3L_test)
rfc4L = _tune_rf_and_report("4L", x4L_train, x4L_test)
rfc1S = _tune_rf_and_report("1S", x1S_train, x1S_test)
rfc2S = _tune_rf_and_report("2S", x2S_train, x2S_test)
rfc3S = _tune_rf_and_report("3S", x3S_train, x3S_test)
rfc4S = _tune_rf_and_report("4S", x4S_train, x4S_test)

-------------------------------------------------------------------
Resultados del entrenamiento con Máquina de Soporte Vectorial (SVM)
CONJUNTO: RANDOM FOREST CLASSIFIER - CONJUNTO 1L
-------------------------------------------------------------------
Accuracy: 0.73
-------------------------------------------------------------------
Matriz de Confusión
-------------------------------
[[70  2]
 [25  3]]
-------------------------------------------------------------------
Reporte de Clasificación
-------------------------------
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        72
           1       0.60      0.11      0.18        28

    accuracy                           0.73       100
   macro avg       0.67      0.54      0.51       100
weighted avg       0.70      0.73      0.65       100

-------------------------------------------------------------------

-------------------------------------------------------------------
Resu