In [None]:
import os
import pandas as pd


# Relative path of the semicolon-separated input file.
DIRECTORY = os.path.join('..', 'Input', 'brain_conditions.csv')

data = pd.read_csv(DIRECTORY, sep=';')

# Column dtypes and non-null counts, a blank separator, then the summary
# statistics with one row per described column (last expression is displayed).
data.info()
print('\n')
data.describe().T

In [None]:
# Relative frequency of each treatment (normalize=True yields proportions, not counts).
data['Treatment'].value_counts(normalize = True)

In [None]:
# Show the first three and the last three rows in one cell output.
display(
    data.head(3),
    data.tail(3)
)

In [None]:
# Distinct treatments in order of first appearance. A plain `set` iterates in a
# hash-dependent order that can differ between kernel sessions, and it would
# keep any NaN still present at this point, producing an empty "nan:" section.
treatments = data['Treatment'].dropna().unique()

for treatment in treatments:

    # One .loc lookup (row mask + column) instead of chained indexing.
    txts = data.loc[data['Treatment'] == treatment, 'Clinical Note'].head(3)

    print(f'\n{treatment}:')

    # Up to three example notes for this treatment.
    for txt in txts:
        print(txt)

In [None]:
# `re` is needed by the substitutions below (the original first line of this
# cell was fused with a fragment of another statement and did not parse).
import re

# Drop rows with any missing value, then store the low-cardinality columns as categoricals.
data.dropna(inplace=True)
data[['Condition', 'Sex', 'Treatment']] = data[['Condition', 'Sex', 'Treatment']].astype('category')

# Working copy of the note, lower-cased; the raw 'Clinical Note' column is left untouched.
data['Clinical_Note_copy'] = data['Clinical Note'].str.lower()
# Numbers: every run of digits becomes a single space.
data['Clinical_Note_copy'] = data['Clinical_Note_copy'].apply(lambda txt: re.sub(r'\d+', ' ', txt))
# Multiple spaces: collapse whitespace runs and trim both ends.
data['Clinical_Note_copy'] = data['Clinical_Note_copy'].apply(lambda txt: re.sub(r'\s+', ' ', txt).strip())
data

In [None]:
# Print column dtypes and non-null counts for the current state of `data`.
data.info()

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy pipeline once at module level; lemmatize_and_remove_stopwords calls it per note.
nlp = spacy.load("en_core_web_sm")


def lemmatize_and_remove_stopwords(txt):
    """Lemmatize ``txt`` and separate stop-word lemmas from the rest.

    Args:
        txt: Text passed to the module-level ``nlp`` pipeline.

    Returns:
        A ``(text, removed)`` tuple. ``text`` is the space-joined lemmas that
        are not in ``STOP_WORDS``; ``removed`` is the list of lemmas that are.
        Both keep token order.
    """
    lemmas = [token.lemma_ for token in nlp(txt)]

    # Membership is tested on the lemma itself, exactly as produced by the pipeline.
    kept = [lemma for lemma in lemmas if lemma not in STOP_WORDS]
    removed = [lemma for lemma in lemmas if lemma in STOP_WORDS]

    return ' '.join(kept), removed


# Each call returns (text, removed). Wrapping the pair in pd.Series makes
# apply() produce a two-column frame, which is assigned positionally: the
# lemmatized text overwrites 'Clinical_Note_copy' and the list of removed
# lemmas goes into the new 'removed_stopwords' column.
data[['Clinical_Note_copy', 'removed_stopwords']] = data['Clinical_Note_copy'].apply(
    lambda x: pd.Series(lemmatize_and_remove_stopwords(x))
)

data

In [None]:
# Flatten the per-note lists into one long Series (one entry per removed lemma).
all_removed = data['removed_stopwords'].explode()
# Distinct removed lemmas, ignoring missing entries.
unique_removed = pd.unique(all_removed.dropna())

unique_removed

In [None]:
from sklearn.model_selection import train_test_split


# Deterministic shuffle of all rows.
data = data.sample(frac=1, random_state=666)

# Features: three structured columns plus the pre-processed note text.
X = data[['Condition', 'Age', 'Sex', 'Clinical_Note_copy']]
# Target kept as a one-column frame.
y = data[['Treatment']]

# Stratified 80/20 split. random_state pins the split's own shuffle; without it
# the partition (and every metric computed from it) changes on each run even
# though the frame above is shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=666
)

In [None]:
# Two import statements had been fused onto a single line (a syntax error); they are separated here.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier



# Group the training columns by dtype so each group can get its own preprocessing.
categorical_features = list(X_train.select_dtypes(include='category').columns)
txt_features = list(X_train.select_dtypes(include='object').columns)
numerical_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)


# One-hot encoding for the categorical columns: unseen categories are ignored
# at transform time and binary features keep a single indicator column.
categorical_pipeline = Pipeline(
    steps=[
        (
            'ohe',
            OneHotEncoder(
                handle_unknown='ignore',
                drop='if_binary',
                sparse_output=False,
            ),
        ),
    ]
)


def extract_first_column(x):
    """Return the first column of ``x``, selected by position.

    ``x`` must expose the pandas ``.iloc`` indexer (e.g. a DataFrame); all rows
    are kept. Used as the first step of the text pipeline, ahead of the
    TF-IDF vectorizer.
    """
    first_position = 0
    return x.iloc[:, first_position]

# Text branch: pull the text column out of the frame, then vectorize it with TF-IDF.
txt_pipeline = Pipeline(
    steps=[
        ('extract', FunctionTransformer(func=extract_first_column, validate=False)),
        ('Tfidf', TfidfVectorizer(max_df=0.7, min_df=0.004)),
    ]
)

# Numeric branch: standardize.
numerical_pipeline = Pipeline(steps=[('standard', StandardScaler())])

# Apply each branch to its own column group; outputs are concatenated in this order.
transformer = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_features),
        ('txt', txt_pipeline, txt_features),
        ('num', numerical_pipeline, numerical_features),
    ]
)


# Preprocessing followed by a random forest with a fixed seed.
pipe = Pipeline(
    steps=[
        ('preprocessor', transformer),
        ('classifier', RandomForestClassifier(random_state=666, n_jobs=-1)),
    ]
)

pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score

y_pred_train = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

# (printed label, metric, true labels, predictions, extra keyword arguments),
# listed in the order they are reported: each metric on train, then on test.
baseline_reports = [
    ('balanced accuracy train:', balanced_accuracy_score, y_train, y_pred_train, {}),
    ('balanced accuracy test:', balanced_accuracy_score, y_test, y_pred_test, {}),
    ('precision score train:', precision_score, y_train, y_pred_train, {'average': 'micro'}),
    ('precision score test:', precision_score, y_test, y_pred_test, {'average': 'micro'}),
    ('recall score train:', recall_score, y_train, y_pred_train, {'average': 'micro'}),
    ('recall score test:', recall_score, y_test, y_pred_test, {'average': 'micro'}),
    ('f1 score train:', f1_score, y_train, y_pred_train, {'average': 'micro'}),
    ('f1 score test:', f1_score, y_test, y_pred_test, {'average': 'micro'}),
]

for metric_label, metric_fn, y_true_split, y_pred_split, metric_kwargs in baseline_reports:
    print(metric_label, metric_fn(y_true_split, y_pred_split, **metric_kwargs))

In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

def matriz(yt, yp):
    """Build a labelled confusion matrix as a DataFrame.

    Args:
        yt: True class labels (anything ``np.ravel`` can flatten).
        yp: Predicted class labels, aligned with ``yt``.

    Returns:
        ``pd.DataFrame`` of counts. Rows are true classes, labelled
        "<label> (Clase Real)"; columns are predicted classes, labelled
        "<label> (Predicción)". Both follow the sorted order of the labels.
    """
    # Use every label seen in either vector. Taking them from ``yt`` alone makes
    # confusion_matrix silently discard samples whose predicted class never
    # occurs among the true labels, so the matrix would under-count errors.
    labels = np.unique(np.concatenate([np.ravel(yt), np.ravel(yp)]))
    matrix = confusion_matrix(y_true=yt, y_pred=yp, labels=labels)

    index = [f"{label} (Clase Real)" for label in labels]
    columns = [f"{label} (Predicción)" for label in labels]

    return pd.DataFrame(matrix, index=index, columns=columns)


# Confusion matrices of the fitted `pipe`, training split first, then test split.
matrix_train = matriz(yt=y_train, yp=pipe.predict(X_train))
matrix_test = matriz(yt=y_test, yp=pipe.predict(X_test))

display(matrix_train, matrix_test)

In [None]:
rf_model = pipe['classifier']

# Mean depth and mean node count over the individual trees of the fitted forest.
profundidad_promedio = np.mean(
    [estimator.tree_.max_depth for estimator in rf_model.estimators_]
)
nodos_promedio = np.mean(
    [estimator.tree_.node_count for estimator in rf_model.estimators_]
)

print(f'max_depth AVG: {profundidad_promedio}')
print(f'nodes number AVG: {nodos_promedio}')

In [None]:
preprocessor = pipe['preprocessor']

# Rebuild the output column names branch by branch, in the same order the
# ColumnTransformer was declared with: cat, txt, num.
cat_ohe = preprocessor.named_transformers_['cat']['ohe']
cat_feature_names = cat_ohe.get_feature_names_out(categorical_features)

txt_vectorizer = preprocessor.named_transformers_['txt']['Tfidf']
txt_feature_names = txt_vectorizer.get_feature_names_out()

num_feature_names = numerical_features

feature_names = np.concatenate(
    (cat_feature_names, txt_feature_names, num_feature_names)
)

rf = pipe['classifier']

# Pair every feature name with its importance and list the largest first.
importances_df = pd.DataFrame(
    {'feature': feature_names, 'importance': rf.feature_importances_}
).sort_values(by='importance', ascending=False)
importances_df.head(20)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    # Five evenly spaced fractions from 0.0001 to 0.0003. linspace fixes the
    # number of candidates; arange with a float step can gain or lose its last
    # value through rounding, silently changing the size of the grid.
    'classifier__min_samples_split': np.linspace(0.0001, 0.0003, 5),
    'preprocessor__txt__Tfidf__ngram_range': [(1, 1), (1, 2)],
    'classifier__n_estimators': [45, 55, 65],
    'classifier__max_depth': [38, 34],
    'classifier__criterion': ['gini', 'entropy']
}


# Exhaustive search over the grid with 4-fold cross-validation, scored by
# balanced accuracy; refit=True retrains the best candidate on all of X_train.
model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=4,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True,
    verbose=1
)

model.fit(X_train, y_train)

In [None]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print('mejores parámetros encontrados:', model.best_params_)
print()

# (printed label, metric, true labels, predictions, extra keyword arguments),
# listed in the order they are reported: each metric on train, then on test.
tuned_reports = [
    ('balanced accuracy train:', balanced_accuracy_score, y_train, y_pred_train, {}),
    ('balanced accuracy test:', balanced_accuracy_score, y_test, y_pred_test, {}),
    ('precision score train:', precision_score, y_train, y_pred_train, {'average': 'micro'}),
    ('precision score test:', precision_score, y_test, y_pred_test, {'average': 'micro'}),
    ('recall score train:', recall_score, y_train, y_pred_train, {'average': 'micro'}),
    ('recall score test:', recall_score, y_test, y_pred_test, {'average': 'micro'}),
    ('f1 score train:', f1_score, y_train, y_pred_train, {'average': 'micro'}),
    ('f1 score test:', f1_score, y_test, y_pred_test, {'average': 'micro'}),
]

for metric_label, metric_fn, y_true_split, y_pred_split, metric_kwargs in tuned_reports:
    print(metric_label, metric_fn(y_true_split, y_pred_split, **metric_kwargs))

print()
# GridSearchCV.score uses the scorer the search was configured with.
print('score train:', model.score(X_train, y_train))
print('score test:', model.score(X_test, y_test))

In [None]:
# Confusion matrices of the fitted search object `model`, training split first, then test split.
matrix_train = matriz(yt=y_train, yp=model.predict(X_train))
matrix_test = matriz(yt=y_test, yp=model.predict(X_test))

display(matrix_train, matrix_test)

In [None]:
# One row per grid candidate with its cross-validated score.
results = pd.DataFrame(model.cv_results_)

results = results[
    [
        'param_classifier__criterion',
        'param_preprocessor__txt__Tfidf__ngram_range',
        'param_classifier__n_estimators',
        'param_classifier__max_depth',
        # Also part of param_grid: without this column, candidates that differ
        # only in min_samples_split show up as indistinguishable rows.
        'param_classifier__min_samples_split',
        'mean_test_score',
        'std_test_score',
        'rank_test_score'
    ]
# A single stable sort, best score first; ties keep the order of cv_results_.
].sort_values('mean_test_score', ascending=False, kind='stable').reset_index(drop=True)

results

In [None]:
# import pickle

# os.makedirs('../Output/models', exist_ok=True)

# with open('../Output/models/model_txt.pkl', 'wb') as file:
#     pickle.dump(model, file)