In [9]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from gensim.models import Word2Vec
import numpy as np
import pickle
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
import re

In [3]:
train = pd.read_csv("dataset/train/train.csv")
test = pd.read_csv("dataset/test/test.csv")

In [13]:
texts = train['message'].apply(preprocess_string).tolist()

# Entrenamos el modelo word2vec para vectorizar el texto
model_w2v = Word2Vec(sentences=texts, window=5, min_count=1, workers=-1)
model_w2v.save("./models/word2vec.model")
word_vectors = model_w2v.wv

In [14]:
longitud = train['message'].str.split()
long_max = longitud.str.len().max()

In [15]:
def obtener_vector_promedio(texto):
    tamano_vector = model_w2v.vector_size
    # Inicializar un vector de ceros
    vector_promedio = np.zeros(tamano_vector)
    
    # Filtrar las palabras que están en el vocabulario del modelo Word2Vec
    palabras_validas = [palabra for palabra in texto if palabra in model_w2v.wv.key_to_index]
    
    # Si hay palabras válidas, calcular el vector promedio
    if palabras_validas:
        vectores_palabras = np.array([model_w2v.wv[palabra] for palabra in palabras_validas])
        vector_promedio = vectores_palabras.mean(axis=0)
    
    return vector_promedio

In [16]:
# Aplicar la función de vectorización a los textos limpios
X_train = np.array([obtener_vector_promedio(texto) for texto in train['message'].apply(preprocess_string)])
X_test = np.array([obtener_vector_promedio(texto) for texto in test['message'].apply(preprocess_string)])
# Las etiquetas de las categorías
y_train = train.drop(columns=['message', 'subquestion_focus', 'answer'])
y_test = test.drop(columns=['message', 'subquestion_focus', 'answer'])

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# Definir el modelo Random Forest
model = RandomForestClassifier(criterion='gini', n_estimators=130, min_samples_split=3, random_state=15, class_weight='balanced_subsample')
# Entrenar el modelo
model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [18]:
y_pred = model.predict(X_test)

print("Report:\n", classification_report(y_test, y_pred))

Report:
                       precision    recall  f1-score   support

         Association       1.00      0.71      0.83         7
               Cause       1.00      0.93      0.96        14
        Complication       0.78      1.00      0.88         7
         Diagnose_me       1.00      1.00      1.00         1
           Diagnosis       1.00      0.78      0.88         9
          Indication       1.00      1.00      1.00         1
         Information       0.96      0.84      0.90        31
        Organization       1.00      0.82      0.90        17
           Prognosis       0.62      0.71      0.67         7
      Susceptibility       1.00      0.77      0.87        13
             Symptom       1.00      0.83      0.91         6
           Treatment       0.86      0.94      0.90       131
         association       0.67      1.00      0.80         2
               cause       0.61      0.73      0.67        26
        complication       1.00      1.00      1.00         

In [21]:
with open('model/model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [2]:
model_name = "distilgpt2"
model_gpt = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer_gpt = GPT2Tokenizer.from_pretrained(model_name)

In [18]:
# Crear una nueva columna en el DataFrame con preguntas modificadas
train['modified_query'] = 'Could you provide me with ' + train['subquestion_type'] + ' for ' + train['message'] + '?'

# Seleccionar las columnas relevantes para las preguntas y respuestas
qa_data = train.loc[:, ['modified_query', 'answer']]

# Extraer las preguntas y respuestas como listas
query_list = qa_data['modified_query'].tolist()
response_list = qa_data['answer'].tolist()

# Concatenar las preguntas y respuestas para el entrenamiento
training_data = [f"Query: {q} Response: {r}" for q, r in zip(query_list, response_list)]

# Tokenización
tokenizer_gpt.pad_token = tokenizer_gpt.eos_token
encoded_inputs = tokenizer_gpt(training_data, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Definir un Dataset personalizado
class CustomQADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: tensor[idx] for key, tensor in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Crear una instancia del Dataset personalizado
custom_dataset = CustomQADataset(encoded_inputs)

In [19]:
# Configuración de los argumentos de entrenamiento
training_config = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=500,
    logging_dir="./logs",
)

# Configuración del data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_gpt,
    mlm=False,
)

# Inicialización del Trainer
trainer_instance = Trainer(
    model=model_gpt,
    args=training_config,
    data_collator=data_collator,
    train_dataset=custom_dataset,
)

# Iniciar el entrenamiento
trainer_instance.train()

 53%|█████▎    | 500/948 [23:56<21:43,  2.91s/it]

{'loss': 2.6287, 'grad_norm': 5.275486469268799, 'learning_rate': 2.3628691983122365e-05, 'epoch': 1.58}


100%|██████████| 948/948 [45:39<00:00,  2.89s/it]

{'train_runtime': 2739.4142, 'train_samples_per_second': 0.692, 'train_steps_per_second': 0.346, 'train_loss': 2.5211611719574103, 'epoch': 3.0}





TrainOutput(global_step=948, training_loss=2.5211611719574103, metrics={'train_runtime': 2739.4142, 'train_samples_per_second': 0.692, 'train_steps_per_second': 0.346, 'total_flos': 247709319561216.0, 'train_loss': 2.5211611719574103, 'epoch': 3.0})

In [20]:
def format_response(response):
    # Extract the last complete sentence
    sentence_endings = ['.', '!', '?']
    for ending in sentence_endings:
        index = response.rfind(ending)
        if index != -1:
            response = response[:index + 1]
            break
    return response

def fetch_answer(query, max_len=256, temp=0.7, top_k_val=50, top_p_val=0.95):
    prompt_text = f"Question: {query}\nAnswer:"
    encoded_input = tokenizer_gpt(prompt_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        generated_output = model_gpt.generate(
            encoded_input["input_ids"],
            max_length=max_len,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=top_k_val,
            top_p=top_p_val,
            temperature=temp,
            pad_token_id=tokenizer_gpt.eos_token_id,
        )

    response_text = tokenizer_gpt.decode(generated_output[0], skip_special_tokens=True)
    trimmed_response = response_text[len(prompt_text):]  # Remove the prompt from the response
    return format_response(trimmed_response)

In [21]:
query_text = "I have a headache, what could it be?"
response_text = fetch_answer(query_text)
print(f"Response: {response_text}")

Response:  This is a rare, but common symptom. The symptoms usually come back as a mild headache.   - Mild headache  The headache lasts less than 1 1 week. Sometimes the headache starts for months or years. This may include  cold, hot, or hot days, if the symptoms are severe. If the headaches are not treated, the doctor may prescribe medicine to treat the problem. In addition, your doctor can prescribe an antiviral medicine. Medicines may help prevent the disease from spreading. Other types of medicines may also include:  Gastrointestinal  Swelling  Some people with this disorder may have diarrhea, diarrhea or vomiting. Others may need to be treated. Many people may not have any symptoms. Some may be more common. For example, a high fever may occur. It may happen within 3 to 4 weeks of the rash. Symptoms can start when the skin is dry and doesn't go away. Your doctor will talk with you about your symptoms and your signs and symptoms before making a decision. Ask a doctor about medicine

In [22]:
model_gpt.save_pretrained("./models/myModelgpt2")
tokenizer_gpt.save_pretrained("./models/myModelgpt2")

('./models/myModelgpt2/tokenizer_config.json',
 './models/myModelgpt2/special_tokens_map.json',
 './models/myModelgpt2/vocab.json',
 './models/myModelgpt2/merges.txt',
 './models/myModelgpt2/added_tokens.json')