## Read the file

In [2]:
# Load the conversation file into a list with one entry per line.
# Iterating the handle keeps each line's trailing newline, as readlines() does.
file_path = "../data/private/FT.txt"
with open(file_path, mode='r', encoding='utf-8') as f:
    lines = list(f)

len(lines)

72591

## Clean the conversation

In [3]:
import re

# --- Substring markers -------------------------------------------------------
# Any line containing one of these is discarded by the filtering loop below.
encryption_message = "Messages and calls are end-to-end encrypted. Only people in this chat can read, listen to, or share them."
media_pattern = "<media omitted>"
image_pattern = "image omitted"
audio_pattern = "audio omitted"
sticker_pattern = "sticker omitted"
video_pattern = "video omitted"
document_pattern = "document omitted"

# --- Regexes for personal data ----------------------------------------------
# Lines matching either pattern are discarded entirely.
# The TLD class is [A-Za-z]: the earlier [A-Z|a-z] also accepted a literal "|",
# so strings such as "x@y.||" were wrongly treated as e-mail addresses.
email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Marker that is stripped out of a line (the rest of the line is kept).
edited_message = "<This message was edited>"

# More discard markers: deleted messages and group-management notices.
deleted_message = "You deleted this message"
# Compared against the last word of a line rather than searched as a substring.
null_message = "null"
created_group_message = "created group"
added_you_to_group_message = "added you"

# @mentions are removed from the text of the lines that are kept.
tagging_pattern = r'@[\w]+'


# Substrings that cause a line to be dropped outright.
unwanted_markers = (
    encryption_message,
    deleted_message,
    image_pattern,
    audio_pattern,
    sticker_pattern,
    video_pattern,
    document_pattern,
    media_pattern,
    created_group_message,
    added_you_to_group_message,
)

filtered_lines = []
for line in lines:
    # `lines` comes from readlines(), so every line still ends in "\n".
    # Compare the last word of the *stripped* line; on the raw line the last
    # token is "null\n", which never equals "null" and disabled this filter.
    if line.strip().split(" ")[-1] == null_message:
        continue

    # Drop lines that contain any of the discard markers.
    if any(marker in line for marker in unwanted_markers):
        continue

    # Drop lines that carry an e-mail address or a URL.
    if re.search(email_pattern, line) or re.search(url_pattern, line):
        continue

    # Keep the line, minus the "edited" marker and any @mentions.
    line = line.replace(edited_message, "").strip()
    line = re.sub(tagging_pattern, "", line).strip()
    filtered_lines.append(line)

# Join all the filtered text back into a single string.
content = '\n'.join(filtered_lines)

# Timestamp as it appears inside a message header, e.g. "19/07/19, 15:32:42".
timestamp_pattern = r'\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}'

# Captured groups: (timestamp, sender, message text).
# - The sender is restricted to a single line ([^\n]). Under re.DOTALL a plain
#   `.*?` could run past the end of a header line that has no ": " and absorb
#   the next message's header into the sender name.
# - A message ends only right before the next "[timestamp]" header (or at the
#   end of the text), so a continuation line that merely starts with "[" no
#   longer truncates the message.
pattern = (
    r'\[(' + timestamp_pattern + r')\] ([^\n]*?): (.*?)'
    r'(?=\n\[' + timestamp_pattern + r'\]|$)'
)

# DOTALL lets the message-text group span several lines.
messages = re.findall(pattern, content, re.DOTALL)

print(f"Mensajes encontrados: {len(messages)}")

# Quick visual sanity check.
if messages:
    print("Ejemplo del primer mensaje:", messages[0])

Mensajes encontrados: 52740
Ejemplo del primer mensaje: ('19/07/19, 15:32:42', 'Eduardo Lomeli', 'Yeyis')


## Create the dataset

### 1. Group messages by sender

If a conversation is structured as follows:  

```
User 1: Hey!  
User 1: How are you?  
User 2: I am fine  
User 2: And you?  
User 1: Good.  
```

We want to transform it into:  

```
User 1: Hey!\nHow are you? 
User 2: I am fine\nAnd you?  
User 1: Good.  
```

In [4]:
# Merge runs of consecutive messages from the same sender into one entry,
# joining their texts with a newline. The timestamp is not used.
grouped_messages = []

for _timestamp, sender, message in messages:
    last_group = grouped_messages[-1] if grouped_messages else None

    # Start a new entry on the first message or whenever the sender changes.
    if last_group is None or last_group["sender"] != sender:
        grouped_messages.append({"sender": sender, "message": message})
        continue

    # Same sender as the previous entry: extend its text.
    last_group["message"] = last_group["message"] + "\n" + message

len(grouped_messages)

27928

### 2. Include special tokens

Each message follows this format:  
```
<|startoftext|>Sender<|separator|>Message<|endoftext|>
```

In [5]:
# Special tokens that delimit each training sequence.
start_of_text_token = "<|startoftext|>"
end_of_text_token = "<|endoftext|>"
separator_token = "<|separator|>"

# One sequence per grouped entry: start token, sender, separator, text, end token.
fine_tuning_data = [
    f"{start_of_text_token}{entry['sender']}{separator_token}{entry['message']}{end_of_text_token}"
    for entry in grouped_messages
]

len(fine_tuning_data)

27928

### 3. Save the data

In [6]:
import json

# Write the sequences as a JSON array. ensure_ascii=False keeps non-ASCII
# characters readable in the file instead of \u escapes.
save_path = "../output/fine_tuning/data/fine_tuning.json"
with open(save_path, mode='w', encoding='utf-8') as output_file:
    json.dump(
        fine_tuning_data,
        output_file,
        indent=4,
        ensure_ascii=False,
    )