### This notebook walks through each step of the data-processing pipeline. It contains considerably more documentation and code than the original script.

In [None]:
import numpy as np
import pandas as pd
import os
import time

import re
import nltk
import string

In [None]:
# Comma-separated sensitive words come from the environment. A missing or
# empty KEYS_TO_FILTER yields an empty list instead of crashing on
# None.split(); entries are stripped so "a, b" matches the word "b".
keys_to_filter = [
    key.strip()
    for key in os.getenv('KEYS_TO_FILTER', '').split(',')
    if key.strip()
]
# Destination passed to DataFrame.to_csv by preprocess_dataset
# (None when CONCATENATED_PATH is not set).
concatenated_path = os.getenv('CONCATENATED_PATH')
# CSV file loaded at the start of the notebook.
dataset_path = "Datasets/concatenated.csv"

In [None]:
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed.
dataset = pd.read_csv(dataset_path)
dataset.head(100)  # preview the first 100 rows

In [None]:
def remove_urls(text):
    """Replace each run of non-whitespace starting at 'http' with 'redacted'."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('redacted', text)
# For non-English datasets
def remove_english_words(text):
    """Delete every stand-alone run of ASCII letters (a-z, A-Z) from ``text``.

    A run only counts when it is delimited by word boundaries, so letters
    glued to digits or to non-Latin word characters are left in place.
    Surrounding whitespace and punctuation are not touched.
    """
    latin_word = re.compile(r'\b[a-zA-Z]+\b')
    return latin_word.sub('', text)
def delete_html_tags(text):
    """Strip every ``<...>`` span (shortest match, single line) from ``text``."""
    return re.compile(r'<.*?>').sub('', text)
def remove_mention(text):
    """Replace each ``@`` followed by word characters with the token '/mention'."""
    return re.compile(r"@\w+").sub("/mention", text)
def redact_email(text):
    """Replace any whitespace-free token containing an inner '@' with '/email'."""
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('/email', text)
# def remove_password(text): 
#     copy_text = text
#     pass_pattern = r'[A-Za-z0-9@#$%^&+=]{8,}'
#     text_ = re.sub(pass_pattern, '', text)
#     return text_
def remove_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a code point to None makes str.translate drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)
def sen_len_threshold(text, char_min=16, char_limit=512): # Can be used for better tuning. 
    """Keep ``text`` only when its length lies within [char_min, char_limit].

    The input is converted with ``str`` first. Returns the converted string
    when the length is in range (bounds inclusive), otherwise ``None``.
    """
    as_text = str(text)
    length = len(as_text)
    if length < char_min or length > char_limit:
        return None
    return as_text

In [None]:
def remove_emojis(data):
    """Replace each run of characters from the listed ranges with one space.

    NOTE(review): several ranges are far broader than "emoji".
    U+24C2-U+1F251 also covers CJK ideographs, Hangul and kana, and
    U+10000-U+10FFFF covers every supplementary-plane character. Confirm
    this is acceptable before reusing the function on other languages.
    """
    ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # very wide span, see docstring note
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    emoji_run = re.compile("[" + "".join(ranges) + "]+", re.UNICODE)
    return emoji_run.sub(' ', data)

In [None]:
def filter_sensitive_words(sentence, replacement='CENSORED', keys_to_filter=keys_to_filter):
    """Replace sensitive words in ``sentence`` with ``replacement``.

    The sentence is split on whitespace and each token is compared exactly
    (case-sensitive, punctuation included) against ``keys_to_filter``, so
    "word," does not match the key "word".

    Parameters:
        sentence: str, the text to filter.
        replacement: str, the token substituted for every sensitive word.
        keys_to_filter: iterable of str, the sensitive words. Defaults to the
            module-level list captured when this function is defined.

    Returns:
        str: the tokens re-joined with single spaces, so any whitespace runs
        in the input are collapsed.
    """
    sensitive = set(keys_to_filter)
    return ' '.join(
        replacement if word in sensitive else word
        for word in sentence.split()
    )

In [None]:
# Blank strings don't count as NaN, so they have to be identified explicitly.
def drop_space_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose 'Message' is an empty or whitespace-only string.

    Earlier steps of the pipeline can leave '' (everything stripped) or
    several spaces (each emoji run becomes a space), not just a single ' ',
    so every blank string is treated the same way. Non-string values such
    as NaN are kept.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    is_blank = (
        df['Message']
        .map(lambda value: isinstance(value, str) and not value.strip())
        .astype(bool)
    )
    return df[~is_blank].reset_index(drop=True)

In [None]:
def preprocess_data(text):
    """Clean a single message.

    Steps, in order:
      1. Strip HTML tags.
      2. Redact e-mails, URLs and @mentions. This must run before Latin
         words are stripped: otherwise 'https://t.me/x' is reduced to
         '://./' and the URL/e-mail/mention patterns never match.
      3. Remove stand-alone Latin-script words, leaving the placeholders
         inserted in step 2 ('redacted', '/mention', '/email') intact.
      4. Replace sensitive words and normalise whitespace.

    Non-string input (e.g. NaN from an empty CSV cell) is returned unchanged
    instead of raising inside ``re.sub``.
    """
    if not isinstance(text, str):
        return text

    text = delete_html_tags(text)
    text = redact_email(text)
    text = remove_urls(text)
    text = remove_mention(text)

    # re.split with a capturing group puts the placeholders at odd indices;
    # only the segments between them are stripped of Latin words.
    segments = re.split(r'(redacted|/mention|/email)', text)
    text = ''.join(
        segment if position % 2 else remove_english_words(segment)
        for position, segment in enumerate(segments)
    )

    text = filter_sensitive_words(text)
    return remove_whitespace(text)

In [None]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the 'Message' column, drop blank rows and save the result.

    Works on a copy, so the caller's DataFrame is left untouched (the
    original made a copy but then modified ``df`` itself).

    Side effects:
        Passes ``concatenated_path`` to ``DataFrame.to_csv`` and prints the
        elapsed time.

    Returns:
        The processed DataFrame.
    """
    start_time = time.time()

    processed = df.copy()
    processed['Message'] = processed['Message'].apply(preprocess_data)
    # Non-string cells become a single space so drop_space_rows can remove them.
    processed['Message'] = processed['Message'].apply(
        lambda x: remove_emojis(x) if isinstance(x, str) else ' '
    )
    processed = drop_space_rows(processed)
    processed.to_csv(concatenated_path, index=False)

    total_time = time.time() - start_time
    print(f"Total time for processing: {total_time:.2f} seconds")

    return processed

In [None]:
dataset_copy = dataset.copy()  # snapshot of the raw data for the before/after report
dataset = preprocess_dataset(dataset)

before_lengths = dataset_copy['Message'].str.len()
after_lengths = dataset['Message'].str.len()

b_length = len(dataset_copy)
a_length = len(dataset)
b_mean_length = np.mean(before_lengths)
a_mean_length = np.mean(after_lengths)
b_max_length = np.max(before_lengths)
a_max_length = np.max(after_lengths)
# idxmax returns an index label, so it must be looked up with .loc, not .iloc.
longest_sentence_index = after_lengths.idxmax()
longest_sentence = dataset['Message'].loc[longest_sentence_index]


print("Changes (Before/After) processing:")
print(f"Length: {b_length} -> {a_length}")
# The value is an arithmetic mean; the label previously said "Median".
print(f"Mean length: {b_mean_length:.2f} -> {a_mean_length:.2f}")
print(f"Max sentence length: {b_max_length} -> {a_max_length}")
print(f"Nan values: {dataset_copy.isna().sum().sum()} -> {dataset.isna().sum().sum()}")
print(f"Longest sentence: {len(longest_sentence)} chars: {longest_sentence}")

del dataset_copy, before_lengths, after_lengths

### Diving into Question / Answer 

In [None]:
# Create a column holding each message's offset, in whole seconds, from the
# earliest message. add_context uses it to detect gaps between conversations.

# Parse first, then sort: sorting the raw strings is only chronological when
# every value shares the same format and UTC offset.
dataset['Date'] = pd.to_datetime(dataset['Date'], format='ISO8601')
# A stable sort keeps the original order of messages with equal timestamps.
dataset = dataset.sort_values(by=['Date'], kind='stable').reset_index(drop=True)

reference_time = dataset['Date'].min()
# Seconds (not hours) since the first message, truncated to an integer.
dataset['time_diff_seconds'] = (
    (dataset['Date'] - reference_time).dt.total_seconds().astype('int64')
)
dataset

In [None]:
def separate_sentences(df: pd.DataFrame) -> pd.DataFrame:
    """Pair consecutive messages into question / answer rows.

    Rows are taken strictly in order: after optionally dropping a leading
    message sent by "me", rows 0, 2, 4, ... become questions and rows
    1, 3, 5, ... become answers. A trailing unpaired question is discarded.

    NOTE(review): this assumes the two senders alternate strictly. Several
    consecutive messages from the same sender will shift the pairing.

    Args:
        df: DataFrame with the columns 'Message', 'Date', 'Sent_by_me' and
            'time_diff_seconds', already sorted chronologically.

    Returns:
        DataFrame with the columns 'question', 'answer', 'timestamp',
        'Sent_by_me' and 'time_diff_seconds'. The last three are taken from
        the question row of each pair (previously they came from the first
        N rows of ``df`` and did not line up with the pairs).
    """
    # Make the first row a question: drop a leading message sent by "me".
    if not df.empty and df["Sent_by_me"].iloc[0]:
        df = df.iloc[1:]

    # Positional slicing, so the result does not depend on index labels.
    questions = df.iloc[0::2].reset_index(drop=True)
    answers = df.iloc[1::2].reset_index(drop=True)

    pair_count = min(len(questions), len(answers))
    questions = questions.iloc[:pair_count]
    answers = answers.iloc[:pair_count]

    return pd.concat(
        [
            questions["Message"].rename("question"),
            answers["Message"].rename("answer"),
            questions["Date"].rename("timestamp"),
            questions["Sent_by_me"],
            questions["time_diff_seconds"],
        ],
        axis=1,
    )

In [None]:
# Pair the flat message list into question/answer rows, then release the
# flat frame, which is no longer needed under that name.
separated_dataset = separate_sentences(dataset)
del dataset

In [None]:
separated_dataset  # display the paired question/answer frame

### Adding column with previous context 

In [None]:
import pandas as pd
import numpy as np

def add_context(df: pd.DataFrame, context_size: int = 20,
                gap_seconds: int = 21600) -> pd.DataFrame:
    """Add a 'context' column built from the preceding question/answer pairs.

    A row starts a new conversation when it is the first row, or when its
    'time_diff_seconds' exceeds the previous row's by more than
    ``gap_seconds``. Such rows get the marker "Time Gap". Every other row
    gets the previous pairs of the *same* conversation (at most
    ``context_size`` of them), formatted as "Q1: ... A1: ... || " pieces
    joined by a space.

    Args:
        df: DataFrame with 'question', 'answer' and 'time_diff_seconds'
            columns, ordered chronologically.
        context_size: maximum number of previous pairs to include.
        gap_seconds: silence, in seconds, that starts a new conversation.
            The default of 21600 is 6 hours.

    Returns:
        The same DataFrame object, with the 'context' column added in place.
    """
    questions = df["question"].tolist()
    answers = df["answer"].tolist()
    times = df["time_diff_seconds"].tolist()

    contexts = []
    conversation_start = 0  # position of the first row of the current conversation

    for position, current_time in enumerate(times):
        starts_conversation = (
            position == 0 or current_time - times[position - 1] > gap_seconds
        )
        if starts_conversation:
            conversation_start = position
            contexts.append("Time Gap")
            continue

        # Never reach back past the start of the current conversation, so a
        # context cannot contain messages from before the time gap.
        window_start = max(position - context_size, conversation_start)
        window = zip(questions[window_start:position], answers[window_start:position])
        pieces = [
            f"Q{number}: {question}. A{number}: {answer} || "
            for number, (question, answer) in enumerate(window, start=1)
        ]
        contexts.append(" ".join(pieces))

    # Assigning a list is positional, so this works for any index labels.
    df["context"] = contexts
    return df

In [None]:
# Attach the 'context' column, then display the result.
separated_dataset = add_context(separated_dataset)
separated_dataset

In [None]:
# Count the rows whose context is the "Time Gap" marker. The very first row
# is labelled this way too, because it has no preceding context.
total_time_gaps = (separated_dataset["context"] == "Time Gap").sum()
total_time_gaps

# Data Augmentation
and continued processing

Inspired by: 
https://github.com/jasonwei20/eda_nlp

Methods: 
1. Back-translation
2. Synonym replacement
3. Word Swap
4. Sentence shuffle

Remember that this notebook is designed to work with a Ukrainian-language dataset; not all techniques will work for English-language data.

In [None]:
# Independent copy, so later changes to `dataset` leave separated_dataset untouched.
dataset = separated_dataset.copy()

In [None]:
import random
import pandas as pd

def remove_double_commas(text: str) -> str:
    """Turn each non-overlapping ',,' pair into a single ','.

    The pass runs once from left to right, so three commas in a row still
    leave two behind.
    """
    pieces = text.split(",,")
    return ",".join(pieces)

def split_sentences(text: str) -> list:
    """Split ``text`` on commas and return the trimmed, non-empty pieces."""
    pieces = []
    for chunk in text.split(','):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

def shuffle_sentence(text: str) -> str:
    """Return ``text`` with its comma-separated parts in random order.

    Double commas are collapsed first, the text is split into trimmed,
    non-empty parts, the parts are shuffled in place with ``random.shuffle``,
    and the result is re-joined with ", ".
    """
    parts = split_sentences(remove_double_commas(text))
    random.shuffle(parts)
    return ", ".join(parts)

Click <b>here</b> for shuffle example.

<!--
text = "This is a test,, sentence, another part,, and more text. Це просто тест, такий вот тест"
shuffled_text = shuffle_sentence(text)

print(f"Before: {text}")
print(f"After: {shuffled_text}")

Outputs: 
Before: This is a test,, sentence, another part,, and more text. Це просто тест, такий вот тест
After: and more text. Це просто тест, another part, This is a test, sentence, такий вот тест
-->

In [None]:
# Helper function to download data for a language
from transformers import MarianMTModel, MarianTokenizer

def download(model_name):
    """Load the Marian tokenizer and model named ``model_name``.

    Returns:
        A ``(tokenizer, model)`` tuple, both obtained via ``from_pretrained``.
    """
    return (
        MarianTokenizer.from_pretrained(model_name),
        MarianMTModel.from_pretrained(model_name),
    )

# Download the opus-mt-uk-en model (Ukrainian -> English)
first_tokenizer, first_model = download('Helsinki-NLP/opus-mt-uk-en')
# Download the opus-mt-en-uk model (English -> Ukrainian)
second_tokenizer, second_model = download('Helsinki-NLP/opus-mt-en-uk')

In [None]:
def format_batch_texts(language_code, batch_texts):
    """Prefix every sentence with a ``>>language_code<<`` target-language tag.

    The original interpolated the whole ``batch_texts`` list into each
    entry instead of the individual sentence.

    Returns:
        list[str]: one tagged string per input sentence.
    """
    return [f">>{language_code}<< {sentence}" for sentence in batch_texts]

def translate(batch_texts, model, tokenizer, language):
    """Translate a batch of texts with the given Marian model.

    The original ran ``model.generate`` on the whole batch once per sentence
    and kept only the first result. A single call produces the same output.

    Args:
        batch_texts: list of str, the texts to translate.
        model: translation model exposing ``generate``.
        tokenizer: matching tokenizer, callable and exposing ``batch_decode``.
        language: target-language code. Kept for interface compatibility and
            currently unused: the texts are tokenized without a
            ``>>lang<<`` prefix, exactly as the original effectively did.
            NOTE(review): presumably the tag is only needed for multilingual
            checkpoints. Confirm before switching models.

    Returns:
        list[str]: translated texts, or an empty list for empty input.
    """
    if len(batch_texts) == 0:
        return []

    # Tokenize and translate the whole batch in one pass.
    encoded = tokenizer(batch_texts, return_tensors="pt", padding=True)
    generated = model.generate(**encoded)

    # Decode (tokens to text)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def back_translate(texts, from_language="uk", to_language="en"):
    """Back-translate ``texts``: source -> pivot language -> source.

    Only two directions are supported by the loaded models: when
    ``from_language`` is "en" the route is en -> uk -> en, otherwise it is
    uk -> en -> uk. ``to_language`` does not select a model; it is only
    forwarded to ``translate`` as the language code of the first hop.

    Each ``translate`` call now receives the language it translates *into*
    (the original passed the source language instead).

    Returns:
        list[str]: the texts after the round trip.
    """
    if from_language == "en":
        forward = (second_model, second_tokenizer)   # en -> uk
        backward = (first_model, first_tokenizer)    # uk -> en
    else:
        forward = (first_model, first_tokenizer)     # uk -> en
        backward = (second_model, second_tokenizer)  # en -> uk

    pivot_texts = translate(texts, *forward, to_language)
    return translate(pivot_texts, *backward, from_language)

Click <b>here</b> for back-translation example.

<!--
# Perform back-translation (Ukrainian to English to Ukrainian)
texts = ["Це перше речення яке ти маєш перекласти.",
         "Воно є дуже просте та правильно сформульованею."]
back_translated_texts = back_translate(texts)
texts = ["This is the first sentence you should translate", 
        "It is simple and correctly formulated"]
back_translated_texts_en = back_translate(texts, "en", "uk")

# Print the results
print("Original Text:", texts)
print("Back-Translated Text:", back_translated_texts)
print("-----------------")
print("Original Text:", texts)
print("Back-Translated Text:", back_translated_texts_en)

Outputs:
Original Text: ['This is the first sentence you should translate', 'It is simple and correctly formulated']
Back-Translated Text: ['Це перше речення, яке ви маєте перекласти.', 'Вона дуже проста і добре сформульована.']
-----------------
Original Text: ['This is the first sentence you should translate', 'It is simple and correctly formulated']
Back-Translated Text: ['This is the first sentence you have to translate.', "It's simple and correctly formulated."]
-->

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Synonyms using word embeddings (word2vec)

In [None]:
### Creating augmentation rows and concatenating them with dataset.

In [None]:
def augment_data(df, augmentation_factor=3):
    """Placeholder for adding new columns with augmented data.

    The notebook declares this function without a body, which is a syntax
    error. Until it is implemented it raises instead.

    Raises:
        NotImplementedError: always.
    """
    # Add new columns with augmented data
    raise NotImplementedError("augment_data is not implemented yet")

In [None]:
def pop_word(sentence):
    """Remove one randomly chosen word from ``sentence``.

    Words are whitespace-delimited and every word is equally likely to be
    removed. The original drew random weights and normalised them, which
    gives the same uniform choice overall but fails on an empty sentence.

    Returns:
        str: the remaining words joined by single spaces, or "" when the
        sentence has no words or only one.
    """
    words = sentence.split()
    if not words:
        return ""

    del words[np.random.randint(len(words))]
    return " ".join(words)

# Example: 
