### This notebook walks through each step of the data-processing pipeline. It contains considerably more documentation and code than the original script.

In [None]:
import numpy as np
import pandas as pd
import os
import time

import re
import nltk

import string

In [None]:
import ast
import urllib.request

STOPWORDS_PATH = "Datasets/stopwords_ua_set.txt"
STOPWORDS_URL = "https://raw.githubusercontent.com/skupriienko/Ukrainian-Stopwords/refs/heads/master/stopwords_ua_set.txt"

# Download the Ukrainian stop-word list once. The standard library is used
# instead of the `!wget` shell magic so the cell also runs outside IPython
# and on systems without wget.
if not os.path.exists(STOPWORDS_PATH):
    os.makedirs(os.path.dirname(STOPWORDS_PATH), exist_ok=True)
    urllib.request.urlretrieve(STOPWORDS_URL, STOPWORDS_PATH)

# Explicit UTF-8: the platform default encoding cannot be relied on for
# Cyrillic text.
with open(STOPWORDS_PATH, 'r', encoding='utf-8') as file:
    raw_stop_words = file.read().strip()

# NOTE(review): the file is presumably a single-line Python set literal
# ("{'а', 'аби', ...}") - confirm against the downloaded file. Parse it into a
# real set so membership tests are exact matches rather than substring hits
# inside one long string. Fall back to one word per line for any other layout.
try:
    parsed_stop_words = ast.literal_eval(raw_stop_words)
except (ValueError, SyntaxError):
    parsed_stop_words = None
if not isinstance(parsed_stop_words, (set, frozenset, list, tuple)):
    parsed_stop_words = raw_stop_words.splitlines()
ukrainian_stop_words = {
    str(word).strip().lower() for word in parsed_stop_words if str(word).strip()
}

# Comma-separated sensitive words; an unset variable yields an empty list
# instead of raising AttributeError on None.
keys_to_filter = [
    key.strip() for key in os.getenv('KEYS_TO_FILTER', '').split(',') if key.strip()
]
# Destination of the processed CSV; None when the variable is unset.
concatenated_path = os.getenv('CONCATENATED_PATH')
dataset_path = "Datasets/concatenated.csv"

In [None]:
# Load the concatenated chat export into a DataFrame and preview the first rows.
dataset = pd.DataFrame(pd.read_csv(dataset_path))
dataset.head(100)

In [None]:
def remove_urls(text):
    """Replace every run of non-space characters starting with 'http' by 'redacted'."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('redacted', text)
# Intended for non-English datasets.
def remove_english_words(text):
    """Delete every standalone run of ASCII letters (a-z, A-Z) from ``text``.

    Surrounding whitespace and punctuation are left in place.
    """
    latin_word = re.compile(r'\b[a-zA-Z]+\b')
    return latin_word.sub('', text)
def delete_html_tags(text):
    """Strip every '<...>' span (shortest match, single line) from ``text``."""
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)
def remove_mention(text):
    """Replace each '@' followed by word characters with the '/mention' placeholder."""
    return re.compile(r"@\w+").sub("/mention", text)
def redact_email(text):
    """Replace each whitespace-delimited token that has text on both sides of '@' with '/email'."""
    email_pattern = re.compile(r'\S+@\S+')
    return email_pattern.sub('/email', text)
# def remove_password(text): 
#     copy_text = text
#     pass_pattern = r'[A-Za-z0-9@#$%^&+=]{8,}'
#     text_ = re.sub(pass_pattern, '', text)
#     return text_
def remove_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)
def remove_punctuation(text):
    """Drop every character listed in ``string.punctuation`` (ASCII punctuation) from ``text``."""
    return "".join(char for char in text if char not in string.punctuation)
def sen_len_threshold(text, char_min=16, char_limit=512):  # Thresholds can be tuned.
    """Return ``str(text)`` when its length lies in [char_min, char_limit], otherwise None."""
    as_text = str(text)
    length = len(as_text)
    # Anything shorter than char_min or longer than char_limit is rejected.
    if length < char_min or length > char_limit:
        return None
    return as_text

In [None]:
def remove_emojis(data):
    """Replace each run of emoji / pictographic code points in ``data`` with one space."""
    # Members of the regex character class: code-point ranges and single code points.
    class_members = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002500-\U00002BEF",
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",
        u"\u3030",
    )
    emoji_runs = re.compile("[" + "".join(class_members) + "]+", re.UNICODE)
    return emoji_runs.sub(' ', data)

In [None]:
def filter_sensitive_words(sentence, replacement='CENSORED', keys_to_filter=keys_to_filter):
    """
    Replace sensitive words in a sentence with a placeholder.

    The sentence is split on whitespace and each token is compared, exactly
    and case-sensitively, against the sensitive words. A token with attached
    punctuation (e.g. "word,") therefore does not match "word".

    Parameters:
        sentence: str = text to censor
        replacement: str = token substituted for each sensitive word
        keys_to_filter: iterable of sensitive words; defaults to the
            module-level list (bound when the function is defined)

    Returns:
        str: the tokens joined by single spaces, so whitespace runs in the
        input are collapsed as a side effect.
    """
    sensitive = set(keys_to_filter)

    return ' '.join(
        replacement if word in sensitive else word for word in sentence.split()
    )

In [None]:
# Blank strings are not treated as NaN by pandas, so they are detected explicitly.
def drop_space_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose 'Message' is an empty or whitespace-only string.

    The cleaning steps can leave '' (nothing survived) or several spaces
    (several emoji runs replaced) as well as a single ' ', so every blank
    string is removed rather than only the exact value ' '. Non-string values
    such as NaN are kept.

    Returns a new DataFrame with a fresh 0..n-1 index.
    """
    is_blank = df['Message'].map(
        lambda message: isinstance(message, str) and not message.strip()
    ).astype(bool)  # astype keeps the mask boolean even for an empty frame

    return df[~is_blank].reset_index(drop=True)

In [None]:
def preprocess_data(text):
    """Clean a single message string.

    Steps, in order:
        1. redact e-mails, URLs and @mentions, and strip HTML tags;
        2. remove standalone English (ASCII-letter) words;
        3. censor sensitive words;
        4. normalise whitespace.

    The redactions must run before the English-word removal. Otherwise
    'http', domain names and e-mail local parts are deleted first and the
    URL / e-mail patterns no longer match, leaving debris such as '://.'
    in the text.
    """
    text = redact_email(text)
    text = remove_urls(text)
    text = remove_mention(text)
    text = delete_html_tags(text)

    # The placeholders inserted above are themselves ASCII words, so they are
    # swapped for private-use code points while English words are removed and
    # restored afterwards.
    shields = (("/email", "\ue000"), ("/mention", "\ue001"), ("redacted", "\ue002"))
    for placeholder, sentinel in shields:
        text = text.replace(placeholder, sentinel)
    text = remove_english_words(text)
    for placeholder, sentinel in shields:
        text = text.replace(sentinel, placeholder)

    text = filter_sensitive_words(text)
    text = remove_whitespace(text)

    return text

In [None]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the 'Message' column, drop blank rows and save the result.

    Each string message goes through ``preprocess_data`` and then
    ``remove_emojis``. Non-string values (e.g. NaN) become ' ' so that
    ``drop_space_rows`` removes them; previously they raised TypeError inside
    the regex helpers before the type check was reached.

    The input frame is left untouched; a processed copy is returned. The copy
    is written to ``concatenated_path`` when that path is configured.
    """
    def _clean(message):
        if not isinstance(message, str):
            return ' '
        return remove_emojis(preprocess_data(message))

    start_time = time.time()

    processed = df.copy()
    processed['Message'] = processed['Message'].apply(_clean)
    processed = drop_space_rows(processed)

    if concatenated_path:
        processed.to_csv(concatenated_path, index=False)
    else:
        # to_csv(None) would only build a string and save nothing.
        print("CONCATENATED_PATH is not set; skipping CSV export.")

    total_time = time.time() - start_time
    print(f"Total time for processing: {total_time:.2f} seconds")

    return processed

In [None]:
# Keep a snapshot of the raw data so it can be compared with the processed data.
dataset_copy = dataset.copy()
dataset = preprocess_dataset(dataset)

before_lengths = dataset_copy['Message'].str.len()
after_lengths = dataset['Message'].str.len()

b_length = len(dataset_copy)
a_length = len(dataset)
b_mean_length = before_lengths.mean()
a_mean_length = after_lengths.mean()
b_max_length = before_lengths.max()
a_max_length = after_lengths.max()
# idxmax returns an index label, so the row is fetched with .loc.
longest_sentence_index = after_lengths.idxmax()
longest_sentence = dataset['Message'].loc[longest_sentence_index]


print("Changes (Before/After) processing:")
print(f"Length: {b_length} -> {a_length}")
# The statistic is a mean; the label previously said "Median".
print(f"Mean length: {b_mean_length:.2f} -> {a_mean_length:.2f}")
print(f"Max sentence length: {b_max_length} -> {a_max_length}")
print(f"Nan values: {dataset_copy.isna().sum().sum()} -> {dataset.isna().sum().sum()}")
print(f"Longest sentence: {len(longest_sentence)} chars: {longest_sentence}")

del dataset_copy, before_lengths, after_lengths

### Diving into Question / Answer 

In [None]:
# Create a column with each message's offset from the earliest message,
# in whole seconds. It is used later to assign conversation context.

# Parse before sorting so the order is chronological, not lexicographic.
dataset['Date'] = pd.to_datetime(dataset['Date'], format='ISO8601')

# A stable sort keeps the original order of messages with equal timestamps.
dataset = dataset.sort_values(by=['Date'], kind='mergesort').reset_index(drop=True)

reference_time = dataset['Date'].min()
# Timedelta -> seconds, truncated to int.
dataset['time_diff_seconds'] = (
    (dataset['Date'] - reference_time).dt.total_seconds().astype(int)
)
dataset

In [None]:
def separate_sentences(df: pd.DataFrame) -> pd.DataFrame:
    """
    Pair consecutive messages into question / answer rows.

    Rows are paired strictly by position: after an optional leading row sent
    by "me" is dropped, rows 0, 2, 4, ... become questions and rows
    1, 3, 5, ... become their answers. A trailing unpaired question is
    discarded.
    NOTE(review): this assumes the two senders alternate one message at a
    time - confirm against the export format.

    Args:
        df: pd.DataFrame
        Must contain 'Message', 'Date', 'Sent_by_me' and 'time_diff_seconds'.

    Returns:
        pd.DataFrame with columns 'question', 'answer', 'timestamp',
        'Sent_by_me' and 'time_diff_seconds'. The last three are taken from
        the question row of each pair, so they stay aligned with the text.
    """
    columns = ['question', 'answer', 'timestamp', 'Sent_by_me', 'time_diff_seconds']

    if df.empty:
        return pd.DataFrame(columns=columns)

    # Make the first row a question (questions at even positions, answers at odd).
    if df["Sent_by_me"].iloc[0]:
        df = df.iloc[1:]
    df = df.reset_index(drop=True)

    questions_df = df.iloc[0::2].reset_index(drop=True)
    answers_df = df.iloc[1::2].reset_index(drop=True)

    pair_count = min(len(questions_df), len(answers_df))
    questions_df = questions_df.iloc[:pair_count]
    answers_df = answers_df.iloc[:pair_count]

    return pd.DataFrame(
        {
            "question": questions_df["Message"],
            "answer": answers_df["Message"],
            "timestamp": questions_df["Date"],
            "Sent_by_me": questions_df["Sent_by_me"],
            "time_diff_seconds": questions_df["time_diff_seconds"],
        },
        columns=columns,
    )

In [None]:
# Replace the flat message table with the question / answer layout.
dataset = separate_sentences(dataset)


In [None]:
# Preview the first 100 question / answer rows.
dataset.head(100)

### Adding column with previous context 

In [None]:
import pandas as pd
import numpy as np

def add_context(df: pd.DataFrame, context_size: int = 20, max_gap_seconds: int = 21600) -> pd.DataFrame:
    """
    Add a 'context' column holding the preceding question / answer pairs.

    For each row the context is the previous ``context_size`` rows formatted
    as "Q1: ... A1: ... || Q2: ...". When the gap to the previous row exceeds
    ``max_gap_seconds`` (default 21600 s = 6 hours) the row is treated as the
    start of a new conversation and gets the marker "Time Gap" instead. The
    very first row also gets "Time Gap".

    The second row always receives the first row as its context, even across
    a time gap (behaviour kept from the original implementation).
    NOTE(review): the context window of later rows may reach back across an
    earlier time gap - confirm whether that is intended.

    Args:
        df: frame with 'question', 'answer' and 'time_diff_seconds' columns,
            in chronological order.
        context_size: maximum number of previous rows used as context.
        max_gap_seconds: gap, in seconds, that starts a new conversation.

    Returns:
        The same DataFrame object, with the 'context' column added in place.
    """
    # Plain lists give positional access that does not depend on the index labels.
    times = df["time_diff_seconds"].tolist()
    questions = df["question"].tolist()
    answers = df["answer"].tolist()

    def _format(start: int, stop: int) -> str:
        # Numbering restarts at 1 inside every context window.
        return " ".join(
            f"Q{number}: {questions[row]}. A{number}: {answers[row]} || "
            for number, row in enumerate(range(start, stop), start=1)
        )

    context_list = []
    for index, current_time in enumerate(times):
        if index == 0 or current_time - times[index - 1] > max_gap_seconds:
            context_list.append(None)  # first row or start of a new conversation
            continue
        context_list.append(_format(max(index - context_size, 0), index))

    # The second row always gets the first row as context. The guard avoids
    # the IndexError / KeyError on frames with fewer than two rows.
    if len(context_list) > 1:
        context_list[1] = _format(0, 1)

    df["context"] = ["Time Gap" if context is None else context for context in context_list]

    return df

In [None]:
# Attach the conversation context to every question / answer row and preview it.
dataset = add_context(dataset)
dataset.head(100)

In [None]:
# Number of rows whose context is the "Time Gap" marker.
total_time_gaps = (dataset["context"] == "Time Gap").sum()
total_time_gaps

# Data Augmentation 
and further processing

Resources: 
https://arxiv.org/pdf/1901.11196

Methods: 
1. Back-translation
2. Synonym replacement
3. Word Swap
4. Sentence shuffle

Remember that this notebook is designed to work with a Ukrainian-language dataset, and not all techniques will work for English.

### Sentence shuffling

In [None]:
import random
import pandas as pd

def remove_double_commas(text: str) -> str:
    """Collapse each ',,' pair into a single comma (one left-to-right pass)."""
    pieces = text.split(",,")
    return ",".join(pieces)

def split_sentences(text: str) -> list:
    """Split ``text`` on commas and return the trimmed, non-empty pieces."""
    pieces = []
    for chunk in text.split(','):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

def shuffle_sentence(text: str) -> str:
    """
    Shuffle the comma-separated parts of ``text``.

    Double commas are collapsed, the text is split into trimmed non-empty
    parts, the parts are shuffled in place with ``random.shuffle`` and then
    re-joined with ", ".
    """
    parts = split_sentences(remove_double_commas(text))
    random.shuffle(parts)
    return ", ".join(parts)

Click <b>here</b> for shuffle example.

<!--
text = "This is a test,, sentence, another part,, and more text. Це просто тест, такий вот тест"
shuffled_text = shuffle_sentence(text)

print(f"Before: {text}")
print(f"After: {shuffled_text}")

Outputs: 
Before: This is a test,, sentence, another part,, and more text. Це просто тест, такий вот тест
After: and more text. Це просто тест, another part, This is a test, sentence, такий вот тест
-->

### Back-translation using MarianMTModel
**Not used in the project because generation is too slow**

Click <b>here</b> to see MarianMTModel


<!--
# Helper function to download data for a language
from transformers import MarianMTModel, MarianTokenizer

def download(model_name):
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

# download model for Ukrainian -> English
first_tokenizer, first_model = download('Helsinki-NLP/opus-mt-uk-en')
# download model for English -> Ukrainian
second_tokenizer, second_model = download('Helsinki-NLP/opus-mt-en-uk')

def format_batch_texts(language_code, batch_text):
    formated_batch = [f">>{language_code}<< {batch_text}"]

    return formated_batch

def translate(batch_texts, model, tokenizer, language):
    """Translate texts into a target language"""
    # Format the text as expected by the model
    batched_text = format_batch_texts(language, batch_texts)

    # Translate
    translated = [model.generate(**tokenizer(batch_texts, return_tensors="pt", padding=True)) for sentence in batched_text]

    # Decode (tokens to text)
    translated_texts = tokenizer.batch_decode(translated[0], skip_special_tokens=True)

    return translated_texts

def back_translate(texts, from_language="uk", to_language = "en"):
    """Implements back translation"""
    # Translate from source to target language
    if from_language == "uk": 
        translated = translate(texts, first_model, first_tokenizer, from_language)
        back_translated = translate(translated, second_model, second_tokenizer, to_language)[0]
        return back_translated
    
    translated = translate(texts, second_model, second_tokenizer, from_language)
    back_translated = translate(translated, first_model, first_tokenizer, to_language)[0]

    return back_translated
--->

Click <b>here</b> for back-translation example with MarianMTModel

<!--
# Perform back-translation (Ukrainian to English to Ukrainian)
texts = ["Це перше речення яке ти маєш перекласти.",
         "Воно є дуже просте та правильно сформульованею."]
back_translated_texts = back_translate(texts)
texts = ["This is the first sentence you should translate", 
        "It is simple and correctly formulated"]
back_translated_texts_en = back_translate(texts, "en", "uk")

# Print the results
print("Original Text:", texts)
print("Back-Translated Text:", back_translated_texts)
print("-----------------")
print("Original Text:", texts)
print("Back-Translated Text:", back_translated_texts_en)

Outputs:
Original Text: ['This is the first sentence you should translate', 'It is simple and correctly formulated']
Back-Translated Text: ['Це перше речення, яке ви маєте перекласти.', 'Вона дуже проста і добре сформульована.']
-----------------
Original Text: ['This is the first sentence you should translate', 'It is simple and correctly formulated']
Back-Translated Text: ['This is the first sentence you have to translate.', "It's simple and correctly formulated."]
-->

### Swap and Pop words

In [None]:
def swap_word(sentence):
    """Exchange two randomly chosen words; sentences with fewer than two words are returned unchanged."""
    tokens = sentence.split()
    if len(tokens) >= 2:
        # Two distinct positions, drawn without replacement.
        first, second = np.random.choice(len(tokens), size=2, replace=False)
        tokens[first], tokens[second] = tokens[second], tokens[first]
        return " ".join(tokens)

    return sentence

In [None]:
def filter_stopwords(sentence, stop_words=ukrainian_stop_words) -> tuple:
    """Split a sentence and separate out its non-stop-words.

    Args:
        sentence: text to split on whitespace.
        stop_words: container of lower-case stop words, tested with ``in``.
            A set or list gives exact matching. A plain string would make
            ``in`` a substring test.

    Returns:
        A ``(words, filtered)`` tuple: every word of the sentence, and only
        those words whose lower-cased form is not in ``stop_words``.
    """
    words = sentence.split()
    filtered_stopwords = [word for word in words if word.lower() not in stop_words]
    return words, filtered_stopwords

def pop_word(sentence, word_swap: bool = False):
    """Remove one randomly chosen non-stop-word from the sentence.

    The sentence is returned unchanged when it has fewer than two words
    (removing the only word would leave an empty string) or when every word
    is a stop word.

    Args:
        sentence: text to modify.
        word_swap: unused; kept for backward compatibility.

    Returns:
        The sentence with the first occurrence of the chosen word removed,
        re-joined with single spaces.
    """
    words, removable = filter_stopwords(sentence)

    if len(words) < 2 or not removable:
        return sentence

    chosen = removable[np.random.choice(len(removable))]
    words.remove(chosen)

    return " ".join(words)


Click <b>here</b> for word elimination example.

<!--
# Example: 

example = "Це є експериментальним реченням. Воно прикольне))" # Stopwords work only with ukrainian language.
example = pop_word(example)

print(example)

Outputs: 
"is a random sentence"
-->

Click <b>here</b> for word swap example.

<!--
# Example: 

example = "This is a random sentence"
example = swap_word(example)

print(example)
-->

### Back translation and Synonym Replacement

In [None]:
# Maps lower-case language names to the language codes used when checking
# and selecting translation languages.
LANG_CODES = {
    'afrikaans': 'af',
    'albanian': 'sq',
    'amharic': 'am',
    'arabic': 'ar',
    'armenian': 'hy',
    'azerbaijani': 'az',
    'basque': 'eu',
    'belarusian': 'be',
    'bengali': 'bn',
    'bosnian': 'bs',
    'bulgarian': 'bg',
    'catalan': 'ca',
    'cebuano': 'ceb',
    'chichewa': 'ny',
    'chinese (simplified)': 'zh-cn',
    'chinese (traditional)': 'zh-tw',
    'corsican': 'co',
    'croatian': 'hr',
    'czech': 'cs',
    'danish': 'da',
    'dutch': 'nl',
    'english': 'en',
    'esperanto': 'eo',
    'estonian': 'et',
    'filipino': 'tl',
    'finnish': 'fi',
    'french': 'fr',
    'frisian': 'fy',
    'galician': 'gl',
    'georgian': 'ka',
    'german': 'de',
    'greek': 'el',
    'gujarati': 'gu',
    'haitian creole': 'ht',
    'hausa': 'ha',
    'hawaiian': 'haw',
    'hebrew': 'he',
    'hindi': 'hi',
    'hmong': 'hmn',
    'hungarian': 'hu',
    'icelandic': 'is',
    'igbo': 'ig',
    'indonesian': 'id',
    'irish': 'ga',
    'italian': 'it',
    'japanese': 'ja',
    'javanese': 'jw',
    'kannada': 'kn',
    'kazakh': 'kk',
    'khmer': 'km',
    'korean': 'ko',
    'kurdish (kurmanji)': 'ku',
    'kyrgyz': 'ky',
    'lao': 'lo',
    'latin': 'la',
    'latvian': 'lv',
    'lithuanian': 'lt',
    'luxembourgish': 'lb',
    'macedonian': 'mk',
    'malagasy': 'mg',
    'malay': 'ms',
    'malayalam': 'ml',
    'maltese': 'mt',
    'maori': 'mi',
    'marathi': 'mr',
    'mongolian': 'mn',
    'myanmar (burmese)': 'my',
    'nepali': 'ne',
    'norwegian': 'no',
    'odia': 'or',
    'pashto': 'ps',
    'persian': 'fa',
    'polish': 'pl',
    'portuguese': 'pt',
    'punjabi': 'pa',
    'romanian': 'ro',
    'russian': 'ru',
    'samoan': 'sm',
    'scots gaelic': 'gd',
    'serbian': 'sr',
    'sesotho': 'st',
    'shona': 'sn',
    'sindhi': 'sd',
    'sinhala': 'si',
    'slovak': 'sk',
    'slovenian': 'sl',
    'somali': 'so',
    'spanish': 'es',
    'sundanese': 'su',
    'swahili': 'sw',
    'swedish': 'sv',
    'tajik': 'tg',
    'tamil': 'ta',
    'telugu': 'te',
    'thai': 'th',
    'turkish': 'tr',
    'ukrainian': 'uk',
    'urdu': 'ur',
    'uyghur': 'ug',
    'uzbek': 'uz',
    'vietnamese': 'vi',
    'welsh': 'cy',
    'xhosa': 'xh',
    'yiddish': 'yi',
    'yoruba': 'yo',
    'zulu': 'zu'}

# Reverse lookup: language code -> language name.
LANGUAGES = {value:key for key, value in LANG_CODES.items()}

In [None]:
from googletrans import Translator
import gensim.downloader as api


class google_translate:
    """
    Back-translation through Google Translate, with optional Word2Vec synonyms.

    Args:
        translate_from (str): Language code of the source text. Defaults to "uk".
            A code not present in LANGUAGES triggers automatic detection.
        translate_to (str): Intermediate ("tunnel") language. Defaults to "en".
        replace_synonyms (bool): Load the Word2Vec model eagerly in the
            constructor. Defaults to False.

    NOTE(review): recent googletrans releases reportedly expose an async API;
    this class calls ``translate`` / ``detect`` synchronously - confirm the
    installed version.
    """

    def __init__(self, translate_from: str = "uk", translate_to: str = "en", replace_synonyms: bool = False):
        self.native_language = translate_from
        self.tunnel_language = translate_to
        self.translator = Translator()

        # Always define the attribute so synonym_replacement can load the
        # model lazily instead of failing with AttributeError.
        self.word2vec_model = self.install_word2vec() if replace_synonyms else None

    # --- Back-translation ---

    def check_language(self, text):
        """Validate the source language, detecting it from ``text`` when it is unknown."""
        if self.native_language not in LANGUAGES:
            self.native_language = self.translator.detect(text).lang
            print(f"Incorrect language. Translating from '{self.native_language}'")

        # English text cannot be tunnelled through English; use Spanish instead.
        # (The original used `==` here, so the switch never took effect.)
        if self.native_language == "en" and self.tunnel_language == "en":
            self.tunnel_language = "es"

    def back_translate(self, text, replace_synonym: bool = True) -> str:
        """
        Performs back-translation on a given text.

        Args:
            text (str): The text to back-translate.
            replace_synonym (bool): Accepted for backward compatibility;
                currently has no effect.
                TODO: enable synonym replacement on the intermediate text once
                English stop words are available.

        Returns:
            str: The back-translated text.
        """
        self.check_language(text=text)

        translated = self.translator.translate(
            text, src=self.native_language, dest=self.tunnel_language
        ).text

        back_translated = self.translator.translate(
            translated, src=self.tunnel_language, dest=self.native_language
        ).text

        return back_translated

    # --- Synonym extension (Word2Vec) ---

    def install_word2vec(self):
        """Load the 'word2vec-google-news-300' vectors through gensim's downloader API."""
        model_name = "word2vec-google-news-300"
        print(f"Configuring {model_name}")
        word2vec_model = api.load(model_name)

        return word2vec_model

    def synonym_replacement(self, sentence):
        """Replace one random non-stop-word with its nearest Word2Vec neighbour.

        The sentence is returned unchanged when it has no non-stop-words or
        when the chosen word is not in the model's vocabulary.
        """
        if self.word2vec_model is None:
            self.word2vec_model = self.install_word2vec()

        words, candidates = filter_stopwords(sentence)
        if not candidates:
            return sentence

        target = candidates[np.random.choice(len(candidates))]
        try:
            synonym = self.word2vec_model.most_similar(target, topn=1)[0][0]  # single best match
        except KeyError:
            # Out-of-vocabulary word: leave the sentence as it is.
            return sentence

        # Look the word up in the full list; its position in the filtered
        # list is a different index.
        words[words.index(target)] = synonym

        return " ".join(words)


Click <b>here</b> to see back-translation example.


<!--
translator = google_translate(translate_from="uk", translate_to="en")
back_translated = translator.back_translate("Привіт, як воно?") # Hello, how is it going?
back_translated
--->

### Creating augmentation rows and concatenating them with dataset.

In [None]:
# Shared translator used by apply_augmentation below.
# NOTE(review): replace_synonyms=True makes the constructor load the Word2Vec
# model, yet the synonym step in apply_augmentation is commented out - confirm
# the model is actually needed.
translator = google_translate(translate_from="uk", translate_to="en", replace_synonyms=True)
def apply_augmentation(sentence) -> str:
    """Return an augmented variant of ``sentence``.

    Applies, in order: back-translation, shuffling of comma-separated parts,
    a random two-word swap and removal of one random non-stop-word. Synonym
    replacement is not applied at the moment.
    """
    sentence = translator.back_translate(sentence)
    sentence = shuffle_sentence(sentence)
    sentence = swap_word(sentence)
    sentence = pop_word(sentence)

    return sentence

def speed_test(df, size=100):
    """Time ``apply_augmentation`` on the first ``size`` questions and print the result.

    The augmented values are discarded and ``df`` is not modified. Assigning
    the slice back to the column (as before) turned every row after the
    sample into NaN, and the label-based ``.loc[:size]`` slice covered one
    row more than requested.
    """
    sample = df["question"].iloc[:size]

    start_time = time.time()
    sample.apply(apply_augmentation)
    print("--- %s seconds ---" % (time.time() - start_time))
    return

def augment_data(df: pd.DataFrame, augmentation_factor: int = 5) -> pd.DataFrame:
    """
    Augments the data by adding augmented questions.

    The original rows are kept. For every augmentation round a copy of ``df``
    with freshly augmented questions is appended. Exact duplicate rows are
    dropped and the result is ordered by 'timestamp' with a contiguous index.

    Parameters:
        df: pd.DataFrame with "question" and "timestamp" columns
        augmentation_factor: int = 5; how many times to augment each question.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    frames = [df]
    for _ in range(augmentation_factor):
        augmented_dataset = df.copy()
        augmented_dataset["question"] = augmented_dataset["question"].apply(apply_augmentation)
        frames.append(augmented_dataset)

    # Concatenate once instead of growing the frame inside the loop.
    df_augmented = pd.concat(frames, axis=0, ignore_index=True)
    df_augmented = df_augmented.drop_duplicates()

    # The stable sort keeps each original row ahead of its augmented variants.
    # The index is reset last so that it has no gaps after de-duplication.
    df_augmented = df_augmented.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)

    return df_augmented


In [None]:
# Extend the dataset with augmented variants of every question (default factor).
dataset = augment_data(dataset)

In [None]:
# Preview the first rows of the augmented dataset.
dataset.head()