### This notebook is for analyzing the individual steps of the data-processing pipeline. It contains much more documentation and code than the original script.

In [None]:
import numpy as np
import pandas as pd
import os
import time

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Comma-separated list of sensitive words, read from the environment.
# A missing variable yields an empty list instead of crashing on None.split,
# and blank / space-padded entries are dropped because they could never match
# a whitespace-split token anyway.
keys_to_filter = [
    key.strip()
    for key in os.getenv('KEYS_TO_FILTER', '').split(',')
    if key.strip()
]
# Destination of the processed CSV.
# NOTE(review): this is None when CONCATENATED_PATH is unset - confirm whether
# a default path or an explicit error is wanted in that case.
concatenated_path = os.getenv('CONCATENATED_PATH')
# Location of the raw, concatenated input dataset.
dataset_path = "Datasets/concatenated.csv"

In [None]:
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed.
dataset = pd.read_csv(dataset_path)
# Preview the first 100 rows in the notebook output.
dataset.head(100)

In [None]:
def remove_urls(text):
    """Replace URL-like runs in *text* with the literal word ``redacted``.

    A run is ``http`` followed by at least one non-whitespace character and
    extends up to the next whitespace.
    """
    url_like = re.compile(r'http\S+')
    return url_like.sub('redacted', text)
def remove_english_words(text):
    """Delete every standalone run of ASCII letters from *text*.

    Only runs delimited by word boundaries are removed, so letters glued to
    digits or underscores (e.g. ``abc123``) are left untouched. Surrounding
    whitespace is not collapsed.
    """
    ascii_word = re.compile(r'\b[a-zA-Z]+\b')
    return ascii_word.sub('', text)
def delete_html_tags(text):
    """Remove every ``<...>`` span from *text*.

    Matching is non-greedy and does not cross line breaks, so a ``<`` whose
    closing ``>`` is on another line is left in place.
    """
    tag = re.compile(r'<.*?>')
    return tag.sub('', text)
def remove_mention(text):
    """Replace each ``@handle`` in *text* with the placeholder ``/mention``.

    A handle is ``@`` followed by one or more word characters; a bare ``@``
    is left alone.
    """
    return re.sub(r"@\w+", "/mention", text)
def redact_email(text):
    """Replace e-mail-like tokens in *text* with the placeholder ``/email``.

    Any whitespace-free run that has at least one character on each side of
    an ``@`` is treated as an address.
    """
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('/email', text)
def remove_password(text):
    """Delete password-like tokens from *text*.

    A token is any run of eight or more characters drawn from ASCII letters,
    digits and ``@#$%^&+=``. Note that this also removes ordinary long ASCII
    words and long numbers, not only real passwords.

    Args:
        text: the string to clean.

    Returns:
        The string with every matching run removed (surrounding whitespace
        is left as is).
    """
    pass_pattern = r'[A-Za-z0-9@#$%^&+=]{8,}'
    return re.sub(pass_pattern, '', text)
def remove_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)
def remove_punctuation(text):
    """Strip every ASCII punctuation character (``string.punctuation``)."""
    # Mapping each punctuation character to None makes translate() drop it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

In [None]:
def remove_emojis(data):
    """Replace each run of emoji / symbol characters in *data* with one space.

    The character class is deliberately broad: besides emoji it also covers
    every code point from U+24C2 up to U+1F251 (which includes CJK, Hangul
    and the private-use area) and all supplementary planes.
    """
    symbol_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # very wide range, see docstring
        u"\U0001f926-\U0001f937"  # gesture emoji
        u"\U00010000-\U0010ffff"  # all supplementary planes
        u"\u2640-\u2642"          # gender signs
        u"\u2600-\u2B55"          # miscellaneous symbols
        u"\u200d"                 # zero-width joiner
        u"\u23cf"                 # eject symbol
        u"\u23e9"                 # fast-forward symbol
        u"\u231a"                 # watch
        u"\ufe0f"                 # variation selector-16
        u"\u3030"                 # wavy dash
    )
    emoji_pattern = re.compile("[" + symbol_ranges + "]+", re.UNICODE)
    return emoji_pattern.sub(' ', data)

In [None]:
def filter_sensitive_words(sentence, replacement='CENSORED', keys=None):
    """Replace sensitive words in *sentence* with *replacement*.

    The sentence is split on whitespace and each token is compared exactly
    (case-sensitive, punctuation included) against the sensitive-word set.
    Because the result is re-joined with single spaces, whitespace runs in
    the input are collapsed.

    Args:
        sentence: str = text to filter.
        replacement: str = word substituted for each sensitive word.
        keys: optional iterable of sensitive words; when omitted, the
            module-level ``keys_to_filter`` list is used.

    Returns:
        The filtered sentence.
    """
    sensitive = set(keys_to_filter if keys is None else keys)
    return ' '.join(
        replacement if word in sensitive else word
        for word in sentence.split()
    )

In [None]:
def preprocess_data(text):
    """Clean a single message and return the cleaned string.

    Steps, in order:
      1. strip HTML tags;
      2. redact e-mails, URLs and mentions while they are still intact;
      3. remove standalone ASCII words and password-like tokens;
      4. censor sensitive words and normalise whitespace.

    The structured redactions must run before the ASCII-word removal:
    otherwise ``http``, domain names and handles are stripped first and the
    URL / e-mail / mention patterns no longer match, leaving residue such as
    ``://.`` or ``@.`` in the output.

    Args:
        text: the raw message (must be a ``str``).

    Returns:
        The cleaned message.
    """
    # Placeholders emitted by remove_urls, redact_email and remove_mention.
    # They are ASCII words themselves, so they are temporarily swapped for
    # private-use code points while step 3 runs.
    placeholders = ('redacted', '/email', '/mention')
    shields = [chr(0xE000 + offset) for offset in range(len(placeholders))]

    # Drop any shield characters already present so they cannot be mistaken
    # for a placeholder later on.
    text = text.translate({ord(shield): None for shield in shields})

    text = delete_html_tags(text)
    # E-mails go first so that "user@host" is not turned into a mention.
    text = redact_email(text)
    text = remove_urls(text)
    text = remove_mention(text)

    for placeholder, shield in zip(placeholders, shields):
        text = text.replace(placeholder, shield)

    text = remove_english_words(text)
    text = remove_password(text)

    for placeholder, shield in zip(placeholders, shields):
        text = text.replace(shield, placeholder)

    text = filter_sensitive_words(text)
    text = remove_whitespace(text)

    return text

In [None]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the ``Message`` column of *df* and write the result to CSV.

    Every string message is passed through ``preprocess_data`` and then
    ``remove_emojis``. Non-string cells (e.g. NaN) become a single space
    instead of raising inside the regex helpers.

    The frame is modified in place and also returned. The CSV is written to
    the module-level ``concatenated_path``, and the elapsed time (including
    the write) is printed.

    Args:
        df: frame containing a ``Message`` column.

    Returns:
        The same frame with its ``Message`` column cleaned.
    """
    def _clean(message):
        # Guard first: the regex helpers only accept strings.
        if not isinstance(message, str):
            return ' '
        return remove_emojis(preprocess_data(message))

    start_time = time.time()
    df['Message'] = df['Message'].apply(_clean)
    df.to_csv(concatenated_path, index=False)
    total_time = time.time() - start_time

    print(f"Total time for processing: {total_time:.2f} seconds")

    return df

In [None]:
# Snapshot taken before processing: preprocess_dataset modifies `dataset` in place.
dataset_copy = dataset.copy()
dataset = preprocess_dataset(dataset)

b_length = len(dataset_copy)
a_length = len(dataset)
b_mid_length = np.mean(dataset_copy['Message'].str.len())
a_mid_length = np.mean(dataset['Message'].str.len())
b_max_length = np.max(dataset_copy['Message'].str.len())
a_max_length = np.max(dataset['Message'].str.len())
# key=len picks the longest message; a plain max() would compare the strings
# lexicographically instead.
longest_sentence = max(dataset['Message'], key=len)


print("Changes (Before/After) processing:")
print(f"Length: {b_length} -> {a_length}")
# The values are arithmetic means, so they are labelled as such.
print(f"Mean length: {b_mid_length:.2f} -> {a_mid_length:.2f}")
print(f"Max sentence length: {b_max_length} -> {a_max_length}")
print(f"Nan values: {dataset_copy.isna().sum().sum()} -> {dataset.isna().sum().sum()}")
print(f"Longest sentence: {len(longest_sentence)} chars: {longest_sentence}")

# The snapshot is only needed for the comparison above.
del dataset_copy

### Dividing into Question / Answer 

In [None]:
def separate_sentences(df: pd.DataFrame) -> pd.DataFrame:
    """
    Pair consecutive messages into question / answer rows.

    Rows are paired purely by position: after optionally dropping a leading
    message sent by the user, even positions become questions and odd
    positions become answers. A trailing unpaired message is discarded.
    The messages are assumed to alternate strictly between the two sides.

    Args:
        df: pd.DataFrame
        Must contain the columns ``Message``, ``Sent_by_me`` and ``Date``.
        If the first row has a truthy ``Sent_by_me`` it is dropped so that
        the first question is not the user's own message.

    Returns:
        pd.DataFrame
        Columns ``question``, ``answer`` and ``timestamp``, where
        ``timestamp`` is the ``Date`` of the question row. Empty (with the
        same columns) when *df* has no complete pair.
    """
    columns = ['question', 'answer', "timestamp"]

    if df.empty:
        return pd.DataFrame(columns=columns)

    if df["Sent_by_me"].iloc[0]:
        df = df.iloc[1:]

    # Positional slicing works whatever index the incoming frame carries.
    questions_df = df.iloc[::2].reset_index(drop=True)
    answers_df = df.iloc[1::2].reset_index(drop=True)

    pair_count = min(len(questions_df), len(answers_df))

    return pd.DataFrame(
        {
            "question": questions_df["Message"].iloc[:pair_count],
            "answer": answers_df["Message"].iloc[:pair_count],
            # Take the date from the question row so it stays aligned with its pair.
            "timestamp": questions_df["Date"].iloc[:pair_count],
        },
        columns=columns,
    )

In [None]:
# Split the processed messages into question / answer pairs.
separated_dataset = separate_sentences(dataset)
# Preview the first 100 pairs in the notebook output.
separated_dataset.head(100)

# The message-level frame is not used by the remaining cells shown here.
del dataset

### Adding a column with the previous context 

In [None]:
def add_context(df: pd.DataFrame, context_size: int = 20) -> pd.DataFrame:
    """
    Add a ``context`` column holding the preceding question / answer pairs.

    For each row, up to *context_size* immediately preceding rows are
    rendered as ``"Q1: <question>. A1: <answer>::"`` fragments (numbered
    from 1 within the window) and joined with single spaces. Rows with no
    preceding pair get the text ``"Missing Context"``.

    Args:
        df: frame with ``question`` and ``answer`` columns. It is modified
            in place.
        context_size: maximum number of previous rows to include.

    Returns:
        The same frame with the ``context`` column added.
    """
    # Plain lists give positional windows whatever index the frame carries
    # and avoid slicing the DataFrame once per row.
    questions = df["question"].tolist()
    answers = df["answer"].tolist()

    context_list = []
    for position in range(len(questions)):
        start = max(position - context_size, 0)
        window = zip(questions[start:position], answers[start:position])
        message = [
            f"Q{number}: {question}. A{number}: {answer}::"
            for number, (question, answer) in enumerate(window, start=1)
        ]
        # An empty window marks a row without any earlier pair.
        context_list.append(" ".join(message) or "Missing Context")

    df["context"] = context_list

    return df

In [None]:
# Attach the previous-context column using add_context's default window (context_size=20).
separated_dataset = add_context(separated_dataset)

In [None]:
# Bare expression: shows the frame as this notebook cell's output.
separated_dataset

# Data Augmentation 
Steps: 
1. On the context | response dataset, process only the responses and collect the words used and how often each was used. 
2. Look up dictionaries that contain synonyms for the most-used words.
3. Go through all responses and design an algorithm that can augment them efficiently. 