# Negatives Generation 
1. Import necessary libraries
2. Define the language model for translations
3. Clean Data
4. Generate back translations 
5. Process the data with the same preprocessing steps used for the positive examples

In [None]:
import pandas as pd
import json 
import os
import torch
import regex as re
from tqdm.notebook import tqdm
import sys
from transformers import T5ForConditionalGeneration, T5Tokenizer
import spacy


In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = 'jbochi/madlad400-3b-mt'
# NOTE(review): device_map="auto" presumably delegates weight placement to the
# library across the available devices - confirm against the transformers docs.
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
# Load the Green training split: the file is read line by line and every
# non-empty line is parsed as one JSON object.
green = []
file_path = './ContrastSkill/data/Supervised/Green'
# An explicit encoding keeps decoding independent of the platform default.
with open(os.path.join(file_path, 'train.json'), 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line), which json.loads
        # would reject with a JSONDecodeError.
        if line:
            green.append(json.loads(line))

In [None]:
# For extracting negative examples
def extract_negative_examples(data):
    """
    Return the sentences of the examples in `data` that have no 'B' tag.

    An example is kept when 'B' is not found in its 'tags_skill' entry; its
    'tokens' are then joined with single spaces. Input order is preserved.
    """
    return [
        ' '.join(record['tokens'])
        for record in data
        if 'B' not in record['tags_skill']
    ]

# Whitespace tokenization helper
def tokenize_sentences(sentences):
    """
    Split every sentence in `sentences` on whitespace.

    Returns a list holding one token list per input sentence, in input order.
    """
    token_lists = []
    for text in sentences:
        token_lists.append(text.split())
    return token_lists

In [None]:
# Keep only the Green sentences whose 'tags_skill' contains no 'B' tag.
negatives = extract_negative_examples(green)

In [None]:
# Wrap the sentence list in a DataFrame with a single 'Sentences' column.
negatives = pd.DataFrame(negatives, columns=['Sentences'])

In [None]:
# Remove leftover HTML entity fragments (nbsp / middot / ndash)
def remove_html(text):
    """
    Replace HTML entity remnants in `text` with spaces and squeeze whitespace.

    The substitutions run in a fixed order: '&nbsp;' first, then the bare
    'nbsp;', 'middot;' and 'ndash;' forms, each followed by a variant that
    allows whitespace before the semicolon. Finally, every run of whitespace
    is collapsed into a single space.
    """
    entity_patterns = (
        r'&nbsp;',
        r'&nbsp\s+;',
        r'nbsp;',
        r'nbsp\s+;',
        r'middot;',
        r'middot\s+;',
        r'ndash;',
        r'ndash\s+;',
    )
    for pattern in entity_patterns:
        text = re.sub(pattern, ' ', text)
    # Collapse whitespace runs, including the spaces inserted above.
    return re.sub(r'\s+', ' ', text)

In [None]:
# Build the pool of negative sentences: drop exact duplicates, clean the
# entity remnants, then keep sentences of 6 to 35 whitespace-separated tokens.
negatives = negatives.drop_duplicates()
negatives['Sentences'] = negatives['Sentences'].map(remove_html)
negatives['seq_len'] = negatives['Sentences'].map(lambda sentence: len(sentence.split()))
negatives = negatives.loc[negatives['seq_len'].between(6, 35)]

In [None]:
# Materialise the cleaned sentences as a plain Python list for easier processing
negative_sentences = list(negatives['Sentences'])

In [None]:
def generate_translations(sentence_list, language=None):
    """
    Back-translate sentences through a pivot language.

    Every sentence is first translated into `language` and the result is then
    translated back with the '<2en> ' prefix, using the module-level `model`
    and `tokenizer`.

    Args:
        sentence_list: Sequence of sentences to back-translate.
        language: Pivot language; one of 'french', 'portuguese' or 'chinese'.

    Returns:
        A list with one back-translated string per input sentence, in input
        order.

    Raises:
        ValueError: If `language` is not one of the supported pivot languages
            (including the default ``None``). Previously this surfaced as an
            UnboundLocalError on the first sentence.
    """
    # Target-language tags prepended to the text before it is tokenized.
    prefixes = {
        'french': '<2fr> ',
        'portuguese': '<2pt> ',
        'chinese': '<2zh> ',
    }
    if language not in prefixes:
        supported = ', '.join(sorted(prefixes))
        raise ValueError(
            f"Unsupported language {language!r}; expected one of: {supported}"
        )
    # The prefix does not depend on the sentence, so resolve it once.
    prefix = prefixes[language]

    def translate(text):
        # Inputs are truncated to 40 tokens and generation is capped at 40 new
        # tokens. The generation settings belong to `generate` only; `decode`
        # just turns the produced ids back into text.
        input_ids = tokenizer(
            text, return_tensors="pt", max_length=40, truncation=True
        ).input_ids.to(model.device)
        outputs = model.generate(
            input_ids=input_ids, max_new_tokens=40, repetition_penalty=1.5
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    back_translated = []
    for original_sentence in tqdm(sentence_list, total=len(sentence_list)):
        translated = translate(prefix + original_sentence)
        # Back translation to English
        back_translated.append(translate('<2en> ' + translated))
    return back_translated

In [None]:
#Generate negatives by back-translating every sentence with 'chinese' as the pivot language
back_neg_ch = generate_translations(negative_sentences, language='chinese')

In [None]:
# Apply similar pre-processing as for the positive samples (strip commas) and
# drop duplicates. dict.fromkeys keeps the first occurrence of each sentence
# in its original position, so the result is the same on every run; the
# ordering of list(set(...)) can change between interpreter runs.
back_neg_ch = list(dict.fromkeys(sentence.replace(',', '') for sentence in back_neg_ch))

In [None]:
#Transform into a pandas DataFrame with a single 'Sentence' column to match the final structure
total_neg_no_dup_df = pd.DataFrame(back_neg_ch, columns=['Sentence'])

In [None]:
#Tokenize to match the desired data structure 

# Load the 'en_core_web_lg' spaCy pipeline without its 'ner', 'parser' and 'textcat' components.
# NOTE(review): the trailing comment mentions lemmatization, but spacy_tokenize_text
# below only reads token.text - confirm whether lemmas were ever intended.
lemmatizer_nlp = spacy.load('en_core_web_lg', exclude=['ner', 'parser', 'textcat'])  # For Lemmatization only

def spacy_tokenize_text(text):
    """
    Split `text` into spaCy tokens and return their surface strings.

    Only `token.text` is read, so the text is passed through `make_doc`
    (tokenizer only) rather than the full pipeline; running the remaining
    components on every sentence would not change the token strings.

    Args:
        text: The sentence to tokenize.

    Returns:
        List of token strings in document order.
    """
    doc = lemmatizer_nlp.make_doc(text)
    return [token.text for token in doc]

def tokenize_relevant(df):
    """
    Add a 'Sentence_tokens' column holding the tokens of each sentence.

    The 'Sentence' column is cast to `str` first and written back, then each
    value is tokenized with `spacy_tokenize_text`. `df` is modified in place
    and also returned.
    """
    sentences = df['Sentence'].astype(str)
    df['Sentence'] = sentences
    df['Sentence_tokens'] = sentences.apply(spacy_tokenize_text)
    return df

In [None]:
# Attach an all-ones mask to every token list and export the rows as records
def add_mask(df):
    """
    Build the list of {'Tokens', 'relevant_mask'} records from `df`.

    For each row, 'relevant_mask' is a list of 1s with the same length as
    that row's 'Sentence_tokens', and 'Tokens' is a copy of the
    'Sentence_tokens' column. Both columns are added to `df` itself; the
    'Sentence' and 'Sentence_tokens' columns are dropped only from the
    returned data.

    Returns:
        A list with one dict per row, with the keys 'Tokens' and
        'relevant_mask'.
    """
    df['relevant_mask'] = [[1] * len(tokens) for tokens in df['Sentence_tokens']]
    df['Tokens'] = df['Sentence_tokens']

    trimmed = df.drop(columns=['Sentence', 'Sentence_tokens'])
    ordered = trimmed[['Tokens', 'relevant_mask']]
    return ordered.to_dict(orient='records')

In [None]:
#Apply the functions: tokenize each sentence, then attach the all-ones masks and convert the rows to a list of dicts
total_neg_no_dup_df_tokenized = tokenize_relevant(total_neg_no_dup_df)
negatives_total = add_mask(total_neg_no_dup_df_tokenized)

In [None]:
#Export the list of dicts as indented JSON
with open('./ContrastSkill/Data/Pre-training/negatives_example.json', 'w') as file:
    file.write(json.dumps(negatives_total, indent=4))