# Importar librerías necesarias

Importamos las librerías `pandas`, `nltk`, `re` y otras herramientas que utilizaremos para la manipulación y el procesamiento del texto.

In [2]:
import pandas as pd

# Load the raw CSV; the first column of the file is used as the row index.
df = pd.read_csv('../../data/codeforce_raw_data.csv', index_col=0, encoding='utf8')
# Preview the first rows to check the columns were parsed as expected.
df.head()

Unnamed: 0,time_limit,memory_limit,input_file,output_file,description,tags,points,rating
1846/F,1 second,256 megabytes,standard,standard,This is an interactive task.Rudolph is a scien...,"['constructive algorithms', 'implementation', ...",,
1847/D,2 seconds,256 megabytes,standard,standard,Josuke is tired of his peaceful life in Morioh...,"['data structures', 'dsu', 'greedy', 'implemen...",2000.0,
1846/E2,2 seconds,256 megabytes,standard,standard,This is the hard version of the problem. The o...,"['binary search', 'brute force', 'data structu...",,
1846/E1,2 seconds,256 megabytes,standard,standard,This is a simple version of the problem. The o...,"['brute force', 'implementation', 'math']",,
1846/C,1 second,256 megabytes,standard,standard,Rudolf has registered for a programming compet...,"['constructive algorithms', 'greedy', 'impleme...",,


In [3]:
import re

# This is an interactive task.Rudolph is a scientist who studies alien life forms.
# -> This is an interactive task. Rudolph is a scientist who studies alien life forms.
def processing_dot_capitalize(sentences):
    """Insert a missing space between a period and an uppercase letter.

    Example: ``"task.Rudolph"`` becomes ``"task. Rudolph"``.

    Args:
        sentences: Text to repair. An empty string is returned unchanged.

    Returns:
        The text with one space inserted wherever a ``'.'`` is immediately
        followed by an uppercase character; every other character is copied
        as-is.
    """
    pieces = []
    # Start with an empty "previous character" so empty input does not need
    # indexing and the first character can never trigger an insertion.
    prev = ''
    for ch in sentences:
        if prev == '.' and ch.isupper():
            pieces.append(' ')
        pieces.append(ch)
        prev = ch
    # Joining once avoids repeated string concatenation inside the loop.
    return ''.join(pieces)

# ( $$$ 1 \\le t_{i, j} \\le 10^6 $$$ )
# -> ( $$$ 1 \\le t_{i, j} \\le 1000000 $$$ )
def replace_exponent_notation(text):
    """Expand powers of ten written as ``10^k`` or ``10^{k}`` into plain digits.

    Example: ``"10^6"`` becomes ``"1000000"`` and ``"10^{9}"`` becomes
    ``"1000000000"``.

    Args:
        text: Text that may contain power-of-ten expressions.

    Returns:
        The text with every matched expression replaced by ``1`` followed by
        ``k`` zeros. A ``10`` that is the tail of a longer number (for example
        ``110^5``) is left untouched, because that expression is not a power
        of ten.
    """
    def replace_exponent(match):
        # Exactly one of the two groups participates: braced or bare exponent.
        exponent = int(match.group(1) or match.group(2))
        # Build the digits directly instead of str(10 ** exponent): Python
        # 3.11+ limits int-to-str conversion to 4300 digits by default.
        return '1' + '0' * exponent

    # (?<!\d) keeps the base from matching inside a longer number.
    return re.sub(r'(?<!\d)10\^(?:\{(\d+)\}|(\d+))', replace_exponent, text)

# There is a room in front of Rudolph with $$$n$$$ different objects scattered around.
# -> There is a room in front of Rudolph with $$$ n $$$ different objects scattered around.
def add_spacing_between_dollar_signs(text):
    """Separate ``$$$`` delimiters from adjacent non-whitespace characters.

    A single space is inserted at every position that is either directly
    after ``$$$`` and followed by a non-whitespace character, or directly
    before ``$$$`` and preceded by a non-whitespace character.

    Args:
        text: Text containing ``$$$``-delimited fragments.

    Returns:
        The text with the spaces inserted.
    """
    # Both alternatives are zero-width, so nothing is consumed or removed.
    return re.sub(r'(?<=\$\$\$)(?=\S)|(?<=\S)(?=\$\$\$)', ' ', text)

# This is an interactive task. 
# -> this is an interactive task.
def convert_to_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

def is_number(string: str):
    """Return True if ``string`` is a non-empty run of ASCII digits ``0``-``9``.

    ``str.isdigit`` on its own also accepts characters such as the
    superscript ``'²'``, which ``int()`` cannot parse; the ASCII check keeps
    the result safe to use for integer arithmetic.

    Args:
        string: Token to test.

    Returns:
        True for strings like ``"42"``; False for empty strings, signs,
        decimals and non-ASCII digit characters.
    """
    return string.isascii() and string.isdigit()

def list_to_string(lst):
    """Concatenate the strings in ``lst`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(lst)

# 2 \\cdot 100000 
# -> 200000
def calculate_multiplication(text: str):
    """Evaluate ``<int> \\cdot <int>`` token triples found in ``text``.

    The text is split on whitespace and scanned left to right. Whenever a
    token made of ASCII digits is followed by the token ``\\cdot`` and another
    ASCII-digit token, the three tokens are replaced by their product.
    Scanning resumes after the consumed triple, so ``2 \\cdot 3 \\cdot 4``
    becomes ``6 \\cdot 4``.

    Args:
        text: Text with whitespace-separated tokens.

    Returns:
        The tokens joined by single spaces, with matched triples replaced by
        their product. Runs of whitespace in the input collapse to one space.
    """
    def _is_plain_integer(token):
        # ASCII digits only: str.isdigit alone also accepts characters such
        # as '²' that int() cannot parse.
        return token.isascii() and token.isdigit()

    tokens = text.split()
    result = []
    i = 0
    while i < len(tokens):
        if (i + 2 < len(tokens)
                and tokens[i + 1] == r'\cdot'
                and _is_plain_integer(tokens[i])
                and _is_plain_integer(tokens[i + 2])):
            # int() instead of eval(): no code execution on text input, and
            # operands with leading zeros ("007") are handled.
            result.append(str(int(tokens[i]) * int(tokens[i + 2])))
            i += 3
        else:
            result.append(tokens[i])
            i += 1

    return ' '.join(result)

In [4]:
def preprocessing(text):
    """Run the character-level clean-up steps on ``text`` in a fixed order.

    The steps are applied one after another: missing space after a period,
    lowercasing, spacing around ``$$$``, expansion of ``10^k``, and
    evaluation of ``a \\cdot b`` products.

    Args:
        text: Raw description text.

    Returns:
        The text after all steps have been applied.
    """
    pipeline = (
        processing_dot_capitalize,
        convert_to_lowercase,
        add_spacing_between_dollar_signs,
        replace_exponent_notation,
        calculate_multiplication,
    )
    for step in pipeline:
        text = step(text)
    return text

In [5]:
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: 'punkt' for
# sent_tokenize / word_tokenize, 'stopwords' for the stopword list, and
# 'wordnet' for WordNetLemmatizer (without it, lemmatizing raises LookupError
# on a machine where the corpus has not been installed yet).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/alex0125/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alex0125/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def split_sentences(sentences):
    """Split a text into a list of sentences using NLTK's ``sent_tokenize``."""
    sentence_list = sent_tokenize(sentences)
    return sentence_list

def split_words(sentence):
    """Split one sentence into a list of tokens using NLTK's ``word_tokenize``."""
    token_list = word_tokenize(sentence)
    return token_list

In [7]:
def lemmatization(tokens):
    """Lemmatize every token as a verb.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list with each token passed through
        ``WordNetLemmatizer.lemmatize`` using part-of-speech ``'v'``.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return lemmas

def remove_stopwords(tokens):
    """Remove English stopwords that are longer than one character.

    Single-character stopwords are excluded from the filter, so tokens such
    as ``'a'`` stay in the output (presumably because single letters can be
    meaningful names in the descriptions; confirm with the author).

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list with the tokens that are not multi-character stopwords,
        in their original order.
    """
    # A set gives constant-time membership tests; the list from NLTK would be
    # scanned in full for every token.
    stopwords = {
        item for item in nltk.corpus.stopwords.words('english') if len(item) > 1
    }
    return [word for word in tokens if word not in stopwords]

In [8]:
def get_preprocessing_sentence(tokens):
    """Filter and lemmatize the tokens of one sentence and join them into a string.

    Args:
        tokens: Tokens of a single sentence.

    Returns:
        The remaining tokens joined by spaces, with every ``'$ $ $'``
        sequence in the joined string replaced by ``'$$$'``.
    """
    lemmas = lemmatization(remove_stopwords(tokens))
    joined = ' '.join(lemmas)
    return joined.replace('$ $ $', '$$$')

In [9]:
def get_preprocessed_sentence(sentences):
    """Preprocess a description and return its cleaned sentences.

    The text is cleaned with ``preprocessing``, split into sentences, and
    each sentence is tokenized, filtered and lemmatized. A sentence-final
    period token is removed and ``' , '`` is replaced by a single space.

    Args:
        sentences: Raw description text.

    Returns:
        A list with one cleaned string per detected sentence. A sentence whose
        tokens are all removed yields an empty string.
    """
    new_sentences = []

    for sentence in split_sentences(preprocessing(sentences)):
        processed = get_preprocessing_sentence(split_words(sentence))
        if processed == '.':
            # The sentence was reduced to its final period only.
            processed = ''
        elif processed.endswith(' .'):
            # Drop the standalone period token and the space before it.
            # Checking for ' .' (not just '.') avoids cutting into a last
            # token that merely ends with a period, and is safe on ''.
            processed = processed[:-2]
        new_sentences.append(processed.replace(' , ', ' '))
    return new_sentences

In [10]:
# Discard rows without a description; the text functions below expect a string.
df = df.dropna(subset=['description'])

In [11]:
from tqdm import tqdm

# Preprocess every description; each entry appended here is the list of
# cleaned sentences returned by get_preprocessed_sentence. tqdm only adds a
# progress bar around the iteration.
new_description = []
for description in tqdm(df['description'].values):
    new_description.append(get_preprocessed_sentence(description))

100%|██████████| 7968/7968 [00:30<00:00, 263.61it/s]


In [12]:
# Overwrite the raw text with the per-row lists of cleaned sentences.
df['description'] = new_description

In [13]:
# Join each row's list of cleaned sentences into one space-separated string.
df['description'] = df['description'].apply(list_to_string)
df.head()

Unnamed: 0,time_limit,memory_limit,input_file,output_file,description,tags,points,rating
1846/F,1 second,256 megabytes,standard,standard,interactive task rudolph a scientist study ali...,"['constructive algorithms', 'implementation', ...",,
1847/D,2 seconds,256 megabytes,standard,standard,josuke tire peaceful life morioh follow nephew...,"['data structures', 'dsu', 'greedy', 'implemen...",2000.0,
1846/E2,2 seconds,256 megabytes,standard,standard,hard version problem difference version $$$ n ...,"['binary search', 'brute force', 'data structu...",,
1846/E1,2 seconds,256 megabytes,standard,standard,a simple version problem difference version $$...,"['brute force', 'implementation', 'math']",,
1846/C,1 second,256 megabytes,standard,standard,rudolf register a program competition follow r...,"['constructive algorithms', 'greedy', 'impleme...",,


In [None]:
def dollar_processing(arr):
    """Merge every three consecutive ``'$'`` items into one ``'$$$'`` item.

    The sequence is scanned left to right; items that are not part of a
    complete triple are copied unchanged.

    Args:
        arr: Sequence of tokens.

    Returns:
        A new list with the merged tokens.
    """
    merged = []
    skip = 0  # items still to skip because they belong to a merged triple
    for idx, item in enumerate(arr):
        if skip:
            skip -= 1
            continue
        # The two-item slice is shorter near the end, so it cannot match there.
        if item == '$' and tuple(arr[idx + 1:idx + 3]) == ('$', '$'):
            merged.append('$$$')
            skip = 2
        else:
            merged.append(item)
    return merged

In [None]:
from collections import Counter, defaultdict
from torchtext.data.utils import get_tokenizer

def tokenizing_sentences(sentences):
    """Tokenize each sentence with the torchtext "spacy" tokenizer.

    After tokenizing, three consecutive ``'$'`` tokens are merged back into
    ``'$$$'`` via ``dollar_processing``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one token list per input sentence.
    """
    spacy_tokenizer = get_tokenizer("spacy")
    return [dollar_processing(spacy_tokenizer(sentence)) for sentence in sentences]

In [None]:
# Tokenize every description; each cell then holds a list of tokens.
df['description'] = tokenizing_sentences(df['description'])

In [None]:
# Join each token list back into a single space-separated string.
df['description'] = df['description'].apply(list_to_string)
df.head()

In [None]:
# Write the processed frame to disk. The path is relative, so the file lands
# in the current working directory, not next to the raw input file.
df.to_csv('codeforce_processed_data.csv')