In [1]:
# pip install spacy

In [2]:
# python -m spacy download en_core_web_sm   NOT HERE, FROM CONDA CONSOLE

In [25]:
import spacy
import pandas as pd
import numpy as np
import os
from num2words import num2words

In [47]:
def is_numeric_str(s):
    try:
        # try converting to float
        float_value = float(s)
        return True
    except ValueError:
        try:
            # try converting to int
            int_value = int(s)
            return True
        except ValueError:
            return False

In [56]:
# Text preprocessing functions

# The strategy is to use functions without side effects - so do not modify the passes object itself, construct a new way
# that will be returned

nlp_model = spacy.load("en_core_web_sm")


def to_lowercase(text):
    return text.lower()

def remove_excessive_space(text):
    '''
    Remove excessive white spaces like " ", \n, \t from the beginning and ending of text
    
    :param text - input text; it's a native python string
    :return: the given text without spaces; 
    :rtype: built-in python string
   
    '''
    return text.strip()

print(remove_excessive_space("\n\n This is a text and this is another one \n \n \t"))


def remove_punctuations(words):
    '''
    Remove all the punctuations from the given text
    
    :param words: the input list that contains all the words and punctuations
    :return: a list with all words, without punctuations; 
    :rtype: built-in python list
    '''
    
    words_res = []
    for word in words:
        if word.is_punct is False:
            words_res.append(word)
    
    return words_res

def remove_stopwords(words):
    '''
    Remove all the stop words from the given text
    
    :param words: the input list that contains all the words
    :return: the given list, without stop words; 
    :rtype: built-in python list
    '''
    words_res = []
    for word in words:
        if word.is_stop is False:
            words_res.append(word)
    
    return words_res

def lemmatize_words(words):
    '''
    Apply lemmatization for a list of words. 
    
    :param words: the input list with words; every element is a spacy.tokens.token.Token object
    :return: a list constructed from the initial one but every with is lemmatized (converted to base form)
    :rtype: built-in python list, every element is a spacy.tokens.token.Token object
    '''
    
    words_res = []
    for word in words:
        words_res.append(word.lemma_)
    
    return words_res

def handle_numerical_values(word, method='text'):
    '''
   Decide what to do with numerical values (keep them or remove them)
   
   :param word:
   :param method:
   '''
    if method == 'text':
        word = re.sub(r'\d+', 'NUM', word)
    elif method == 'remove':
        word = re.sub(r'\d+', '', word)
    return word
   
def handle_rare_words_and_typos(words, threshold=2, replacement='[UNK]'):
    '''
   Decide if we want to replace rare words or not
   
   :param text:
   :param threshold:
   :param replacement:
   :return:
   :rtype:
   '''
    word_freq = {word: words.count(word) for word in set(words)}
    rare_words = [word for word, freq in word_freq.items() if freq <= threshold]
    
    processed_words = [replacement if word in rare_words else word for word in words]
    
    return processed_words
    

# part of speech for every word
def words_pos(words):
    words_res = []
    for word in words:
        words_res.append( (word, word.pos_) ) 
    
    return words_res

def get_tokens_from_raw_text(text, nlp_model):
    '''
    Convert a raw text to a built-in python list of spacy.tokens.token.Token object (tokens); 
    
    :param text: the input text; it's a native python string
    :param nlp_model: NLP model that is used to preprocess the text; it's a spacy.lang object
    :return: list of words extracted from the input text
    :rtype: built-in python list
    '''
    doc = nlp_model(text)
    words = []
    for token in doc:
        words.append(token)
        
    return words

def remove_junk_spaces(tokens, nlp_model):
    
    # convert spacy tokens to str tokens
    tokens = convert_spacy_tokens_to_str_tokens(tokens)
    
    # remove extra spaces with strip
    tokens = [remove_excessive_space(token) for token in tokens]
    
    
    junk_spaces = ['\n', '\t', '\r', '\v', '\f', '&nbsp;', '\xA0', '', ' ']
    
    # remove other junk spaces
    tokens = [token for token in tokens if token not in junk_spaces]
    
    tokens = convert_str_tokens_to_spacy_tokens(tokens, nlp_model)
    
    return tokens



def convert_years_to_spoken_words(tokens, nlp_model):
    # convert years to spoken words, eg. "1990" to 'nineteen ninety'
    # we consider years as integer values with 4 digits, and the value itself
    # between valid_year_min_value to valid_year_max_value
    
    valid_year_min_value = 1000
    valid_year_max_value = 2100
    
    # convert tokens from spacy entity to built-in string
    tokens = convert_spacy_tokens_to_str_tokens(tokens)
    new_tokens = []
    
    for token in tokens:
        if token.isnumeric() and len(token) == 4:
            year = int(token)
            if year >= valid_year_min_value and year <= valid_year_max_value:
                year_as_words = num2words(year, to = 'year')
                new_tokens.append(year_as_words)
            # logica
        else:
            # just append it like this
            new_tokens.append(token)
    
    new_tokens = convert_str_tokens_to_spacy_tokens(new_tokens, nlp_model)
    
    return new_tokens

def convert_numeric_values_to_spoken_words(tokens, nlp_model):
    # conver numerical values (eg. '54', '2.5') to spoken words
    
    new_tokens = []
    
    # convert tokens from spacy entity to built-in string
    tokens = convert_spacy_tokens_to_str_tokens(tokens)
    
    for token in tokens:
        if is_numeric_str(token):
            
            token_as_numeric = float(token)
            token_as_spoken_words = num2words(token_as_numeric)
            new_tokens.append(token_as_spoken_words)
            if token == '0.30':
                print(token_as_numeric)
        else:
            new_tokens.append(token)
    
    new_tokens = convert_str_tokens_to_spacy_tokens(new_tokens, nlp_model)
    
    return new_tokens
    

def convert_str_tokens_to_spacy_tokens(tokens, nlp_model):
    # convert string tokens back into spacy entities
    raw_text = ' '.join(tokens)
    tokens = get_tokens_from_raw_text(raw_text, nlp_model)
    
    return tokens

def convert_spacy_tokens_to_str_tokens(tokens):
    # convert tokens from spacy entity to built-in string
    tokens = [token.text for token in tokens]
    return tokens
    
    
# input_text = "The quick brown foxes are jumping over the lazy dogs. They were running through the forests, exploring the mysterious caves. I saw many interesting books on the shelves and decided to read them all. The children were playing happily in the parks, swinging on the swings and climbing on the jungle gym. Despite the challenges, they were determined to succeed in their endeavors."
# words_input = get_tokens_from_raw_text(input_text, nlp_model)
# print("Tokens:")
# print(words_input)
# print("-" * 15)
# words_input = remove_punctuations(words_input)
# print("Without punctuations:")
# print(words_input)
# print("-" * 15)
# words_input = remove_stopwords(words_input)
# print("Without stopwords:")
# print(words_input)
# print("-" * 15)
# words_input = lemmatize_words(words_input)
# print("After lemmatization:")
# print(words_input)
# print("-" * 25)

# # need to convert again to text and then to tokenize because the lemmatization convert words to built in string
# words_as_single_text = ' '.join(words_input)
# words_input = get_tokens_from_raw_text(words_as_single_text, nlp_model)
# words_and_pos = words_pos(words_input)
# print("Part of speech:")
# print(words_and_pos)
# print("-" * 15)


This is a text and this is another one


In [4]:
# IO functions
def read_txt_file(file_path):
    '''
    Return the content from the file from the given path. We assume the first line is the document title and the
    second line is document content
    
    :param file_path: path to the target file 
    :return: a dictionary with 2 entries: title and content of the file
    :rtype: built-in python dictionary
    '''
    result = dict()
    with open(file_path, 'r', encoding='utf-8') as file_obj:  
                result['title'] = file_obj.readline()
                result['content'] = file_obj.read()
    return result

In [5]:
def read_raw_data(main_directory_path):
    " read all files from all directories from the given path;  return a pandas df with 3 columns: document title, content and type (label) "
    df = pd.DataFrame(columns=['title','content','type'])
    directories = os.listdir(main_directory_path)
    
    new_files_contents = []
    
    for directory in directories:
        directory_path = main_directory_path + "\\" + directory
        files = os.listdir(directory_path)
        for file in files:
            file_path = directory_path + "\\" + file
            file_content = read_txt_file(file_path)
    
            whole_file_content_as_dict = pd.DataFrame({'title':file_content['title'], 'content':file_content['content'], 'type':directory}, index = [0])
            new_files_contents.append(whole_file_content_as_dict)
                   
    df = pd.concat([df] + new_files_contents, ignore_index=True)
            
    return df

data_root_path = "data"
df = read_raw_data(data_root_path)
df

Unnamed: 0,title,content,type
0,Lufthansa flies back to profit\n,\nGerman airline Lufthansa has returned to pro...,business
1,Winn-Dixie files for bankruptcy\n,\nUS supermarket group Winn-Dixie has filed fo...,business
2,US economy still growing says Fed\n,\nMost areas of the US saw their economy conti...,business
3,Saab to build Cadillacs in Sweden\n,"\nGeneral Motors, the world's largest car make...",business
4,Bank voted 8-1 for no rate change\n,\nThe decision to keep interest rates on hold ...,business
...,...,...,...
995,Mobile games come of age\n,\nThe BBC News website takes a look at how gam...,technology
996,California sets fines for spyware\n,\nThe makers of computer programs that secretl...,technology
997,Web helps collect aid donations\n,\nThe web is helping aid agencies gather resou...,technology
998,Mobiles rack up 20 years of use\n,\nMobile phones in the UK are celebrating thei...,technology


In [59]:
# dummy text classification

# load
nlp_model = spacy.load("en_core_web_sm")

first_doc = df.iloc[0]['content']

# first_doc = remove_excessive_space(first_doc)

# tokens = get_tokens_from_raw_text(first_doc, nlp_model)
# print(first_doc)
# print(tokens)

print(first_doc)

def custom_tokenizer(raw_text, nlp_model):
    
    # convert to lower case
    #raw_text = to_lowercase(raw_text)
    
    # remove extra spaces in the first phase
    raw_text = remove_excessive_space(raw_text)
    
    # get tokens
    tokens = get_tokens_from_raw_text(raw_text, nlp_model)
    
    # remove junk extra spaces
    tokens = remove_junk_spaces(tokens, nlp_model)
      
    # handle years value - convert years as numerical value into spoken words
    tokens = convert_years_to_spoken_words(tokens, nlp_model)
    
    # convert currency symbols into spoken words - MAYBE NOT, just removed them
    
    # convert articulated date into spoken words (e.g '3rd' -> 'third')
    
    # convert the left numerical values (int, float) into spoken words
    tokens = convert_numeric_values_to_spoken_words(tokens, nlp_model)
    
    # remove punctuations
    tokens = remove_punctuations(tokens)
 
    # remove stop words
    tokens = remove_stopwords(tokens)

    # lemmatization
    tokens = lemmatize_words(tokens)
    # after this, the tokens are not longer spacy.tokens.token.Token, but built-in java string
    

    return tokens


res_tokens = custom_tokenizer(first_doc, nlp_model)

#res_tokens = np.array(res_tokens)
print(type(res_tokens))
print(res_tokens)


# from num2words import num2words

# print(num2words(1990, to = 'year'))
# print(num2words(1990))


German airline Lufthansa has returned to profit in 2004 after posting huge losses in 2003.

In a preliminary report, the airline announced net profits of 400m euros ($527.61m; £274.73m), compared with a loss of 984m euros in 2003. Operating profits were at 380m euros, ten times more than in 2003. Lufthansa was hit in 2003 by tough competition and a dip in demand following the Iraq war and the killer SARS virus. It was also hit by troubles at its US catering business. Last year, Lufthansa showed signs of recovery even as some European and US airlines were teetering on the brink of bankruptcy. The board of Lufthansa has recommended paying a 2004 dividend of 0.30 euros per share. In 2003, shareholders did not get a dividend. The company said that it will give all the details of its 2004 results on 23 March.

0.3
<class 'list'>
['german', 'airline', 'Lufthansa', 'return', 'profit', 'thousand', 'post', 'huge', 'loss', 'thousand', 'preliminary', 'report', 'airline', 'announce', 'net', 'prof

In [32]:
text = df.iloc[0]['content']
tokens = get_tokens_from_raw_text(text, nlp_model)
tokens = remove_junk_spaces(tokens, nlp_model)

print(tokens)

[German, airline, Lufthansa, has, returned, to, profit, in, 2004, after, posting, huge, losses, in, 2003, ., 

 , In, a, preliminary, report, ,, the, airline, announced, net, profits, of, 400, m, euros, (, $, 527.61, m, ;, £, 274.73, m, ), ,, compared, with, a, loss, of, 984, m, euros, in, 2003, ., Operating, profits, were, at, 380, m, euros, ,, ten, times, more, than, in, 2003, ., Lufthansa, was, hit, in, 2003, by, tough, competition, and, a, dip, in, demand, following, the, Iraq, war, and, the, killer, SARS, virus, ., It, was, also, hit, by, troubles, at, its, US, catering, business, ., Last, year, ,, Lufthansa, showed, signs, of, recovery, even, as, some, European, and, US, airlines, were, teetering, on, the, brink, of, bankruptcy, ., The, board, of, Lufthansa, has, recommended, paying, a, 2004, dividend, of, 0.30, euros, per, share, ., In, 2003, ,, shareholders, did, not, get, a, dividend, ., The, company, said, that, it, will, give, all, the, details, of, its, 2004, results, on, 2

In [53]:
# first_row = df.iloc[0]
# content = first_row['content']
# doc = nlp_model(content)
# for sentence in doc.sents:
#     print(sentence)

print(type(num2words(1990, to = 'year')))
print(num2words(2004 , to = 'year'))
print(num2words(23, to = 'ordinal'))
print(num2words(0.30))

<class 'str'>
two thousand and four
twenty-third
zero point three


In [17]:
a = spacy.tokens.token.Token("da")
print(a)

TypeError: __cinit__() takes exactly 3 positional arguments (1 given)

In [1]:
from transformers import BertTokenizer

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize the text
text = "The fox is running run. I saw a running fox."
#text = "I left my phone on the left side of the room."
encoded_text = tokenizer.encode(text)
tokens = tokenizer.tokenize(tokenizer.decode(encoded_text))

# Convert tokens to IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print the tokenized text and corresponding token IDs
# print("Tokenized Text:", tokens)
# print("Token IDs:", input_ids)

for token, token_id in zip(tokens, input_ids):
    print(token,token_id)

[CLS] 101
The 1109
fox 17594
is 1110
running 1919
run 1576
. 119
I 146
saw 1486
a 170
running 1919
fox 17594
. 119
[SEP] 102


In [5]:
print(type(tokens[0]))

<class 'str'>
