In [1]:
# pip install spacy

In [None]:
# python -m spacy download en_core_web_sm   NOT HERE, FROM CONDA CONSOLE

In [2]:
import spacy
import pandas as pd
import os

In [6]:
# Text preprocessing functions

# The strategy is to use functions without side effects - so do not modify the passed object itself; construct a new one
# that will be returned

def remove_excessive_space(text):
    '''
    Trim surrounding white space (blanks, new lines, tabs) from both ends of the text

    :param text: input text; it's a native python string
    :return: the given text without leading and trailing white space; inner spacing is kept as is
    :rtype: built-in python string
    '''
    trimmed_text = text.strip()
    return trimmed_text

# Quick demo: the surrounding new lines, blanks and tabs are stripped, the inner spacing is left untouched
print(remove_excessive_space("\n\n This is a text and this is another one \n \n \t"))


def remove_punctuations(text, nlp_model):
    '''
    Drop every punctuation token from the given text

    :param text: the input text; it's a native python string
    :param nlp_model: NLP model that is called on the text to tokenize it; each token it yields
                      must expose the attributes text and is_punct
    :return: the texts of the remaining tokens, joined by single spaces
    :rtype: built-in python string
    '''
    kept_tokens = []
    for token in nlp_model(text):
        # punctuation tokens are simply skipped; everything else is kept in order
        if token.is_punct:
            continue
        kept_tokens.append(token.text)

    return ' '.join(kept_tokens)

# Demo of remove_punctuations. The loaded pipeline is bound to the notebook-level name nlp_model,
# which the statements further down in this notebook also use.
nlp_model = spacy.load("en_core_web_sm")
input_text = "Hello, this is a text! It contains punctuation marks. Really?"
res = remove_punctuations(input_text, nlp_model)
print(res)


def remove_stopwords(text, nlp_model):
    '''
    Drop every stop word token from the given text

    :param text: the input text; it's a native python string
    :param nlp_model: NLP model that is called on the text to tokenize it; each token it yields
                      must expose the attributes text and is_stop
    :return: the texts of the remaining tokens, joined by single spaces
    :rtype: built-in python string
    '''
    parsed = nlp_model(text)
    # keep the original token order, leaving out the tokens flagged as stop words
    return ' '.join(tok.text for tok in parsed if not tok.is_stop)

# Demo of remove_stopwords; reuses the nlp_model loaded for the punctuation demo above
input_text = "A sentence that contain some stopwords and this fact is one that bother the ML process"
res = remove_stopwords(input_text, nlp_model)
print(res)

This is a text and this is another one
Hello this is a text It contains punctuation marks Really
sentence contain stopwords fact bother ML process


In [5]:
# IO functions
def read_txt_file(file_path):
    '''
    Read a UTF-8 text document from the given path. The first line is taken as the document title and
    everything after it as the document content. Nothing is stripped: the title keeps its trailing
    line break and the content is returned exactly as read.

    :param file_path: path to the target file
    :return: a dictionary with 2 entries: title and content of the file
    :rtype: built-in python dictionary
    '''
    with open(file_path, 'r', encoding='utf-8') as handle:
        title_line = handle.readline()
        # read() continues from where readline() stopped, so it returns the rest of the file
        remaining_text = handle.read()

    return {'title': title_line, 'content': remaining_text}

In [7]:
def read_raw_data(main_directory_path):
    '''
    Read all files from all sub-directories of the given path. The name of each sub-directory is used
    as the type (label) of the documents it contains; each file is parsed with read_txt_file.

    :param main_directory_path: path to the directory that holds one sub-directory per document type
    :return: one row per file, with 3 columns: document title, content and type (label);
             an empty frame with the same 3 columns when no file is found
    :rtype: pandas DataFrame
    '''
    records = []

    # os.listdir returns entries in arbitrary order; sorting makes the row order reproducible
    for directory in sorted(os.listdir(main_directory_path)):
        # os.path.join uses the separator of the current OS (a hard-coded "\\" only works on Windows)
        directory_path = os.path.join(main_directory_path, directory)
        if not os.path.isdir(directory_path):
            # a stray file next to the label directories is not a label - skip it instead of crashing
            continue

        for file in sorted(os.listdir(directory_path)):
            file_path = os.path.join(directory_path, file)
            if not os.path.isfile(file_path):
                # nested directories cannot be opened as documents
                continue

            file_content = read_txt_file(file_path)
            records.append({
                'title': file_content['title'],
                'content': file_content['content'],
                'type': directory,
            })

    # Build the frame once from all records instead of concatenating one single-row frame per file
    return pd.DataFrame(records, columns=['title', 'content', 'type'])

# Relative path: resolved against the working directory of the notebook kernel
data_root_path = "data"
# One row per document; the resulting frame is bound to the notebook-level name df used by later cells
df = read_raw_data(data_root_path)
# Last expression of the cell, so the frame is rendered as the cell output
df

Unnamed: 0,title,content,type
0,Lufthansa flies back to profit\n,\nGerman airline Lufthansa has returned to pro...,business
1,Winn-Dixie files for bankruptcy\n,\nUS supermarket group Winn-Dixie has filed fo...,business
2,US economy still growing says Fed\n,\nMost areas of the US saw their economy conti...,business
3,Saab to build Cadillacs in Sweden\n,"\nGeneral Motors, the world's largest car make...",business
4,Bank voted 8-1 for no rate change\n,\nThe decision to keep interest rates on hold ...,business
...,...,...,...
995,Mobile games come of age\n,\nThe BBC News website takes a look at how gam...,technology
996,California sets fines for spyware\n,\nThe makers of computer programs that secretl...,technology
997,Web helps collect aid donations\n,\nThe web is helping aid agencies gather resou...,technology
998,Mobiles rack up 20 years of use\n,\nMobile phones in the UK are celebrating thei...,technology


In [8]:
# Load the spaCy pipeline "en_core_web_sm" and bind it to the notebook-level name nlp_model.
# NOTE(review): the text preprocessing cell already loads the same model into nlp_model, so this
# reload looks redundant - confirm before removing it.
nlp_model = spacy.load("en_core_web_sm")
# Last expression of the cell: shows the class of the loaded pipeline object
type(nlp_model)

spacy.lang.en.English

In [9]:
# Sentence segmentation check on a single document: take the first row of the frame ...
first_row = df.iloc[0]
content = first_row['content']
# ... run the pipeline on its raw content (not stripped, so leading line breaks are still in there) ...
doc = nlp_model(content)
# ... and print the sentences found by the pipeline, one per print call
for sentence in doc.sents:
    print(sentence)



German airline Lufthansa has returned to profit in 2004 after posting huge losses in 2003.


In a preliminary report, the airline announced net profits of 400m euros ($527.61m; £274.73m), compared with a loss of 984m euros in 2003.
Operating profits were at 380m euros, ten times more than in 2003.
Lufthansa was hit in 2003 by tough competition and a dip in demand following the Iraq war and the killer SARS virus.
It was also hit by troubles at its US catering business.
Last year, Lufthansa showed signs of recovery even as some European and US airlines were teetering on the brink of bankruptcy.
The board of Lufthansa has recommended paying a 2004 dividend of 0.30 euros per share.
In 2003, shareholders did not get a dividend.
The company said that it will give all the details of its 2004 results on 23 March.

