In [1]:
# Here we are using the Tika packages to import word documents
# If you want to learn more, please go to: https://tika.apache.org/
import tika

from tika import parser

# These are standard-library modules used for processing the data
import re
import os
import datetime
import io
import string
import time

# Data handling
import pandas as pd
import numpy as np

# This package works for text processing
import spacy
# Load the English spaCy pipeline once at import time; preprocess() below reuses this global `nlp` object
nlp = spacy.load("en_core_web_sm")



# Data importing

In [2]:
def preprocess(text, 
               min_token_len = 2, 
               irrelevant_pos = ['PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']): 
    """
    Run the text through the global spaCy pipeline (nlp), drop uninformative tokens,
    and return the lemmas of the remaining tokens joined by single spaces.
    
    Parameters
    -------------
    text : (str) 
        the text to be preprocessed
    min_token_len : (int) 
        tokens with fewer characters than this are dropped
    irrelevant_pos : (list) 
        a list of irrelevant pos tags; tokens carrying one of them are dropped
    
    Returns
    -------------
    (str) the preprocessed text, 
        "missing value" when the pipeline cannot process the input, or
        "no info" when at most one token survives the filters
    """
    # At most this many lemmas are kept per text; anything after them is ignored
    max_kept_tokens = 301
    
    try:
        doc = nlp(text)
    except Exception:
        # Best effort: input the pipeline cannot parse (presumably non-string values such as NaN)
        # is flagged with a placeholder instead of aborting a whole DataFrame.apply
        return "missing value"
    
    skipped_pos = set(irrelevant_pos)
    results = []
    
    for token in doc:
        
        # Enough lemmas collected
        if len(results) >= max_kept_tokens:
            break
        
        # Irrelevant POS
        if token.pos_ in skipped_pos:
            continue
            
        # Stop words
        if token.is_stop:
            continue
        
        # Word length    
        if len(token) < min_token_len:
            continue
            
        # Emails and URLs
        if token.like_email or token.like_url:
            continue
            
        results.append(token.lemma_)
   
    # A single surviving lemma is treated the same as none at all
    if len(results) <= 1:
        return "no info"
    
    return " ".join(results)

In [3]:
def _header_value(label, text):
    '''
    Returns what follows "<label>: " up to the end of that line, for the first
    occurrence of the label in text, or NaN when the label is not found.
    '''
    found = re.search(re.escape(label) + r': (.*)\n', text)
    return np.nan if found is None else found.group(1)


def _interview_date(raw_date):
    '''
    Converts an "mm/dd/YYYY" string into a datetime.date; any other value is returned unchanged.
    '''
    if not isinstance(raw_date, str):
        # NaN placeholder of a missing "Date: " line
        return raw_date
    try:
        return datetime.datetime.strptime(raw_date, '%m/%d/%Y').date()
    except ValueError:
        # Dates written in another format are kept as plain text instead of failing
        return raw_date


def _corpus_lines(text, beginning):
    '''
    Splits text into lines, starting at the first match of beginning (if given and found)
    and skipping the first 9 lines from there.
    '''
    # Number of leading lines left out of the corpus
    # (presumably the metadata header of each document -- confirm against the files)
    skipped_lines = 9

    if beginning is None:
        return text.split("\n")[skipped_lines:]

    start = re.search(beginning, text)
    if start is None:
        # Marker not found: every line of the document is kept, nothing is skipped
        return text.split("\n")

    return text[start.start():].split("\n")[skipped_lines:]


def text_importing(directory, beginning):
    '''
    Imports the text data in a given folder, splits the text in paragraphs,
    and returns a pandas dataset with the features
    
    The root folder is expected to contain sub-folders, and each sub-folder the documents
    to parse. Anything in the root that is not a folder, and anything in a sub-folder that
    is not a regular file, is ignored. Entries are visited in alphabetical order.
    
    Parameters:
    ---------------------
    directory (string):
        root folder path (with or without a trailing separator)
    
    beginning (string):
        regular expression that indicates the beginning of the text. If it is found, the text
        is cropped there and the next 9 lines are skipped; if it is not found, the whole text
        is kept. If None, the text starts at position 0 and the first 9 lines are skipped.
        
    Returns:
    ---------------------
    pd.Dataframe:
        imported text with features (Document_name, Interviewee, Interviewer, Notetaker, Date, 
        Considerations, Corpus). Corpus holds a list of (position, line) tuples per document; 
        features that are missing in a document are NaN.
    
    Raises:
    ---------------------
    NotADirectoryError (a subclass of Exception):
        if directory is not an existing folder
    '''
    
    dict_text = {
        'Document_name': [],
        'Interviewee': [],
        'Interviewer': [],
        'Notetaker': [],
        'Date': [],
        'Considerations':[],
        'Corpus': []
    }
    
    # Column name -> label that precedes the value inside the document
    header_labels = {
        'Interviewee': 'Interviewee',
        'Interviewer': 'Interviewer',
        'Notetaker': 'Notetaker',
        'Considerations': 'Procedural considerations',
    }
    
    # Directory type handling
    if not os.path.isdir(directory):
        raise NotADirectoryError("Directory does not exist")
    
    # Iterating through the files of the directory
    for subdirectory in sorted(os.listdir(directory)):
        folder_name = os.path.join(directory, subdirectory)
        
        # Stray files next to the sub-folders cannot be listed, skip them
        if not os.path.isdir(folder_name):
            continue
       
        for filename in sorted(os.listdir(folder_name)):
            file_path = os.path.join(folder_name, filename)
            if not os.path.isfile(file_path):
                continue
            
            file_data = parser.from_file(file_path)
            # The parser may return no content at all; treat that as an empty document
            text = file_data.get('content') or ""

            # Filling the features
            dict_text['Document_name'].append(filename)

            # Extracting interviewee, interviewer, notetaker and procedural considerations
            for column, label in header_labels.items():
                dict_text[column].append(_header_value(label, text))

            # Extracting the date
            dict_text['Date'].append(_interview_date(_header_value('Date', text)))

            # Cropping the beginning of the text and numbering the remaining lines
            dict_text['Corpus'].append(list(enumerate(_corpus_lines(text, beginning))))

    return pd.DataFrame(dict_text)
        

In [4]:
# These 2 helpers unpack the (position, paragraph) tuples stored in the Corpus column
def extract_first(word_tuple):
    """Return the element stored at index 0 of word_tuple."""
    return word_tuple[0]

def extract_second(word_tuple):
    """Return the element stored at index 1 of word_tuple."""
    return word_tuple[1]

In [None]:
# First argument: the path where the documents are, PLEASE CHANGE ACCORDINGLY
# Second argument: the string that marks where the text of each document begins
text = text_importing(
    "../data/",
    "Future of Peacekeeping – Interviews",
)

In [None]:
# Preview only the first documents: the Corpus column holds the full list of lines of each one
text.head()

In [8]:
# One row per paragraph: every Corpus element is a (position, paragraph) tuple.
# A document whose Corpus list is empty explodes into a single NaN row that cannot be unpacked, so it is dropped.
text_by_paragraph = text.explode('Corpus').dropna(subset=['Corpus'])
text_by_paragraph['Order'] = text_by_paragraph['Corpus'].apply(extract_first)
text_by_paragraph['Corpus'] = text_by_paragraph['Corpus'].apply(extract_second)
text_by_paragraph['Length_corpus'] = text_by_paragraph['Corpus'].apply(len)

# Paragraph identifier of the form "<document name>-<position>"
text_by_paragraph['key'] = (
    text_by_paragraph['Document_name'].astype(str) + '-' + text_by_paragraph['Order'].astype(str)
)

text_by_paragraph['Interviewee'] = text_by_paragraph['Interviewee'].fillna("No interviewee")

In [None]:
# Quick look at the first 5 paragraphs and their derived columns
text_by_paragraph.head(5)

In [10]:
# Keep only paragraphs longer than 30 characters and add their preprocessed version.
# query + assign builds a new frame, so no column is written onto a filtered slice (no SettingWithCopyWarning).
text_by_paragraph = (
    text_by_paragraph
    .query('Length_corpus > 30')
    .assign(Preprocessed_text=lambda frame: frame['Corpus'].apply(preprocess))
)

In [11]:
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs("../processed_data", exist_ok=True)
text_by_paragraph.to_csv("../processed_data/text_by_paragraph.csv")
text.to_csv("../processed_data/text.csv")

## Dictionary

In [12]:
def _dictionary_entries(text, beginning):
    '''
    Crops text after the first match of beginning (if given and found) and splits it
    into numbered entries, i.e. at every "<one or two digits>. " separator.
    '''
    if beginning is not None:
        text_start = re.search(beginning, text)
        if text_start is not None:
            # Two characters are skipped from the start of the match, the length of the
            # "1." marker this notebook passes as beginning
            text = text[text_start.start() + 2:]

    return re.split(r"[0-9][0-9]?\. ", text)


def dict_importing(directory, beginning):
    '''
    Imports the dictionary of words to be used
    and returns a pandas dataset with the features
    
    Every regular file of the folder is parsed, in alphabetical order, and its entries are
    added to the result. An entry is expected to look like "<category>a. <word>, <word>, ...":
    the text before the first "a." is the category and the comma-separated items after it are
    its words. An entry without "a." contributes a single row with an empty word and group.
    
    Parameters:
    ---------------------
    directory (string):
        root folder path (with or without a trailing separator)
    
    beginning (string):
        regular expression that indicates the beginning of the text, 
        if None (or not found) the text starts at position 0.
    
    Returns:
    ---------------------
    pd.Dataframe:
        imported dictionary with features (word, group). Each category appears once as its own
        word, followed by one row per word of that category. Values are not stripped of 
        surrounding spaces.
    
    Raises:
    ---------------------
    NotADirectoryError (a subclass of Exception):
        if directory is not an existing folder
    '''
    
    dict_text = {
        'word': [],
        'group': [],
    }
    
    # Directory type handling
    if not os.path.isdir(directory):
        raise NotADirectoryError("Directory does not exist")
        
    # Iterating through the files of the directory
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        
        content = parser.from_file(file_path).get('content')
        if content is None:
            # Nothing could be extracted from this file
            continue
        
        # Flatten the text: drop line breaks and tabs, turn non-breaking spaces into plain spaces
        text = re.sub(r"[\n\t]", "", content)
        text = re.sub(r"[\xa0]", " ", text)
  
        for entry in _dictionary_entries(text, beginning):
            word_start = re.search(r"a\.", entry)
            
            if word_start is None:
                category = ""
                subword_list = []
            else:
                category = entry[:word_start.start()]
                # The words start 3 characters after the match, i.e. after "a. "
                subword_list = re.split(r"[,] ?", entry[word_start.start() + 3:])
            
            # The category itself is also a dictionary word of its own group
            dict_text['word'].append(category)
            dict_text['group'].append(category)
            
            dict_text['word'].extend(subword_list)
            dict_text['group'].extend([category] * len(subword_list))
            
    return pd.DataFrame(dict_text)

In [15]:
# Parse the dictionary files; "1." marks the start of the first numbered entry
dictionary_df = dict_importing(
    directory="../dic/",
    beginning="1.",
)

In [None]:
# Preview the first dictionary rows instead of dumping the whole table
dictionary_df.head()

In [17]:
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs("../processed_data", exist_ok=True)
dictionary_df.to_csv("../processed_data/dictionary.csv")