# Data Preprocessing

In [1]:
# Loading necessary libraries
import pandas as pd
import string
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Uncomment the two lines below if the packages 'wordnet' & 'omw-1.4' are not yet available
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ma/ma_ma/ma_fvogl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/ma/ma_ma/ma_fvogl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# functions used for preprocessing

def tokenize_words(plain_text):
    """Tokenize ``plain_text`` with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(plain_text)

def remove_punctuations(plain_text):
    """Return ``plain_text`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced, so the text on either side of a
    punctuation mark is joined together (``"a-b"`` becomes ``"ab"``).
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return plain_text.translate(deletion_table)

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(plain_text):
    """Drop every whitespace-separated token of ``plain_text`` found in ``STOPWORDS``.

    The remaining tokens are joined back together with single spaces.
    The membership test is exact, so it is case-sensitive.
    """
    kept_tokens = []
    for token in plain_text.split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def remove_spec_char(plain_text):
    """Replace special characters with spaces and collapse whitespace.

    Every character that is not an ASCII letter or digit is replaced by a
    space; afterwards each run of whitespace is collapsed to a single space.
    A single leading or trailing space can remain in the result.

    Args:
        plain_text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings keep the backslash in '\s' for the regex engine; a plain
    # '\s' literal is an invalid string escape that newer Python versions
    # warn about.
    plain_text = re.sub(r'[^a-zA-Z0-9]', ' ', plain_text)
    plain_text = re.sub(r'\s+', ' ', plain_text)
    return plain_text

lemmatizer = WordNetLemmatizer()
wordnet_map = {
    'N': wordnet.NOUN,
    'V': wordnet.VERB,
    'J': wordnet.ADJ,
    'R': wordnet.ADV,
}
def lemmatize_word(plain_text):
    """Lemmatize each whitespace-separated token of ``plain_text``.

    Tokens are POS-tagged first. The first letter of each tag is looked up in
    ``wordnet_map`` to pick the WordNet part of speech, falling back to noun
    for any other tag. The lemmas are joined with single spaces.
    """
    lemmas = []
    for token, tag in pos_tag(plain_text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return ' '.join(lemmas)

ps = PorterStemmer()
def stem_words(plain_text):
    """Apply the Porter stemmer to each whitespace-separated token of ``plain_text``.

    The stemmed tokens are joined back together with single spaces.
    """
    stemmed_tokens = map(ps.stem, plain_text.split())
    return ' '.join(stemmed_tokens)

In [None]:
# Log file containing the index of the last successfully preprocessed chunk.
# If the cluster shuts down or an error occurs, preprocessing resumes after
# that chunk instead of starting again from the beginning.
logfile_path = 'data/chunks_log.txt'
try:
    # The context manager closes the handle even when int() raises.
    # A malformed log is deliberately left to raise ValueError: treating it as
    # "no progress" would restart at chunk 0 and overwrite the existing output.
    with open(logfile_path, "r") as f:
        last_chunk_idx = int(f.read())
except FileNotFoundError:
    # No log file yet: nothing has been processed so far.
    last_chunk_idx = -1

# Number of JSON lines loaded per chunk.
CHUNK_SIZE = 100000
# Total shown in the progress message only; presumably the number of chunks
# in this corpus at CHUNK_SIZE lines each -- adjust if the corpus changes.
TOTAL_CHUNKS = 65

# ========= Generate iterator loading CHUNK_SIZE lines each time =========
df_iterator = pd.read_json('data/trec_corpus_20220301_plain.json', lines=True, chunksize=CHUNK_SIZE)

# ========= iterate over chunks =========
for i, df_chunk in enumerate(df_iterator):
    # skip chunk if preprocessing is already done for this chunk
    if i <= last_chunk_idx:
        continue

    # progress indication
    print(f'chunk {i+1} / {TOTAL_CHUNKS}...', end=' ')

    # ========= Dropping url-column =========
    df_chunk = df_chunk.drop(columns=['url'])

    # ========= Concatenating title column with plain-text column =========
    # The combined text is added as a new column and renamed afterwards so
    # that the resulting 'plain' column ends up last in the frame.
    df_chunk['plain-text'] = df_chunk['title'].astype(str) + ' ' + df_chunk['plain'].astype(str)
    df_chunk = df_chunk.drop(columns=['title', 'plain'])
    df_chunk = df_chunk.rename(columns={'plain-text': 'plain'})

    # ========= Lower case whole column =========
    print('lowercasing...', end=' ')
    df_chunk['plain'] = df_chunk['plain'].str.lower()

    # ========= Tokenization (disabled) =========
    #df_chunk['plain'] = df_chunk['plain'].apply(tokenize_words)

    # ========= Removing punctuations =========
    print('removing punctuation...', end=' ')
    df_chunk['plain'] = df_chunk['plain'].apply(remove_punctuations)

    # ========= Removing stopwords =========
    # NOTE(review): punctuation is stripped before this step, so stopwords that
    # contain an apostrophe (e.g. "don't" -> "dont") presumably no longer match
    # STOPWORDS -- confirm that this order is intended.
    print('removing stopwords...', end=' ')
    df_chunk['plain'] = df_chunk['plain'].apply(remove_stopwords)

    # ========= Removing special characters =========
    print('removing special chars...', end=' ')
    df_chunk['plain'] = df_chunk['plain'].apply(remove_spec_char)

    # ========= Lemmatization (disabled) =========
    #print('lemmatization...', end=' ')
    #df_chunk['plain'] = df_chunk['plain'].apply(lemmatize_word)

    # ========= Stemming =========
    print('stemming...', end=' ')
    df_chunk['plain'] = df_chunk['plain'].apply(stem_words)

    # ========= write to file =========
    print('write to file.')
    # The first chunk creates the file and writes the header; later chunks append.
    is_first_chunk = i == 0
    df_chunk.to_csv(
        "data/preprocessed_corpus.csv",
        index=False,
        header=is_first_chunk,
        mode='w' if is_first_chunk else 'a')

    # write chunk index to log file
    # NOTE(review): if the run is interrupted after the CSV write but before
    # this log update, the same chunk is appended again on resume.
    with open(logfile_path, "w") as f:
        f.write(str(i))

chunk 43 / 65... lowercasing... removing punctuation... removing stopwords... removing special chars... stemming... 