# Data Preprocessing

In [None]:
# import json

In [None]:
# Disabled draft (kept as a bare string literal, so none of it executes):
# reads the JSON-lines corpus one line at a time and appends rows to a CSV file,
# limited to 5 documents by the `range(5)` loop.
# NOTE(review): inside the draft `doc_id` is never assigned, the CSV header is
# written without a trailing newline, the output is opened in append mode (the
# header would be written again on every run) and `json` is only imported in a
# commented-out line -- fix these before re-enabling.
'''
with open('data/trec_corpus_20220301_plain.json') as read_from_file:
    with open('data/preprocessed_corputs.csv', 'a') as write_to_file:
        
        # write header of csv file
        write_to_file.write('id, text')
        doc = read_from_file.readline()

        for i in range(5): # while doc:
            
            # parse json
            doc_parsed = json.loads(doc)
            doc_text = doc_parsed['title'] + ' ' + doc_parsed['plain']
            
            print(doc_parsed)
            print('\n\n############\n\n')

            # do preprocessing
            preprocessed_text = 'test test test test test'

            # write preprocessed document to file (append)
            write_to_file.write(f'{doc_id}, {preprocessed_text}\n')

            # read next line
            doc = read_from_file.readline()
'''

# Approach of Abda
- Loading data
- Preprocessing first 5 lines to save computation
- Appending preprocessed data into a dictionary / dataframe
- Saving finished data after preprocessing into csv or feather file for continuous analysis and evaluation



In [5]:
# Loading necessary libraries
import pandas as pd
import nltk

In [16]:
# Loading dataset -> only the first 100 rows (see `nrows`) to keep experiments cheap
df_prep = pd.read_json('/Users/abderrahmanecharrade/Desktop/Uni-Mannheim-Sonder/04_Semester/IRWS/Data/trec_corpus_20220301_plain.json', lines=True, nrows=100)

# Dropping url-column (`columns=` already names the axis, so no extra `axis=` is needed)
df_prep.drop(columns=['url'], inplace=True)
display(df_prep.head())

# Looking for missing / empty values
# NOTE: the counts below cover every loaded row (`nrows` above), not only 10 rows
print('\nCheck first stats of first 10 rows')
print('------------------------------------')
# sort_values has to be *called*; without the parentheses only the bound method object is printed
print(df_prep.isna().sum().sort_values())

# Concatenating title column with plain-text column
df_prep['plain-text'] = df_prep['title'].astype(str) + ' ' + df_prep['plain'].astype(str)
# Replace both source columns by the merged text, stored again under the name 'plain'
df_prep.drop(columns=['title', 'plain'], inplace=True)
df_prep.rename(columns={'plain-text': 'plain'}, inplace=True)
display(df_prep.head())

Unnamed: 0,id,title,plain
0,12148915,Keith Osik,"Keith Richard Osik (born October 22, 1968), is..."
1,16752449,"Swansons Landing, Texas",Swansons Landing is a settlement and former in...
2,31967453,Mike Potts,Mike or Michael Potts may refer to:\n Michael ...
3,47436994,Shuker,Shuker is a surname. Notable people with the s...
4,13924699,William Clark (inventor),William Clark (17 March 1821 – 22 January 1880...



Check first stats of first 10 rows
------------------------------------
<bound method Series.sort_values of id       0
title    0
plain    0
dtype: int64>


## __Cleaning and preprocessing textual data__
- Converting to lower case
- How to handle missing data (Not necessary...?)
- Removing punctuations
- Removing stopwords
- Tokenizing data (especially last column with plain text)
- Normalizing data (Stemming / Lemmatization)



In [None]:
# ========= Normalise case: lower-case the whole text column =========
df_prep['plain'] = df_prep['plain'].str.lower()
# Shuffle all rows (frac=1) and show five of them as a quick sanity check
df_prep.sample(frac=1).head(5)

In [None]:
# The loading cell merges 'title' into 'plain' and drops the 'title' column, so
# indexing df_prep['title'] raises a KeyError afterwards. Preview the title
# column only when it still exists, otherwise the merged text column.
df_prep['title' if 'title' in df_prep.columns else 'plain'].sample(frac=1).head()

In [None]:
# ========= Tokenization =========
from nltk import word_tokenize
# Pick the text of the first document by column name: after the title/plain
# merge the frame only holds the columns 'id' and 'plain', so the positional
# column index 2 is out of bounds.
test_doc = df_prep['plain'].iloc[0]
test_doc_tokens = word_tokenize(test_doc)
print(test_doc_tokens)

def tokenize_words(plain_text):
    """Tokenize *plain_text* with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(plain_text)

# Tokenize every document; the result is kept in a separate Series
df_prep_tokenized = df_prep['plain'].apply(tokenize_words)


In [None]:
# ========= Removing punctuations =========
import string
def remove_punctuations(plain_text):
    """Return *plain_text* with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return plain_text.translate(deletion_table)

# Strip punctuation from every document in place
df_prep['plain'] = df_prep['plain'].apply(remove_punctuations)

In [None]:
# ========= Removing stopwords =========
from nltk.corpus import stopwords
# Punctuation (apostrophes included) is deleted from the text before stopwords
# are removed, so a contraction such as "don't" arrives here as "dont". Keep the
# apostrophe-free spelling of every NLTK stopword as well so those still match.
STOPWORDS = {
    variant
    for word in stopwords.words('english')
    for variant in (word, word.replace("'", ''))
}


def remove_stopwords(plain_text, stop_words=None):
    """Return *plain_text* without its stopwords.

    The text is split on whitespace, every token contained in the stopword
    collection is dropped and the remaining tokens are joined with single
    spaces.

    Args:
        plain_text: Text to filter.
        stop_words: Optional collection of words to drop. Defaults to the
            module-level ``STOPWORDS`` set.

    Returns:
        The filtered text as one space-separated string.
    """
    if stop_words is None:
        # Looked up at call time so the module-level set can be redefined later.
        stop_words = STOPWORDS
    return ' '.join(word for word in plain_text.split() if word not in stop_words)

# Drop stopwords from every document in place
df_prep['plain'] = df_prep['plain'].apply(remove_stopwords)



In [None]:
# ========= Removing special characters =========
import re

def remove_spec_char(plain_text):
    """Replace special characters by spaces and collapse whitespace runs.

    Every character that is not an ASCII letter or digit becomes a space;
    afterwards each run of whitespace is reduced to a single space. Leading
    and trailing spaces are not stripped.

    Args:
        plain_text: Text to clean.

    Returns:
        The cleaned text.
    """
    # Raw strings: in a normal string literal "\s" is an invalid escape sequence,
    # which newer Python versions warn about.
    plain_text = re.sub(r'[^a-zA-Z0-9]', ' ', plain_text)
    return re.sub(r'\s+', ' ', plain_text)

# Clean special characters out of every document in place
df_prep['plain'] = df_prep['plain'].apply(remove_spec_char)

In [None]:
# Disabled Porter-stemming variant (kept as a bare string literal, so it does not run).
# While it stays disabled, no 'plain_stemmed' column is added to df_prep.
'''
# ========= Stemming =========
# For the Stemming we create a separate series which will append the last feature of the dataframe
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_words(plain_text):
    return ' '.join([ps.stem(word) for word in plain_text.split()])

df_prep['plain_stemmed'] = df_prep['plain'].apply(lambda x : stem_words(x))
'''


In [None]:
# ========= Lemmatization =========
# For the Lemmatization we create a separate series which will append the last feature of the dataframe
from nltk import pos_tag
from nltk.corpus import wordnet
# Download the 'wordnet' and 'omw-1.4' NLTK resources
# (presumably the data the WordNet lemmatizer needs -- TODO confirm)
for nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance shared by all calls
lemmatizer = WordNetLemmatizer()
# First letter of a POS tag -> WordNet part-of-speech constant
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)

def lemmatize_word(plain_text):
    """Lemmatize every whitespace-separated token of *plain_text*.

    The tokens are POS-tagged; the first letter of each tag selects the
    WordNet part of speech through ``wordnet_map`` (noun when the letter is
    not mapped). The lemmas are returned joined by single spaces.
    """
    lemmas = []
    for token, tag in pos_tag(plain_text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return ' '.join(lemmas)

# Store the lemmatized text in its own column next to the cleaned text
df_prep['plain_lemmatized'] = df_prep['plain'].apply(lemmatize_word)

In [None]:
# Preview the first five rows of the preprocessed frame
display(df_prep.head(5))

In [None]:
# Compare the normalised variants of the first document. Columns are addressed
# by name: with the stemming cell disabled the frame has no column at position
# 3 or 4, so the positional lookups raise an IndexError.
print('Stemmed')
if 'plain_stemmed' in df_prep.columns:
    print(df_prep['plain_stemmed'].iloc[0])
else:
    # 'plain_stemmed' is only created by the (currently disabled) stemming cell
    print('(no stemmed column available)')
print('\nLemmatized')
print(df_prep['plain_lemmatized'].iloc[0])


# __Finished preprocessing__
If the preprocessing fulfills the requirements for further operations and for building IR models,
export the finished dataframe as a csv or [__feather__](https://arrow.apache.org/docs/python/feather.html) file (a lightweight alternative to csv), which needs less computation power and CPU time than json or csv.<br></br>
The next task is to build the vector space model, starting with tf-idf / term weighting.

----------------------------------------------------------------
## __Writing in feather file format__
import pyarrow.feather as feather<br>
feather.write_feather(df, '/path/to/file')

## __Writing in csv file format__
DataFrame.to_csv('/path/to/file')<br></br>

If the finished, exported file is still too large, add it to the .gitignore file.<br>
GitHub for private (non-commercial) use does not allow overly large repositories.