# Data Preprocsesing

In [1]:
import json

In [2]:
'''
with open('data/trec_corpus_20220301_plain.json') as read_from_file:
    with open('data/preprocessed_corputs.csv', 'a') as write_to_file:
        
        # write header of csv file
        write_to_file.write('id, text')
        doc = read_from_file.readline()

        for i in range(5): # while doc:
            
            # parse json
            doc_parsed = json.loads(doc)
            doc_text = doc_parsed['title'] + ' ' + doc_parsed['plain']
            
            print(doc_parsed)
            print('\n\n############\n\n')

            # do preprocessing
            preprocessed_text = 'test test test test test'

            # write preprocessed document to file (append)
            write_to_file.write(f'{doc_id}, {preprocessed_text}\n')

            # read next line
            doc = read_from_file.readline()
'''

"\nwith open('data/trec_corpus_20220301_plain.json') as read_from_file:\n    with open('data/preprocessed_corputs.csv', 'a') as write_to_file:\n        \n        # write header of csv file\n        write_to_file.write('id, text')\n        doc = read_from_file.readline()\n\n        for i in range(5): # while doc:\n            \n            # parse json\n            doc_parsed = json.loads(doc)\n            doc_text = doc_parsed['title'] + ' ' + doc_parsed['plain']\n            \n            print(doc_parsed)\n            print('\n\n############\n\n')\n\n            # do preprocessing\n            preprocessed_text = 'test test test test test'\n\n            # write preprocessed document to file (append)\n            write_to_file.write(f'{doc_id}, {preprocessed_text}\n')\n\n            # read next line\n            doc = read_from_file.readline()\n"

# Approach of Abda
- Loading data
- Preprocessing first 5 lines to save computation
- Appending preprecessed data into a dictionary / dataframe 
- Saving finished data after preprocessing into csv or feather file for continuous analysis and evaluation



In [3]:
# Loading necessary libraries
import pandas as pd
import nltk

In [4]:
# Loading dataset -> Only first five rows
df_prep = pd.read_json('data/trec_corpus_20220301_plain.json', lines=True, nrows = 100)

# Dropping url-column
df_prep.drop(columns=['url'], inplace=True, axis=1)
display(df_prep.head())

# Looking for missing / empty values
print('\nCheck first stats of first 10 rows')
print('------------------------------------')
print(df_prep.isna().sum().sort_values)

Unnamed: 0,id,title,plain
0,12148915,Keith Osik,"Keith Richard Osik (born October 22, 1968), is..."
1,16752449,"Swansons Landing, Texas",Swansons Landing is a settlement and former in...
2,31967453,Mike Potts,Mike or Michael Potts may refer to:\n Michael ...
3,47436994,Shuker,Shuker is a surname. Notable people with the s...
4,13924699,William Clark (inventor),William Clark (17 March 1821 – 22 January 1880...



Check first stats of first 10 rows
------------------------------------
<bound method Series.sort_values of id       0
title    0
plain    0
dtype: int64>


## __Cleaning and preprocessing textual data__
- Converting to lower case
- How to handle missing data (No necessaty...?)
- Removing punctuations
- Removing stopwords
- Tokenizing data (especially last column with plain text)
- Normalizing data (Stemming / Lemmatization)



In [None]:
# ========= Concatenate Title with Plain-Text =========
df_prep = df_prep.concat(['title', 'plain'])
df_prep.columns


In [5]:
# ========= Lower case whole column =========
df_prep['plain'] = df_prep['plain'].str.lower()
df_prep.sample(frac = 1).head()

Unnamed: 0,id,title,plain
50,52693423,Phidyle,phidyle is a genus of south american anyphaeni...
95,9553075,VMPS,vmps may refer to:\n vivekanand memorial publi...
82,55084366,1899 Sheriff of London Charity Shield,the 1899 sheriff of london charity shield was ...
58,41934267,Christian-Social People's Party (Liechtenstein),"the christian-social people's party (), often ..."
30,17696798,1969 VFL Grand Final,the 1969 vfl grand final was an australian rul...


In [15]:
df_prep['title'].sample(frac = 1).head()

42       Roman Catholic Archdiocese of Cape Coast
49                                 Paul Devautour
77    Yugoslavia women's national basketball team
3                                          Shuker
24                                        Guignen
Name: title, dtype: object

In [14]:
# ========= Tokenization =========
from nltk import word_tokenize
test_doc = df_prep.iloc[0,2]
test_doc_tokens = word_tokenize(test_doc)
print(test_doc_tokens)

def tokenize_words(plain_text):
    tokenized_text = word_tokenize(plain_text)
    return tokenized_text

df_prep_tokenized = df_prep['plain'].apply(lambda x: tokenize_words(x))


['keith', 'richard', 'osik', 'born', 'october', '22', '1968', 'former', 'major', 'league', 'baseball', 'catcher', 'played', 'major', 'leagues', 'played', 'milwaukee', 'brewers', 'pittsburgh', 'pirates', 'baltimore', 'orioles', 'washington', 'nationals', 'drafted', '24th', 'round', 'mlb', 'draft', 'brother', 'also', 'professional', 'baseball', 'player', 'played', 'minors', 'born', 'port', 'washington', 'new', 'york', 'lives', 'shoreham', 'new', 'york', 'osik', 'currently', 'head', 'baseball', 'coach', 'farmingdale', 'state', 'college', 'division', 'iii', 'institution', 'located', 'long', 'island', 'new', 'york', 'inducted', 'suffolk', 'sports', 'hall', 'fame', 'long', 'island', 'baseball', 'category', 'class', '2008', 'external', 'links', '1968', 'births', 'living', 'people', 'major', 'league', 'baseball', 'catchers', 'baseball', 'players', 'new', 'york', 'state', 'people', 'port', 'washington', 'new', 'york', 'milwaukee', 'brewers', 'players', 'pittsburgh', 'pirates', 'players', 'balti

In [6]:
# ========= Removing punctuations =========
import string
def remove_punctuations(plain_text):
    punctiations = string.punctuation
    return plain_text.translate(str.maketrans('', '', punctiations))

df_prep['plain'] = df_prep['plain'].apply(lambda x : remove_punctuations(x))

In [7]:
# ========= Removing stopwords =========
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(plain_text):
    return ' '.join([word for word in plain_text.split() if word not in STOPWORDS])

df_prep['plain'] = df_prep['plain'].apply(lambda x: remove_stopwords(x))



In [8]:
# ========= Removing special characters =========
import re

def remove_spec_char(plain_text):
    plain_text = re.sub('[^a-zA-Z0-9]', ' ', plain_text)
    plain_text = re.sub('\s+', ' ', plain_text)
    return plain_text

df_prep['plain'] = df_prep['plain'].apply(lambda x : remove_spec_char(x))

In [9]:
'''
# ========= Stemming =========
# For the Stemming we create a separate series which will append the last feature of the dataframe
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_words(plain_text):
    return ' '.join([ps.stem(word) for word in plain_text.split()])

df_prep['plain_stemmed'] = df_prep['plain'].apply(lambda x : stem_words(x))
'''


In [10]:
# ========= Lemmatization =========
# For the Lemmatization we create a separate series which will append the last feature of the dataframe
from nltk import pos_tag
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {'N':wordnet.NOUN, 'V':wordnet.VERB, 'J':wordnet.ADJ, 'R':wordnet.ADV}

def lemmatize_word(plain_text):
    # Finind pos tags
    pos_text = pos_tag(plain_text.split())
    return ' '.join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

df_prep['plain_lemmatized'] = df_prep['plain'].apply(lambda x : lemmatize_word(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abderrahmanecharrade/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abderrahmanecharrade/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
display(df_prep.head())

Unnamed: 0,id,title,plain,plain_stemmed,plain_lemmatized
0,12148915,Keith Osik,keith richard osik born october 22 1968 former...,keith richard osik born octob 22 1968 former m...,keith richard osik born october 22 1968 former...
1,16752449,"Swansons Landing, Texas",swansons landing settlement former inland port...,swanson land settlement former inland port har...,swanson land settlement former inland port har...
2,31967453,Mike Potts,mike michael potts may refer michael potts act...,mike michael pott may refer michael pott actor...,mike michael potts may refer michael potts act...
3,47436994,Shuker,shuker surname notable people surname include ...,shuker surnam notabl peopl surnam includ abrah...,shuker surname notable people surname include ...
4,13924699,William Clark (inventor),william clark 17 march 1821 22 january 1880 en...,william clark 17 march 1821 22 januari 1880 en...,william clark 17 march 1821 22 january 1880 en...


In [12]:
print('Stemmed')
print(df_prep.iloc[0,3])
print('\nLemmatized')
print(df_prep.iloc[0,4])


Stemmed
keith richard osik born octob 22 1968 former major leagu basebal catcher play major leagu play milwauke brewer pittsburgh pirat baltimor oriol washington nation draft 24th round mlb draft brother also profession basebal player play minor born port washington new york live shoreham new york osik current head basebal coach farmingdal state colleg divis iii institut locat long island new york induct suffolk sport hall fame long island basebal categori class 2008 extern link 1968 birth live peopl major leagu basebal catcher basebal player new york state peopl port washington new york milwauke brewer player pittsburgh pirat player baltimor oriol player washington nation player buffalo bison minor leagu player nashvil sound player durham bull player albuquerqu isotop player new orlean zephyr player farmingdal state ram basebal coach

Lemmatized
keith richard osik born october 22 1968 former major league baseball catcher play major league play milwaukee brewer pittsburgh pirate baltim

# __Finished preprocessing__
If the preprocessing fulfills the requirements for further operations and building IR-Models,
export finished dataframe as csv or [__feather__]('https://arrow.apache.org/docs/python/feather.html') file format (light-weighted option to csv) which saves in comparison to json and csv more computation power and cpu time<br></br>
Next task is to build up the vector space model and starting with tf-idf / term-weighting

----------------------------------------------------------------
## __Writing in feather file format__
import pyarrow.feather as feather<br>
feather.write_feather(df, '/path/to/file')

## __Writing in csv file format__
DataFrame.to_csv('/path/to/file')<br></br>

If the size of the finished and exported file is still too large, add it to .gitignorefile<br>
Github for private use (non-commercial) does not allows too large repositories