# NLP Preprocessing Pipeline

In [None]:
# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# nltk download
# Fetch the NLTK resources this notebook relies on.
# NOTE(review): 'punkt' is presumably the tokenizer model behind word_tokenize;
# newer NLTK releases may require 'punkt_tab' instead — confirm against the
# installed version.
nltk.download('punkt') # token
# Word lists read via stopwords.words('english') in section 1.3.
nltk.download('stopwords')

import pandas as pd
import spacy
import re
import os

In [None]:
# Methods

# Display helper: prints a short preview of a list
def quick_view(lst, num, limit=10):
    """Print a header with the total count, then the first items of ``lst``.

    Args:
        lst: Sequence to preview; must support slicing.
        num: Total number of items, shown in the header line.
        limit: Maximum number of items to print (defaults to 10).
    """
    print(f'[*] Quick view(total {num}):')
    # Slice rather than index 0..9 so inputs shorter than ``limit`` do not
    # raise IndexError.
    for i, item in enumerate(lst[:limit], start=1):
        print(f'\t{i}.{item}')

### Data

In [None]:
# Input data
# Path of the raw training CSV; its directory is reused further down
# (via os.path.dirname(fi)) as the destination for the generated CSV files.
fi = 'NLP_Data/drugsComTrain_raw.csv'
df = pd.read_csv(fi)
# Bare expression: presumably relies on notebook display to show the DataFrame.
df

In [None]:
# Discard the columns that are not needed downstream, one at a time:
# first 'date', then 'uniqueID'
for unused_column in ('date', 'uniqueID'):
    df.drop(unused_column, axis=1, inplace=True)

# Remove every row that still contains a missing value
df.dropna(inplace=True)

# Summary of the remaining columns and dtypes
df.info()

## 1. Text

### 1.1. Text Lowercasing

In [None]:
# Lowercase every text column; other columns are reported and left untouched
for column, column_dtype in df.dtypes.items():
    # Only columns with the 'object' dtype are lowercased
    is_text_column = column_dtype == 'object'

    if is_text_column:
        print(f'[*] Lowercasing {column}...')
        lowered = df[column].str.lower()
        df[column] = lowered

    else:
        print(f'[*] {column_dtype} - {column}: PASS')

### 1.2. Noise Removal

In [None]:
# Replace every character that is not a letter, digit or whitespace with a space
# Output: clean_texts -> list()
pattern = re.compile(r'[^a-zA-Z0-9\s]')

# Iterate the column positionally. After dropna() the index labels have gaps,
# so using a label as the progress counter over-reports progress, and a
# per-row .loc lookup is both slow and ambiguous if labels repeat.
reviews = df['review']
reviews_num = len(reviews)

clean_texts = []
for pos, review in enumerate(reviews, start=1):
    print(f'\r[*] Progress: {round(round(pos/reviews_num, 3)*100, 2)}%', end='')
    clean_texts.append(pattern.sub(' ', review))

clean_texts_num = len(clean_texts)
print('\n[*] Done.')
quick_view(clean_texts, clean_texts_num)

### 1.3. Stopwords Removal

In [None]:
# Remove stopwords from clean_texts based on the nltk English stopword list
# Output: clean_texts -> list()

# Stop words table (from nltk)
stop_wds = stopwords.words('english')
# Set copy for O(1) membership tests; testing against the list would scan it
# once for every word of every review.
stop_wds_set = set(stop_wds)

tmp = []
for i, text in enumerate(clean_texts, start=1):
    print(f'\r[*] Progress: {round(round(i/clean_texts_num, 3)*100, 2)}%', end='')

    # split(' ') (not split()) is kept on purpose so the spacing of the
    # remaining words is unchanged.
    kept_words = [wd for wd in text.split(' ') if wd not in stop_wds_set]
    tmp.append(' '.join(kept_words))

# tmp is rebound right after, so no defensive copy is needed
clean_texts = tmp
tmp = []

clean_texts_num = len(clean_texts)
print('\n[*] Done.')
quick_view(clean_texts, clean_texts_num)

### 1.4. Entity Recognition

In [None]:
# Identify named entities in clean_texts and save them to entities_catalog.csv
# Output: entities_catalog -> dict() mapping entity label -> list of entity texts

# Entities table (from spacy)
recognition = spacy.load("en_core_web_sm")

entities_catalog = {}
# pipe() streams the texts through the model in batches and yields one Doc per
# input text, in order, which is faster than calling the model once per text.
for i, doc in enumerate(recognition.pipe(clean_texts), start=1):
    print(f'\r[*] Progress: {round(round(i/clean_texts_num, 3)*100, 2)}%', end='')

    for ent in doc.ents:
        entities_catalog.setdefault(ent.label_, []).append(ent.text)

entities_catalog_num = len(entities_catalog)
print('\n[*] Done.')
print(f'[*] Quick view(total {entities_catalog_num}):')
# Preview at most 10 labels; slicing avoids an IndexError when fewer than 10
# labels were found. The texts are previewed from the first element (the
# previous [1:10] slice silently skipped it).
preview_items = list(entities_catalog.items())[:10]
for i, (label, texts) in enumerate(preview_items, start=1):
    print(f'\t{i}.{label} - {texts[:10]}')

fo_path = os.path.join(os.path.dirname(fi), 'entities_catalog.csv')
print('[*] Generating entities_catalog.csv...')
# Materialize the dict views as lists so DataFrame receives plain sequences
entities_catalog_df = pd.DataFrame({
    'entity name': list(entities_catalog.keys()),
    'entity contents': list(entities_catalog.values()),
})
entities_catalog_df.to_csv(fo_path, index=False)

## 2. Token

### 2.1. Tokenization

In [None]:
# Tokenize every cleaned review with nltk's word_tokenize
# Output: tokens -> list() with one token list per review
tokens = []

for i, text in enumerate(clean_texts, start=1):
    print(f'\r[*] Progress: {round(round(i/clean_texts_num, 3)*100, 2)}%', end='')

    tokens.append(word_tokenize(text))

tokens_num = len(tokens)
print('\n[*] Done.')
# Use the shared preview helper like the other cells instead of a local copy
# of its loop.
quick_view(tokens, tokens_num)

### 2.2. Normalization

In [None]:
# Stem every token with nltk's PorterStemmer
# (the result keeps the name lemmatized_tokens, but this step is stemming)
# Output: lemmatized_tokens -> list()
stemmer = PorterStemmer()

tmp = []
for position, review_tokens in enumerate(tokens, start=1):
    print(f'\r[*] Progress: {round(round(position/tokens_num, 3)*100, 2)}%', end='')

    stems = list(map(stemmer.stem, review_tokens))
    tmp.append(stems)

# Hand over a copy of the scratch list, then reset the scratch list
lemmatized_tokens = list(tmp)
tmp = []

tokens_num = len(lemmatized_tokens)
print('\n[*] Done.')
quick_view(lemmatized_tokens, tokens_num)

### 2.3. Token Filtering

In [None]:
# Keep only the purely alphabetic tokens of each review
# Output: alpha_tokens -> list()
alpha_tokens = []
for position, stems in enumerate(lemmatized_tokens, start=1):
    print(f'\r[*] Progress: {round(round(position/tokens_num, 3)*100, 2)}%', end='')

    alphabetic_only = list(filter(str.isalpha, stems))
    alpha_tokens.append(alphabetic_only)

tokens_num = len(alpha_tokens)
print('\n[*] Done.')
quick_view(alpha_tokens, tokens_num)

## Output

In [None]:
# New data
# clean_texts -> list()    (reviews after noise and stopword removal)
# tokens -> list()         (word_tokenize output per review)
# alpha_tokens -> list()   (stemmed tokens, alphabetic only)

# Plain column assignment keeps this cell re-runnable: a second run overwrites
# the columns, whereas insert(..., allow_duplicates=True) appended a duplicate
# set of columns each time.
new_columns = {
    'clean_texts': clean_texts,
    'tokens': tokens,
    'alpha_tokens': alpha_tokens,
}
for column_name, values in new_columns.items():
    df[column_name] = values

# Write next to the input file as preprocessed_<input file name>
fo_path = os.path.join(os.path.dirname(fi), f'preprocessed_{os.path.basename(fi)}')
df.to_csv(fo_path, index=False)