# NLP Preprocessing Pipeline

In [15]:
# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# nltk download
nltk.download('punkt') # token
nltk.download('stopwords')

import pandas as pd
import spacy
import re
import os

[nltk_data] Downloading package punkt to /Users/countzero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/countzero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Methods
def quick_view(lst, num):
    print(f'[*] Quick view(total {num}):')
    for i in range(10):
        print(f'\t{i + 1}.{lst[i]}')

### Data

In [3]:
# Input data
fi = 'drugsComTrain_raw.csv'
df = pd.read_csv(fi)
df

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37
...,...,...,...,...,...,...,...
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10,31-May-15,125
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1,1-Nov-11,34
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2,15-Mar-14,35
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10,19-Sep-15,79


In [4]:
# Drop all na
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160398 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uniqueID     160398 non-null  int64 
 1   drugName     160398 non-null  object
 2   condition    160398 non-null  object
 3   review       160398 non-null  object
 4   rating       160398 non-null  int64 
 5   date         160398 non-null  object
 6   usefulCount  160398 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 9.8+ MB


## 1. Text

### 1.1. Text Lowercasing

In [5]:
# Lowercasing text
for column in df.columns:
    if df[column].dtype == 'object':
        print(f'[*] Lowercasing {column}...')
        df[column] = df[column].str.lower()
    
    else:
        print(f'[*] {df[column].dtype} - {column}: PASS')

[*] int64 - uniqueID: PASS
[*] Lowercasing drugName...
[*] Lowercasing condition...
[*] Lowercasing review...
[*] int64 - rating: PASS
[*] Lowercasing date...
[*] int64 - usefulCount: PASS


### 1.2. Noise Removal

In [6]:
# Remove everything except alphabetic and number
# Output: clean_texts -> list()
pattern = re.compile(r'[^a-zA-Z0-9\s]')

clean_texts = []
for i in df.index:
    print(f'\r[*] Progress: {round(round((i + 1)/df.shape[0], 3)*100, 2)}%', end='')
    clean_text = re.sub(pattern, ' ', df.loc[i, 'review'])
    clean_texts.append(clean_text)

clean_texts_num = len(clean_texts)
print('\n[*] Done.')
quick_view(clean_texts, clean_texts_num)

[*] Progress: 100.6%
[*] Done.
[*] Quick view(total 160398):
	1. it has no side effect  i take it in combination of bystolic 5 mg and fish oil 
	2. my son is halfway through his fourth week of intuniv  we became concerned when he began this last week  when he started taking the highest dose he will be on  for two days  he could hardly get out of bed  was very cranky  and slept for nearly 8 hours on a drive home from school vacation  very unusual for him   i called his doctor on monday morning and she said to stick it out a few days  see how he did at school  and with getting up in the morning  the last two days have been problem free  he is much more agreeable than ever  he is less emotional  a good thing   less cranky  he is remembering all the things he should  overall his behavior is better  
we have tried many different medications and so far this is the most effective  
	3. i used to take another oral contraceptive  which had 21 pill cycle  and was very happy  very light periods  

### 1.3. Stopwords Removal

In [7]:
# Remove stopwords from clean_texts base on nltk library
# Output: clean_texts -> list()
stop_wds = stopwords.words('english')

tmp = []
for i in range(clean_texts_num):
    print(f'\r[*] Progress: {round(round((i + 1)/clean_texts_num, 3)*100, 2)}%', end='')
    clean_text = ' '.join([wd for wd in clean_texts[i].split(' ') if wd not in stop_wds])
    tmp.append(clean_text)

clean_texts = tmp[:]
tmp = []

clean_texts_num = len(clean_texts)
print('\n[*] Done.')
print(f'[*] Quick view(total {clean_texts_num}):')
for i in range(10):
    print(f'\t{i + 1}.{clean_texts[i]}')

[*] Progress: 100.0%
[*] Done.
[*] Quick view(total 160398):
	1. side effect  take combination bystolic 5 mg fish oil 
	2. son halfway fourth week intuniv  became concerned began last week  started taking highest dose  two days  could hardly get bed  cranky  slept nearly 8 hours drive home school vacation  unusual   called doctor monday morning said stick days  see school  getting morning  last two days problem free  much agreeable ever  less emotional  good thing   less cranky  remembering things  overall behavior better  
we tried many different medications far effective  
	3. used take another oral contraceptive  21 pill cycle  happy  light periods  max 5 days  side effects  contained hormone gestodene  available us  switched lybrel  ingredients similar  pills ended  started lybrel immediately  first day period  instructions said  period lasted two weeks  taking second pack  two weeks   third pack things got even worse  third period lasted two weeks  039 end third week  still daily 

### 1.4. Entity Recognition

In [21]:
# Identifying important entities
# Output: entities_catalog -> dict()
recognition = spacy.load("en_core_web_sm")

entities_catalog = {}
for i in range(clean_texts_num):
    print(f'\r[*] Progress: {round(round((i + 1)/clean_texts_num, 3)*100, 2)}%', end='')
    doc = recognition(clean_texts[i])
    for ent in doc.ents:
        entities_catalog.setdefault(ent.label_, [])
        entities_catalog[ent.label_].append(ent.text)

entities_catalog_num = len(entities_catalog.keys())
print('\n[*] Done.')
print(f'[*] Quick view(total {entities_catalog_num}):')
for i in range(10):
    curr_k = entities_catalog.keys()[i]
    print(f'\t{i + 1}.{curr_k} - {entities_catalog[curr_k][1:10]}')

fo_path = os.path.join(os.path.dirname(fi), 'entities_catalog.csv')
print(f'[*] Generating entities_catalog.csv...')
entities_catalog_df = pd.DataFrame({'entity name': entities_catalog.keys(), 'entity contents': entities_catalog.values()})
entities_catalog_df.to_csv(fo_path, index=False)

[*] Progress: 69.4%

KeyboardInterrupt: 

## 2. Token

### 2.1. Tokenization

In [8]:
# Tokenize data
# Output: tokens -> list()
tokens = []

for i in range(clean_texts_num):
    print(f'\r[*] Progress: {round(round((i + 1)/clean_texts_num, 3)*100, 2)}%', end='')
    token_wd = word_tokenize(clean_texts[i])
    tokens.append(token_wd)

tokens_num = len(tokens)
print('\n[*] Done.')
print(f'[*] Quick view(total {tokens_num}):')
for i in range(10):
    print(f'\t{i + 1}.{tokens[i]}')

[*] Progress: 100.0%
[*] Done.
[*] Quick view(total 160398):
	1.['side', 'effect', 'take', 'combination', 'bystolic', '5', 'mg', 'fish', 'oil']
	2.['son', 'halfway', 'fourth', 'week', 'intuniv', 'became', 'concerned', 'began', 'last', 'week', 'started', 'taking', 'highest', 'dose', 'two', 'days', 'could', 'hardly', 'get', 'bed', 'cranky', 'slept', 'nearly', '8', 'hours', 'drive', 'home', 'school', 'vacation', 'unusual', 'called', 'doctor', 'monday', 'morning', 'said', 'stick', 'days', 'see', 'school', 'getting', 'morning', 'last', 'two', 'days', 'problem', 'free', 'much', 'agreeable', 'ever', 'less', 'emotional', 'good', 'thing', 'less', 'cranky', 'remembering', 'things', 'overall', 'behavior', 'better', 'we', 'tried', 'many', 'different', 'medications', 'far', 'effective']
	3.['used', 'take', 'another', 'oral', 'contraceptive', '21', 'pill', 'cycle', 'happy', 'light', 'periods', 'max', '5', 'days', 'side', 'effects', 'contained', 'hormone', 'gestodene', 'available', 'us', 'switched', 

### 2.2. Normalization

In [9]:
# Normalization with nltk
# Output: tokens -> list()
stemmer = PorterStemmer()

tmp = []
for i in range(tokens_num):
    print(f'\r[*] Progress: {round(round((i + 1)/tokens_num, 3)*100, 2)}%', end='')
    lemmatized_token = [stemmer.stem(wd) for wd in tokens[i]]
    tmp.append(lemmatized_token)

lemmatized_tokens = tmp[:]
tmp = []

tokens_num = len(lemmatized_tokens)
print('\n[*] Done.')
quick_view(lemmatized_tokens, tokens_num)

[*] Progress: 100.0%
[*] Done.
[*] Quick view(total 160398):
	1.['side', 'effect', 'take', 'combin', 'bystol', '5', 'mg', 'fish', 'oil']
	2.['son', 'halfway', 'fourth', 'week', 'intuniv', 'becam', 'concern', 'began', 'last', 'week', 'start', 'take', 'highest', 'dose', 'two', 'day', 'could', 'hardli', 'get', 'bed', 'cranki', 'slept', 'nearli', '8', 'hour', 'drive', 'home', 'school', 'vacat', 'unusu', 'call', 'doctor', 'monday', 'morn', 'said', 'stick', 'day', 'see', 'school', 'get', 'morn', 'last', 'two', 'day', 'problem', 'free', 'much', 'agreeabl', 'ever', 'less', 'emot', 'good', 'thing', 'less', 'cranki', 'rememb', 'thing', 'overal', 'behavior', 'better', 'we', 'tri', 'mani', 'differ', 'medic', 'far', 'effect']
	3.['use', 'take', 'anoth', 'oral', 'contracept', '21', 'pill', 'cycl', 'happi', 'light', 'period', 'max', '5', 'day', 'side', 'effect', 'contain', 'hormon', 'gestoden', 'avail', 'us', 'switch', 'lybrel', 'ingredi', 'similar', 'pill', 'end', 'start', 'lybrel', 'immedi', 'first

### 2.3. Token Filtering

In [10]:
# Filter all non-alphabetic word
# Output: alpha_tokens -> list()
alpha_tokens = []
for i in range(tokens_num):
    print(f'\r[*] Progress: {round(round((i + 1)/tokens_num, 3)*100, 2)}%', end='')
    alpha_tokens.append([wd for wd in lemmatized_tokens[i] if wd.isalpha()])

tokens_num = len(alpha_tokens)
print('\n[*] Done.')
quick_view(alpha_tokens, tokens_num)

[*] Progress: 100.0%
[*] Done.
[*] Quick view(total 160398):
	1.['side', 'effect', 'take', 'combin', 'bystol', 'mg', 'fish', 'oil']
	2.['son', 'halfway', 'fourth', 'week', 'intuniv', 'becam', 'concern', 'began', 'last', 'week', 'start', 'take', 'highest', 'dose', 'two', 'day', 'could', 'hardli', 'get', 'bed', 'cranki', 'slept', 'nearli', 'hour', 'drive', 'home', 'school', 'vacat', 'unusu', 'call', 'doctor', 'monday', 'morn', 'said', 'stick', 'day', 'see', 'school', 'get', 'morn', 'last', 'two', 'day', 'problem', 'free', 'much', 'agreeabl', 'ever', 'less', 'emot', 'good', 'thing', 'less', 'cranki', 'rememb', 'thing', 'overal', 'behavior', 'better', 'we', 'tri', 'mani', 'differ', 'medic', 'far', 'effect']
	3.['use', 'take', 'anoth', 'oral', 'contracept', 'pill', 'cycl', 'happi', 'light', 'period', 'max', 'day', 'side', 'effect', 'contain', 'hormon', 'gestoden', 'avail', 'us', 'switch', 'lybrel', 'ingredi', 'similar', 'pill', 'end', 'start', 'lybrel', 'immedi', 'first', 'day', 'period', '

## Output

In [11]:
# New data
# clean_texts -> list()
# tokens -> list()
# alpha_tokens -> list()

df.insert(df.shape[1], 'clean_texts', clean_texts, allow_duplicates=True)
df.insert(df.shape[1], 'tokens', tokens, allow_duplicates=True)
df.insert(df.shape[1], 'alpha_tokens', alpha_tokens, allow_duplicates=True)

fo_path = os.path.join(os.path.dirname(fi), f'preprocessed_{os.path.basename(fi)}')
df.to_csv(fo_path, index=False)