# Task 2
Date: 05.04.2019<br>
Environment: Python 3.6.8 and Anaconda 4.6.7 (64-bit)<br>

In [1]:
from IPython.core.display import HTML
css = open('style/style-table.css').read() + open('style/style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

## 1. Introduction
>The main purpose of this report is to provide information about the methodology and process to solve this problem.<br>
This report focuses on two parts:
* Parsing pdf file to get pre-processed text. 
* The logics and implementation of exploring pre-processed text and generating features.

## 2.  Logic map

> The procedures implemented in the text pre-processing task are as follows:
<img src = "style/img_4_logicmap.png" height = "500" width = "500" style="float: left;">

## 3.  Import libraries 

In [3]:
import re
import ast
import nltk 
import itertools
import pandas as pd
import numpy as np
from itertools import chain
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.probability import *
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
from sklearn.feature_extraction.text import CountVectorizer

## 4. Read xml data and excel data
> Here, we use two methods to parse the pdf file:
>1. First method is to use **`Adobe Acrobat`** to export the pdf into `.xml` file.
>2. The second method is to use online tool **`Pdf to Excel`** to generate the `.csv`file.
>3. Then we use the parsed text data generated from different tools to verify the correctness.

In [4]:
# pre-processing the xml file
f = open('style/30086434.xml', 'r')
data_str = f.read()  
f.close()
data_str = re.sub('&quot;','"',data_str)
data_str = re.sub('&amp;','&',data_str)

# use </TR> as separator to split the file into 201 blocks and remove the first blank one
unit_block_tr = re.split(r'</TR>',data_str)
unit_doc_list_raw = unit_block_tr[1:]

# pre-processing the string from xml file
unit_doc_list = []
for each_doc in unit_doc_list_raw[:-1]:
    unit_doc_list.append(each_doc.lstrip('\n\n<TR>'))
unit_doc_list.append(unit_doc_list_raw[-1].lstrip('\n</Table>\n\n<'))

# generate three lists for data extracted from xml file
ucode_list = []
out_list = []
syn_list = []
for i,each_doc in enumerate(unit_doc_list):
    l = re.findall(r'(?<=>)(.*?)(?=<)',each_doc)
    ucode_list.append(l[0].rstrip())
    syn_list.append(l[1].rstrip())
    out_list.append(l[2])

In [5]:
# extract data from excel file
excel_data = pd.ExcelFile('style/30086434-converted.xlsx')
df = excel_data.parse('Table 1')
df.head(2)
# pre-processing the data based on the cross-check with data from xml file
ucode_excel= [i for i in df['Title']]
outc_excel = [i.replace('\n',' ').lstrip('[').rstrip(']') for i in df['Outcomes']]
df.loc[101,'Synopsis']='NA'
syn_excel = [str(i).replace('\n',' ') for i in df['Synopsis']]
print(f'# of unitcode: {len(ucode_excel)}\n'\
      f'# of synopsis: {len(syn_excel)}\n'\
      f'# of outcomes: {len(outc_excel)}')

# of unitcode: 200
# of synopsis: 200
# of outcomes: 200


## 5. Create the raw text dict
> After we have the separate list, we can then build our initial raw text dict.<br>
> **<font color=blue>Special consideration:</font>** 
    1. There are duplicate units in our dataset which means if we add them all sequentially, 
        the text for duplicate unit will be doubled.
    2. Each outcome has [ ] around which probably means a list containing elements instead of one str. 
**<font color=blue>Adjustment solution:</font>** 
    1. We skip the existed key if it's already in the dict.
    2. We still separate the synopsis and outcome as elements of a list at this stage.
       The dict at this stage looks like: {unitcode : [[synopsis],[outcomes]], ...}

In [6]:
unit_dict = {}
for i in range(200):
    if ucode_excel[i] not in unit_dict.keys():
        unit_dict[ucode_excel[i]]= []
        unit_dict[ucode_excel[i]].append(syn_excel[i])
        unit_dict[ucode_excel[i]].append(outc_excel[i])
    else:
        print(f'duplicate unit: {ucode_excel[i]}') # print duplicate units in case of further processing
print(f'# of units in dict: {len(unit_dict)}')

duplicate unit: APG5666
duplicate unit: ATS1339
duplicate unit: ATS2354
# of units in dict: 197


In [7]:
# double check if each outcome string is around with [ and ]
count = 0
for k,v in unit_dict.items():
    if v[1].startswith('[') and v[1].endswith(']'):
        count+=1
print(f'# of outcomes with []: {count}')

# of outcomes with []: 0


## 6. Text Pre-processing
### 6.1 Sentence Segmentation,  Tokenization and Case Normalization
> In order to generate our initial vocab, we will implement the following sub-tasks: <br>
    1. Sentence segmentation
    2. Tokenization
    3. Case Normalization
> **<font color=blue>Special consideration:</font>** <br>
> The bracket around outcomes need to be handled properly.<br>
> * If we apply sent_tokenize directly on the outcome string, most of the time, it will return the whole outcome string instead of sentences in that brackets.<br>
> * i.e. We expect that the return result of `['This is sentence A;','Another sentence B;']` should be `This is sentence A` and `Another sentence B` instead of a whole string `This is sentence A;','Another sentence B`.<br> 
> * As in the second senario, we will miss the case normalization for token **`Another`**. We might end up with **`Another`** and **`another`** appearing in the same text.<br>

> **<font color=blue>Potential Solution:</font>** <br>
> We can use **ast.literal_eval** function to transform the outcomes into meaningful sentence elements.

In [8]:
def sen_Seg_Norm_Token(paragraph_str):
    tokens_list = []
    tokenizer = RegexpTokenizer(r"\w+(?:[-']\w+)?", gaps=False)
    # Sentence segmentation for the paragraph
    for sen in sent_tokenize(paragraph_str):
        sen_tokens = tokenizer.tokenize(sen)
        # for each token in the sentence, normalize to lower case.
        if sen_tokens:
            sen_tokens[0] = sen_tokens[0].lower()
        tokens_list.extend(sen_tokens)
    return tokens_list

In [9]:
unit_tokens_raw = {}
for u_code,u_info_list in unit_dict.items():
    # combine the Synopsis and Outcomes
    unit_info=u_info_list[0]+' '+u_info_list[1]
    # pass whole str into tokenization function
    raw_tokens = sen_Seg_Norm_Token(unit_info)
    unit_tokens_raw[u_code]=raw_tokens

> Overview of our initial vocab bag

In [10]:
corpus_tokens = list(chain.from_iterable(unit_tokens_raw.values()))
print (f'Vocabulary size : {len(set(corpus_tokens))}'\
       f'\nTotal # of tokens: {len(corpus_tokens)}'\
       f'\nLexical diversity: {len(corpus_tokens)/len(set(corpus_tokens))}')

Vocabulary size : 4442
Total # of tokens: 31275
Lexical diversity: 7.040747411076092


### 6.2 Generate the 200 bigram collocations
> * Now we use our tokens generated to generate the 1st 200 meaningful bigrams collocations.<br>
> * We've already used the chain.frome_iterable function to concatenate all the tokens. <br>
    The returned list  `corpus_tokens ` contains a list of all the words seprated by while space.<br>
> * **PMI scores** are used to find the best 200 bigrams.

In [11]:
## using pmi measure to find 200 meaning ful bigrams
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(corpus_tokens)
bigram_200 = bigram_finder.nbest(bigram_measures.pmi, 200)
bigram_200_list = [f'{a}_{b}' for a,b in bigram_200]

### 6.3 Re-tokenization

> * Now it's time to replace the corresponding unigrams by the bigram collocations generated. 
> * We can use the **MWETokenizer** to re-tokenize the sentence with multi-word expressions

In [12]:
mwe_tokenizer = MWETokenizer(bigram_200)
unit_tokens_mwe = {u_code:mwe_tokenizer.tokenize(tokens) for u_code,tokens in unit_tokens_raw.items()}

In [13]:
corpus_tokens_mwe = list(chain.from_iterable(unit_tokens_mwe.values()))
print (f'Vocabulary size after retokenization : {len(set(corpus_tokens_mwe))}'\
       f'\nTotal # of tokens after retokenization: {len(corpus_tokens_mwe)}'\
       f'\nLexical diversity: {len(corpus_tokens_mwe)/len(set(corpus_tokens_mwe))}')

Vocabulary size after retokenization : 4279
Total # of tokens after retokenization: 31112
Lexical diversity: 7.270857677027343


### 6.4 Removing 'Bad Features' ( i.e. stop words, the most and less frequent words)

> **<font color=blue>Stop Words Removal</font>**
> * Define a **filter function** for easily filter our token list. 
> * Define and remove the context-independent function words from each token list.

In [14]:
def token_Filter(token_list, undesired_list):
    return [w for w in token_list if w not in undesired_list]

In [15]:
stopwords = []
with open('style/stopwords_en.txt') as f:
    stopwords = f.read().splitlines()
# get rid of the stopwords and construct the new dict 
unit_tokens_Sw = {u_code:token_Filter(tokens, stopwords) for u_code,tokens in unit_tokens_mwe.items()}

In [16]:
corpus_tokens_Sw = list(chain.from_iterable(unit_tokens_Sw.values()))
print (f'Vocab size after removing stopwords : {len(set(corpus_tokens_Sw))}'\
       f'\nTotal # of tokens after after removing stopwords: {len(corpus_tokens_Sw)}'\
       f'\nLexical diversity: {len(corpus_tokens_Sw)/len(set(corpus_tokens_Sw))}')

Vocab size after removing stopwords : 4045
Total # of tokens after after removing stopwords: 18673
Lexical diversity: 4.616316440049443


> **<font color=blue>Frequent and Rare words Removal based on Document Frequency</font>**
> * We put all the tokens in a list using **chain.from_iterable** and past it to **FreqDist**.
> * The set makes sure that each word in an article appears only once, thus the total number of 
    times a word appears in all the sets is equal to the number of documents containing that word.
> * Then we filter out the bad context-dependent according to threshould

In [17]:
words = list(chain.from_iterable([set(token) for token in unit_tokens_Sw.values()]))
doc_freq = FreqDist(words)

In [18]:
doc_freq.most_common()

[('unit', 155),
 ('students', 107),
 ('skills', 84),
 ('understanding', 75),
 ('analyse', 73),
 ('apply', 72),
 ('knowledge', 64),
 ('develop', 62),
 ('critically', 60),
 ('evaluate', 60),
 ('research', 58),
 ('issues', 56),
 ('development', 55),
 ('including', 55),
 ('practice', 54),
 ('identify', 53),
 ('analysis', 46),
 ('demonstrate', 46),
 ('work', 44),
 ('principles', 43),
 ('range', 42),
 ('key', 42),
 ('design', 41),
 ('techniques', 41),
 ('topics', 41),
 ('concepts', 41),
 ('methods', 40),
 ('professional', 40),
 ('management', 39),
 ('problems', 38),
 ('contemporary', 37),
 ('critical', 37),
 ('describe', 36),
 ('social', 36),
 ('ability', 36),
 ('health', 35),
 ('role', 34),
 ('strategies', 34),
 ('communication', 34),
 ('project', 33),
 ('systems', 33),
 ('relevant', 33),
 ('discuss', 33),
 ('theories', 32),
 ('theoretical', 32),
 ('explain', 32),
 ('understand', 32),
 ('environment', 32),
 ('major', 31),
 ('effectively', 31),
 ('studies', 31),
 ('data', 31),
 ('information

In [19]:
df_high = 200*0.95
df_low = 200*0.05
undesired_df_token = [w for w,df in doc_freq.most_common() if df < df_low or df > df_high]
print(f'# of undesired tokens based on DF: {len(undesired_df_token)}')

# of undesired tokens based on DF: 3770


In [20]:
unit_tokens_Df = {u_code:token_Filter(tokens, undesired_df_token) for u_code,tokens in unit_tokens_Sw.items()}

In [21]:
corpus_tokens_Df = list(chain.from_iterable(unit_tokens_Df.values()))
print (f'Vocabulary size with accept df : {len(set(corpus_tokens_Df))}'\
       f'\nTotal # of tokens with accept df : {len(corpus_tokens_Df)}'\
       f'\nLexical diversity: {len(corpus_tokens_Df)/len(set(corpus_tokens_Df))}')

Vocabulary size with accept df : 275
Total # of tokens with accept df : 8681
Lexical diversity: 31.567272727272726


> **<font color=blue>Too-short Tokens Removal </font>**
> * We only keep the tokens whose length is equal to or greater than 3.

In [22]:
unit_tokens_gt3 = {u_code: [w for w in tokens if len(w)>=3] for u_code,tokens in unit_tokens_Df.items()}

In [23]:
corpus_tokens_gt3 = list(chain.from_iterable(unit_tokens_gt3.values()))
print (f'Vocabulary size with accept length : {len(set(corpus_tokens_gt3))}'\
       f'\nTotal # of tokens with accept length: {len(corpus_tokens_gt3)}'\
       f'\nLexical diversity: {len(corpus_tokens_gt3)/len(set(corpus_tokens_gt3))}')

Vocabulary size with accept length : 273
Total # of tokens with accept length: 8659
Lexical diversity: 31.71794871794872


### 6.5 Stemmer porter to stemming
> * In the final stage, we use **PorterStemmer** for stemming.

In [24]:
stemmer = PorterStemmer()
unit_tokens_stemmed = {u_code: [stemmer.stem(w) if w[0].islower() else w for w in tokens] \
                       for u_code,tokens in unit_tokens_gt3.items()} 

In [25]:
corpus_tokens_final = list(chain.from_iterable(unit_tokens_stemmed.values()))
print (f'Final vocabulary size: {len(set(corpus_tokens_final))}'\
       f'\nTotal # of tokens: {len(corpus_tokens_final)}'\
       f'\nLexical diversity: {len(corpus_tokens_final)/len(set(corpus_tokens_final))}')

Final vocabulary size: 217
Total # of tokens: 8659
Lexical diversity: 39.903225806451616


In [26]:
# double check the length of final tokens
if not [w for w in corpus_tokens_final if len(w)<3]:
    print('All tokens length is greater than 3.')

All tokens length is greater than 3.


## 7. Write vocab.txt  and countVec.txt
> * First, we can write our corpus vocab into `vocab.txt` file.

In [27]:
corpus_tokens_sorted = sorted(set(corpus_tokens_final))

In [114]:
my_id = '30086434'
with open(f'{my_id}_vocab.txt', 'w') as f:
    for i,token in enumerate(corpus_tokens_sorted):
        line = f'{token}:{i}\n'
        f.write(line)

> * Then, we apply **CountVectorizer** to generate the `countVec.txt` file.

In [115]:
u_codes = []
tokens = []
for u_code, token in unit_tokens_stemmed.items():
    u_codes.append(u_code)
    txt = ' '.join(token)
    tokens.append(txt)

In [116]:
count_vectorizer = CountVectorizer( analyzer = 'word', lowercase=False)
count_vectors = count_vectorizer.fit_transform(tokens)
count_vectors.shape

(197, 217)

In [117]:
vocab = count_vectorizer.get_feature_names()
cx = count_vectors.tocoo() # return the coordinate representation of a sparse matrix
dic_write = {u_codes[i]:[] for i in range(197)}
for i,j,v in itertools.zip_longest(cx.row, cx.col, cx.data):
    dic_write[u_codes[i]].append((corpus_tokens_sorted.index(vocab[j]),(v)))
for v in dic_write.values():
    v.sort(key=lambda k: k[0])

In [118]:
with open(f'{my_id}countVec.txt', 'w') as f:
    for k,v in dic_write.items():
        line = f'{k},'
        for token_pair in v:
            line += f'{token_pair[0]}:{token_pair[1]},'
        line += "\n"
        f.write(line)

## 8. Conclusion
> What we have done in this task:
1. We create a **logic map** to handle the pre-processing problems, break them into component parts.
2. First, the pdf file is parsed to get the raw text data.
3. Then, We use **NLTK** to play with human language data:

> * Sentence segmentation is applied for the purpose of normalization;  
> * We tokenize data by words;
> * The bigrams are identified and followed by retokenization;
> * We remove bad features based on stopwords and document frequency;
> * Finally, stemming is used for reducing inflected words to their word stem.
     
> After all the steps above, we've prepared our raw text data for topic analysis or sentiment analysis.