# Data preprocessing

## Ohsumed collection
It includes medical abstracts from the MeSH categories of 
the year 1991. In [Joachims, 1997], the first 20,000 documents were used, divided
 into 10,000 for training and 10,000 for testing. The specific task was to categorize 
the 23 cardiovascular diseases categories. After selecting this category 
subset, the number of unique abstracts becomes 13,929 (6,286 for training and 
7,643 for testing). As current computers can easily manage a larger number of 
documents, we make available all 34,389 cardiovascular diseases abstracts 
out of the 50,216 medical abstracts contained in the year 1991.

"Bacterial Infections and Mycoses                      C01
Virus Diseases                                        C02
Parasitic Diseases                                    C03
Neoplasms                                             C04
Musculoskeletal Diseases                              C05
Digestive System Diseases                             C06
Stomatognathic Diseases                               C07
Respiratory Tract Diseases                            C08
Otorhinolaryngologic Diseases                         C09
Nervous System Diseases                               C10
Eye Diseases                                          C11
Urologic and Male Genital Diseases                    C12
Female Genital Diseases and Pregnancy Complications   C13
Cardiovascular Diseases                               C14
Hemic and Lymphatic Diseases                          C15
Neonatal Diseases and Abnormalities                   C16
Skin and Connective Tissue Diseases                   C17
Nutritional and Metabolic Diseases                    C18
Endocrine Diseases                                    C19
Immunologic Diseases                                  C20
Disorders of Environmental Origin                     C21
Animal Diseases                                       C22
Pathological Conditions, Signs and Symptoms           C23"

In [1]:
import os
import pandas as pd

### ohsumed-file-category

In [None]:
# Build a one-hot "file name -> categories" table from the directory layout
# ./data-original/<category>/<file>. A file that appears under several
# category folders gets a 1 in each corresponding column.

# category_list = sorted([name for name in os.listdir(".") if os.path.isdir(name)])
category_list = ['C01', 'C02', 'C03', 'C04', 'C05', 'C06', 'C07', 'C08', 'C09', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23']

category_len = len(category_list)
category_index_dict = dict(zip(category_list, list(range(category_len))))

# Accumulate rows in a dict keyed by file name instead of growing the
# DataFrame one row at a time: DataFrame.append was removed in pandas 2.0 and
# the per-file scan/copy of the whole frame was quadratic. Dict keys are
# unique, so the former "duplicates" check is no longer needed, and insertion
# order keeps the rows in first-seen order.
category_flags_by_file = {}
for category in category_list:
    for file in os.listdir('./data-original/' + category):
        flags = category_flags_by_file.get(file)
        if flags is None:
            flags = [0] * category_len
            category_flags_by_file[file] = flags
        flags[category_index_dict[category]] = 1

df = pd.DataFrame(
    [[file] + flags for file, flags in category_flags_by_file.items()],
    columns=["file_name"] + category_list,
)

# df.to_csv('ohsumed-file-category.csv', index=False)

### ohsumed-file-abstract-category

In [None]:
# Build a table with one row per abstract file: its name, its full text and a
# one-hot indicator per category, read from ./data-original/<category>/<file>.
# Uses category_list as defined by the previous cell.

# category_list = [name for name in os.listdir(os.listdir("./data-original/.")) if os.path.isdir(os.listdir("./data-original/"+name))]

category_len = len(category_list)
category_index_dict = dict(zip(category_list, list(range(category_len))))

# Column offset of the first category flag inside a record
# (after "file_name" and "title_abstract").
flag_offset = 2

# Accumulate records in a dict keyed by file name instead of growing the
# DataFrame row by row: DataFrame.append was removed in pandas 2.0 and the
# per-file scan/copy of the whole frame was quadratic. Dict keys are unique,
# so the former "duplicates" check is no longer needed.
records_by_file = {}
for category in category_list:
    for file in os.listdir('./data-original/'+category):
        # Skip hidden files; they are never added to the table.
        if file.startswith('.'):
            continue
        record = records_by_file.get(file)
        if record is None:
            # The text is read only the first time a file is seen; the
            # context manager closes the handle, which the bare
            # open(...).read() did not.
            with open('./data-original/'+category+'/'+file, 'r') as handle:
                abstract = handle.read()
            record = [file, abstract] + [0] * category_len
            records_by_file[file] = record
        record[flag_offset + category_index_dict[category]] = 1

df = pd.DataFrame(
    list(records_by_file.values()),
    columns=["file_name", "title_abstract"] + category_list,
)

# df.to_csv('ohsumed-file-abstract-category.csv', index=False)

## Cleaning and creating N-grams

In [2]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
from gensim.models import Phrases
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
import string
import re
from collections import Counter

In [3]:
# Load the file/abstract/category table used by the cleaning cells below.
# NOTE(review): presumably this is the CSV written by the
# "ohsumed-file-abstract-category" cell and moved into data-processed/ - confirm.
df = pd.read_csv('data-processed/ohsumed-file-abstract-category.csv')

Cleaning raw text

In [4]:
# Tokenize every abstract into a list of lower-cased, lemmatized tokens with
# punctuation and English stop words removed. The result, text_cleaned, holds
# one token list per row of df, in row order.
text_cleaned = []

# Replace all numbers with special strings
# NOTE: regx is compiled here but not applied anywhere in this cell.
regx = re.compile(r"\b[\d.]+\b")
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Build the stop-word set once. Calling stopwords.words("english") inside the
# comprehension re-read the list and scanned it linearly for every token.
english_stopwords = set(stopwords.words("english"))

for raw_text in df['title_abstract']:
    # Drop the word "Copyright" and everything after the first (c) sign.
    text = raw_text.replace('Copyright', '').split('©', 1)[0]

    # `word not in string.punctuation` is a substring test, so it removes
    # single punctuation characters (and any token that happens to be a
    # contiguous slice of string.punctuation).
    tokens = [
        word for word in nltk.word_tokenize(text.lower())
        if (word not in string.punctuation) and (word not in english_stopwords)
    ]

    # with stemming
#     text = [porter.stem(word.strip()) for word in tokens]

#     # without stemming
#     text = [word.strip() for word in tokens]

    # with lemmatizer
    text = [wordnet_lemmatizer.lemmatize(word.strip()) for word in tokens]

    text_cleaned.append(text)

In [50]:
# # Finding phrases (i.e. bi-grams)
# # train bi-grams
# bigram = Phrases()
# bigram.add_vocab(text_cleaned)

# # create phrases
# text_cleaned_phrases = []
# for text_ in text_cleaned:
#     text_cleaned_phrases.append(bigram[text_])

## Generate N-grams for abstracts

In [5]:
def generate_ngrams(text_cleaned, n):
    """Render each tokenized document as one string of n-grams.

    Args:
        text_cleaned: Iterable of token sequences, one per document.
        n: N-gram size, passed through to ``nltk.ngrams``.

    Returns:
        A list with one string per document. Within an n-gram the tokens are
        joined with ``'_'``; every n-gram is followed by a single space, so a
        non-empty result ends with a trailing space. A document that produces
        no n-grams yields ``''``.
    """
    # Build each document string with join instead of repeated `s = s + ...`,
    # which copies the growing string on every n-gram.
    return [
        ''.join(
            '_'.join(str(token) for token in ngram) + ' '
            for ngram in nltk.ngrams(tokens, n)
        )
        for tokens in text_cleaned
    ]

In [None]:
# Export a dataset for one pair of categories: file name, the two category
# flags, a label Y and the n-gram string of every abstract.
n = 1 # n-grams
criteria = ['C04', 'C12'] # categories

text_tokenized = generate_ngrams(text_cleaned, n)

# Take an explicit copy: adding columns to a plain column selection of df
# triggers SettingWithCopyWarning and may or may not write through to df.
df_ngrams = df[['file_name'] + criteria].copy()

# Y is 1 only when the abstract belongs to BOTH categories (flags sum to 2).
df_ngrams['Y'] = ((df_ngrams[criteria[0]] + df_ngrams[criteria[1]]) == 2).astype(int)

# text_tokenized is positional (one entry per row of df), so re-label it with
# the frame's index before assigning.
df_ngrams['tokens'] = pd.Series(text_tokenized, index=df_ngrams.index)
df_ngrams.to_csv('ohsumed_{}_{}_{}grams.csv'.format(criteria[0], criteria[1], n), index=False)