In [19]:
import pandas as pd
import numpy as np
import sentencepiece as spm
import nltk
import ast
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import time

In [20]:
ncbi_com_0 = pd.read_csv("data/ncbi_comm_use_000000000000.csv")

In [21]:
ncbi_com_1 = pd.read_csv("data/ncbi_comm_use_000000000001.csv")

In [22]:
ncbi_non_com_0 =  pd.read_csv("data/ncbi_non_comm_use_000000000000.csv")

In [23]:
ncbi_non_com_1 =  pd.read_csv("data/ncbi_non_comm_use_000000000001.csv")

In [24]:
ncbi_com_0.shape

(5958, 5)

In [25]:
ncbi_com_1.shape

(4790, 5)

In [26]:
ncbi_non_com_0.shape

(7924, 5)

In [9]:
df1 = ncbi_com_0

In [29]:
df1.shape

(5958, 5)

In [30]:
df2 = pd.concat([df1, ncbi_com_1])

In [31]:
df2.shape

(10748, 5)

In [32]:
df3 = pd.concat([df2, ncbi_non_com_0])

In [33]:
df3.shape

(18672, 5)

In [34]:
df4 = pd.concat([df3, ncbi_non_com_1])

In [35]:
df4.shape

(28117, 5)

In [36]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5958 entries, 0 to 5957
Data columns (total 5 columns):
Refs        5807 non-null object
Body        5937 non-null object
Front       5940 non-null object
Meta        18 non-null object
Filename    5958 non-null object
dtypes: object(5)
memory usage: 232.9+ KB


In [55]:
df1.isnull().sum()

Refs         151
Body          21
Front         18
Meta        5940
Filename       0
dtype: int64

In [38]:
df1.head(1)

Unnamed: 0,Refs,Body,Front,Meta,Filename
0,,,,"52\nStandardized ""Malhotra-Wig Vignettes"" for ...",comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...


In [39]:
df2.head(2)

Unnamed: 0,Refs,Body,Front,Meta,Filename
0,,,,"52\nStandardized ""Malhotra-Wig Vignettes"" for ...",comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...
1,,,,5\nSpirituality and Psychiatry\nDinesh Bhugra*...,comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...


In [45]:
ncbi_com_1['Body'][0]

"The editors of the International Journal of Molecular Sciences would like to express their sincere gratitude to the following reviewers for assessing manuscripts in 2014: Abass, Khaled\nAbbott, David H.\nAbdelmohsen, Kotb\nAbdraboh, Mohamed\nAbe, Naohito\nAbe, Toshiaki\nAbou Neel, Ensanya A.\nAbou-Alfa, Ghassan\nAbraini, Jacques H.\nAbram, Florence\nAbusco, Anna Scotto\nAbzalimov, Rinat\nAckland, K.\nAcuna-Castroviejo, Darío\nAcunzo, M.\nAdamcakova-Dodd, Andrea\nAdeniji, Adegoke\nAdessi, Alessandra\nAdhikary, Amitava\nAfseth, Nils\nAgalliu, Dritan\nAgarwal, Anika\nAgeorges, Agnès\nAgorastos, Agorastos\nAgostini, Marco\nAgterberg, Martijn J. H.\nAguilar, Claudio\nAguilar-Reina, José\nAgulló-Ortuño, M. Teresa\nAgyei, Dominic\nAhmed, Khalil\nAhmed, Salahuddin\nAhn, Joong-Hoon\nAhn, Suk-Kyun\nAibner, Thomas\nAizawa, Shin-Ichi\nAjikumar, Parayil Kumaran\nAkashi, Makoto\nAkbar, Sheikh Mohammad Fazle\nAkbarzadeh, A. H.\nAkeda, Koji\nAkimoto, Jun\nAktas, C.\nAkue, Jean Paul\nAl Ghouleh, Imad\

In [46]:
ncbi_com_1['Body'][1]

'With the goal of recognizing outstanding contributions to the field of molecular sciences by young investigators under the age of 40 (by 31 December 2016), and early-career investigators, which includes postdoctoral students and PhD students, and assisting the early-career investigators in attending international conferences in 2017, last year the International Journal of Molecular Sciences accepted nominations for Young Investigator and Travel Awards 2017. Over 200 nominations were received and were evaluated by a panel of judges comprised of International Journal of Molecular Sciences editorial board members.\nWe are excited to announce the winner of the Young Investigator award: Dr. Rob W.J. Collin who will be awarded 2000 Swiss Francs; and the following winners for Travel Awards, Dr. Yi Ma, Dr. Miranda Ween, and Dr. Reza M. Zadegan who will be supported with up to 800 Swiss Francs each towards their travel expenses to attend international conferences in 2017.\n1. Young Investigato

# Data Preprocessing

In [47]:
def remove_newline_char(text):
    text = text.replace("\n", " ")
    return text

def nltk_sent_tokenize(text):
    text = sent_tokenize(text)
    return text

def contains_coronavirus(text):
    if "coronavirus" in text.lower():
        return 1
    else:
        return 0
    
def contains_COVID(text):
    if "COVID" in text:
        return 1
    else:
        return 0

In [48]:
def preprocess(df):
    # remove rows that have null Body 
    df = df[~df['Body'].isnull()]
    df['Body'] = df['Body'].apply(remove_newline_char)
    df['Body_sents'] = df['Body'].apply(nltk_sent_tokenize)
    df['Body_tokens'] = df['Body'].apply(word_tokenize)
    df['len_body'] = df['Body_tokens'].apply(lambda x: len(x))
    df['has_coronavirus'] = df['Body'].apply(contains_coronavirus)
    df['has_COVID'] = df['Body'].apply(contains_COVID)
    df['len_sents'] = df['Body_sents'].apply(lambda x: len(x))
    return df

# Build and save corpus

In [49]:
def build_raw_corpus(df):
    raw_corpus = []
    for i, row in df.iterrows():
        raw_corpus += row['Body_sents']
    return raw_corpus

In [50]:
def save_corpus_as_txt(filename, corpus):
    with open(filename, 'w') as f:
        for sent in corpus:
            f.write(sent)
            f.write('\n')
    f.close()

In [51]:
def build_tokenizer_input(df, filename):
    raw_corpus = build_raw_corpus(df)
    save_corpus_as_txt(filename, raw_corpus)

# Train SentencePiece tokenizer

In [52]:
def train_tokenizer(model_prefix, input_file, vocab_size):
    spm.SentencePieceTrainer.train('--model_prefix={} --input={} --vocab_size={}'.format(model_prefix, 
                                                                                         input_file, vocab_size))

# Load model

In [53]:
def load_model(model_file):
    sp = spm.SentencePieceProcessor()
    sp.Load(model_file)
    return sp

# Tokenize text

In [54]:
def sp_tokenize(model, text):
    tokenized_text = model.EncodeAsPieces(text)
    return tokenized_text

# Experiments

In [56]:
# rows: 5958
# vocab_size=5000

In [None]:
# preprocess data
t1 = time.time()
df1 = preprocess(df1)
t2 = time.time()
print ("Time:", (t2-t1)/60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [41]:
# build corpus
input_file_1 = "sample_input_1.txt"
t1 = time.time()
raw_corpus_1 = build_tokenizer_input(df1, input_file_1)
t2 = time.time()
print ("Time:", (t2-t1)/60)

Time: 0.05040566523869832


In [42]:
# train sp tokenizer
model_prefix_1 = "m1"
vocab_size = 5000
t1 = time.time()
train_tokenizer(model_prefix_1, input_file_1, vocab_size)
t2 = time.time()
print ("Time:", (t2-t1)/60)

Time: 6.44949103196462


In [43]:
# load model
model_file_1 = model_prefix_1 + ".model"
sp1 = load_model(model_file_1)

In [44]:
# tokenize text
text = "This is a novel coronavirus disease."
tokenized_text = sp_tokenize(sp1, text)

In [45]:
tokenized_text

['▁This', '▁is', '▁a', '▁novel', '▁cor', 'on', 'a', 'virus', '▁disease', '.']

# Some EDA on data

In [28]:
df1.head()

Unnamed: 0,Refs,Body,Front,Meta,Filename
0,,,,"52\nStandardized ""Malhotra-Wig Vignettes"" for ...",comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...
1,,,,5\nSpirituality and Psychiatry\nDinesh Bhugra*...,comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...
2,,,,"84\nSir,\nThe study of Sarkar and Chandra (200...",comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...
3,,,,7\nRespected Chairpersons and members of the I...,comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...
4,,,,"15\nDear Chairpersons, Mr. President, Esteemed...",comm_use.I-N.txt.tar.gz-unpacked/Indian_J_Psyc...


In [48]:
df1.has_coronavirus.value_counts()

0    5923
1      14
Name: has_coronavirus, dtype: int64

In [50]:
df1.has_COVID.value_counts()

0    5937
Name: has_COVID, dtype: int64

In [51]:
np.mean(df1['len_body'])

5638.820953343439

In [52]:
np.mean(df1['len_sents'])

218.91510864072765