# BioMedBERT BigQuery Data Analysis/ Pre-training

In [67]:
# imports
import os
import json
import numpy as np
import pandas as pd
import textwrap
import tensorflow as tf
from google.cloud import bigquery

## Data Preprocessing

In [68]:
# base imports
import os
import sys
import nltk

### Preprocess text
Lowercase the text, drop characters that cannot be encoded as UTF-8, and remove punctuation.

In [69]:
# Tokenizer that keeps runs of word characters, which is how punctuation gets dropped.
# Raw string so that "\w" reaches the regex engine instead of being parsed as an
# (invalid) string escape sequence.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
    """Return ``text`` lowercased, UTF-8 clean and stripped of punctuation.

    The input is coerced with ``str()`` first. The result is the tokens found
    by the module-level ``regex_tokenizer`` joined with single spaces.
    """
    lowered = str(text).lower()
    # Round-trip through UTF-8, silently dropping anything that cannot be encoded.
    utf8_clean = lowered.encode("utf-8", "ignore").decode()
    tokens = regex_tokenizer.tokenize(utf8_clean)
    return " ".join(tokens)

## Create Expanded csv dataset

In [70]:
import os
import glob
import pandas as pd
from google.cloud import storage

In [71]:
# GCP project that owns the storage client created below.
project_id = 'ai-vs-covid19'
# Storage client bound to that project.
storage_client = storage.Client(project=project_id)
# Bucket name; a plain string at this point.
bucket = 'ekaba-assets'



In [72]:
# !gsutil -m rm -rf gs://ekaba-assets/full_body/

In [73]:
# Display the bucket-relative path of the 'full_body_text' prefix.
# Only works while `bucket` is still the name string.
bucket+'/full_body_text'

'ekaba-assets/full_body_text'

In [74]:
# NOTE: this rebinds `bucket` from the name string to the object returned by
# get_bucket(); any cell that expects the string must run before this one.
bucket=storage_client.get_bucket(bucket)
# List all objects that satisfy the filter.
blobs=bucket.list_blobs(prefix='full_body_text')

In [75]:
# Materialise the listing into a list so it can be measured and sliced.
blob = list(blobs)

In [76]:
# Total blobs, number of full groups of five, and how many blobs those groups cover.
print(len(blob), len(blob) // 5, (len(blob) // 5) * 5, sep="\n")

65
13
65


In [77]:
# blob

In [78]:
# Lazily cut a sequence into consecutive pieces of at most `chunk` items.
def split_list(data, chunk):
    """Yield successive slices of ``data``, each holding up to ``chunk`` items.

    The final slice is shorter when ``len(data)`` is not a multiple of
    ``chunk``. ``data`` must support ``len()`` and slicing.
    """
    starts = range(0, len(data), chunk)
    yield from (data[start:start + chunk] for start in starts)

In [79]:
# list of length in which we have to split 
# blob_split_1, blob_split_2, blob_split_3, blob_split_4, blob_split_5 = list(split_list(blob, 50))

In [80]:
# Number of blobs once the first listing entry is skipped.
# NOTE(review): presumably blob[0] is the 'full_body_text/' folder placeholder
# rather than a CSV shard — confirm before relying on the [1:] slice.
print(len(blob[1:]))
# print(len(blob_split_1))
# print(len(blob_split_2))
# print(len(blob_split_3))
# print(len(blob_split_4))
# print(len(blob_split_5))

64


In [81]:
# Peek at entries 1 through 4 of the listing (entry 0 is skipped).
blob[1:5]

[<Blob: ekaba-assets, full_body_text/000000000000.csv, 1587574947891518>,
 <Blob: ekaba-assets, full_body_text/000000000001.csv, 1587574940230286>,
 <Blob: ekaba-assets, full_body_text/000000000002.csv, 1587574948556797>,
 <Blob: ekaba-assets, full_body_text/000000000003.csv, 1587574954594689>]

In [82]:
# blob_split_2[0:5]

In [83]:
def download_to_local(folder, blob_lst):
    """Download every blob in ``blob_lst`` into the local directory ``folder``.

    Args:
        folder: Destination directory; created (with parents) when missing.
        blob_lst: Iterable of objects exposing ``name`` and
            ``download_to_filename(path)``.

    Each blob is saved under the last ``/``-separated component of its name.
    Blobs whose name ends in ``/`` have no file name to write to and are
    skipped instead of aborting the whole download.
    """
    print('File download Started…. Wait for the job to complete.')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)
    for blob in blob_lst:
        file_name = blob.name.split('/')[-1]
        if not file_name:
            # A name such as "prefix/" would make the destination the folder itself.
            print('Skipped {} (no file name)'.format(blob.name))
            continue
        destination_uri = '{}/{}'.format(folder, file_name)
        blob.download_to_filename(destination_uri)
        print('Exported {} to {}'.format(blob.name, destination_uri))

In [84]:
# !rm -rf data #data_1 data_2 data_3 data_4 #data_5
# !rm ncbi_comm_use_csv_A.csv

In [86]:
# download first part of csv's
# download_to_local('data', blob[1:])
# download_to_local('data_1', blob_split_1)
# download_to_local('data_2', blob_split_2)
# download_to_local('data_3', blob_split_3)
# download_to_local('data_4', blob_split_4)
# download_to_local('data_5', blob_split_5)

In [87]:
# make combined csv
def combined_csv(data_folder):
    """Read every ``*.csv`` file in ``data_folder`` and concatenate them.

    Files are read in sorted filename order so the resulting row order is
    reproducible (``glob`` itself returns files in arbitrary order). Each
    file's own row index is kept as-is.

    Args:
        data_folder: Directory that directly contains the CSV files.

    Returns:
        A single ``pandas.DataFrame`` with the rows of all files.

    Raises:
        ValueError: If ``data_folder`` contains no ``.csv`` files.
    """
    extension = 'csv'
    all_filenames = sorted(glob.glob('{}/*.{}'.format(data_folder, extension)))
    if not all_filenames:
        raise ValueError('No .{} files found in {!r}'.format(extension, data_folder))
    # All files are loaded before concatenation, so they must fit in memory together.
    return pd.concat([pd.read_csv(f) for f in all_filenames])

In [88]:
# blob_csv_A = combined_csv('data')
# Concatenate every CSV under ./data into one DataFrame.
blob_csv = combined_csv('data')

In [None]:
# len(blob_csv)

In [89]:
# Write the combined frame to the GCS bucket.
# NOTE(review): index=False is not passed, so the DataFrame index is written as
# an extra leading column — confirm that this is intended.
blob_csv.to_csv('gs://ekaba-assets/full_body_text.csv')



In [None]:
# blob_csv_A.to_csv( "ncbi_comm_use_csv_A.csv", index=False, encoding='utf-8-sig')
# blob_csv.to_csv( "ncbi_comm_use.csv", index=False, encoding='utf-8-sig')

In [None]:
# copy files from gcs bucket
# !gsutil -m cp gs://ekaba-assets/ncbi_comm_use_BODY.csv .

In [None]:
# body = pd.read_csv('ncbi_comm_use.csv')

In [90]:
# Keep only the 'full_body' column; the double brackets yield a DataFrame, not a Series.
body_sel = blob_csv[['full_body']]

In [92]:
# body_sel.head()

In [93]:
# Write the body-only frame to the GCS bucket.
# NOTE(review): index=False is not passed, so the DataFrame index is written as
# an extra leading column — confirm that this is intended.
body_sel.to_csv('gs://ekaba-assets/full_body_text_BODY.csv')

In [95]:
# Local copy without the index column. 'utf-8-sig' prepends a BOM, so readers
# should open this file with encoding='utf-8-sig' to strip it again.
body_sel.to_csv( "full_body_text_BODY.csv", index=False, encoding='utf-8-sig')

In [101]:
# remove the local full_body_text_BODY.csv (run only after the csv-to-txt cell has read it)
!rm full_body_text_BODY.csv

In [96]:
# convert csv to txt
import csv
import sys

# Fields hold entire article bodies, so lift the csv module's default per-field size cap.
maxInt = sys.maxsize
csv.field_size_limit(maxInt)

csv_file = 'full_body_text_BODY.csv'
txt_file = 'full_body_text_BODY.txt'
# The CSV is written with encoding='utf-8-sig', so it is read the same way to keep
# the BOM out of the first line; newline='' lets the csv module handle line endings
# inside quoted fields itself. The input is opened first so that a missing CSV does
# not leave an empty text file behind.
with open(csv_file, "r", encoding="utf-8-sig", newline="") as my_input_file, \
        open(txt_file, "w", encoding="utf-8") as my_output_file:
    # One output line per CSV row, with the row's fields joined by single spaces.
    # NOTE(review): the CSV header row is written out as the first line as well.
    for row in csv.reader(my_input_file):
        my_output_file.write(" ".join(row) + '\n')

In [97]:
# move text file to GCS
# !gsutil -m cp ncbi_comm_use_BODY.txt gs://ekaba-assets/

In [98]:
# remove csv file
# !rm ncbi_comm_use_BODY.csv

In [99]:
from tensorflow.keras.utils import Progbar
def count_lines(filename):
    """Return the number of lines in the text file at ``filename``."""
    with open(filename) as handle:
        return sum(1 for _ in handle)

In [102]:
# Normalize the whole corpus, one input line at a time
RAW_DATA_FPATH = "full_body_text_BODY.txt"
PRC_DATA_FPATH = "processed_full_body_text_BODY.txt"

# The progress bar advances once per input line, so the line count is taken up front.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH, encoding="utf-8") as fi, \
        open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
    for raw_line in fi:
        # normalize_text drops the line's newline with the other non-word
        # characters, so one is appended again here.
        fo.write(normalize_text(raw_line) + "\n")
        bar.add(1)



In [103]:
# move processed text file to GCS
!gsutil -m cp processed_full_body_text_BODY.txt gs://ekaba-assets/

Copying file://processed_full_body_text_BODY.txt [Content-Type=text/plain]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

| [1/1 files][ 61.4 GiB/ 61.4 GiB] 100% Done  56.6 MiB/s ETA 00:00:00           
Operation completed over 1 objects/61.4 GiB.                                     


In [105]:
# remove intermediate files
!rm processed_full_body_text_BODY.txt #full_body_text_BODY.txt #processed_ncbi_comm_use_BODY.txt