## Load a local dataset

In [1]:
from datasets import load_dataset

data_files = {"train": "data/drugsComTrain_raw.tsv", "test": "data/drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [3]:
#Undestand what's Unamed0
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [4]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [5]:
print(len(drug_dataset["train"].unique("drugName")))
print(len(drug_dataset["train"].unique("condition")))

3436
885


In [6]:
def filter_nones(x):
    return x["condition"] is not None

drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

Filter: 100%|██████████| 161297/161297 [00:01<00:00, 144517.38 examples/s]
Filter: 100%|██████████| 53766/53766 [00:00<00:00, 142791.26 examples/s]


In [7]:
#Normalization of condition
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}


drug_dataset.map(lowercase_condition)
drug_dataset["train"]["condition"][:3]

Map: 100%|██████████| 160398/160398 [00:13<00:00, 12218.50 examples/s]
Map: 100%|██████████| 53471/53471 [00:04<00:00, 12368.71 examples/s]


['Left Ventricular Dysfunction', 'ADHD', 'Birth Control']

In [8]:
#Create new cols that count the number of words per reviews 
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

Map: 100%|██████████| 160398/160398 [00:14<00:00, 10994.04 examples/s]
Map: 100%|██████████| 53471/53471 [00:04<00:00, 10893.97 examples/s]


{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [9]:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['Hepatitis C', 'ADHD', 'Birth Control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [10]:
#Sort the dataset to only have reviews with +30 words
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

Filter: 100%|██████████| 160398/160398 [00:01<00:00, 115497.13 examples/s]
Filter: 100%|██████████| 53471/53471 [00:00<00:00, 111609.96 examples/s]

{'train': 138514, 'test': 46108}





In [11]:
#Check reviews with most words
drug_dataset["train"].sort(column_names="review_length", reverse=True)[:3]

{'patient_id': [121004, 181160, 216072],
 'drugName': ['Venlafaxine', 'Prozac', 'Copper'],
 'condition': ['Migraine', 'Obsessive Compulsive Disorde', 'Birth Control'],
 'review': ['"Two and a half months ago I was prescribed Venlafaxine to help prevent chronic migraines.\r\nIt did help the migraines (reduced them by almost half), but with it came a host of side effects that were far worse than the problem I was trying to get rid of.\r\nHaving now come off of the stuff, I would not recommend anyone ever use Venlafaxine unless they suffer from extreme / suicidal depression. I mean extreme in the most emphatic sense of the word. \r\nBefore trying Venlafaxine, I was a writer. While on Venlafaxine, I could barely write or speak or communicate at all. More than that, I just didn&#039;t want to. Not normal for a usually outgoing extrovert.\r\nNow, I&#039;m beginning to write again - but my ability to speak and converse with others has deteriorated by about 95%. Writing these words is taking f

In [12]:
#Deal with HTLM character
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [13]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

Map: 100%|██████████| 138514/138514 [00:13<00:00, 10334.46 examples/s]
Map: 100%|██████████| 46108/46108 [00:04<00:00, 10449.31 examples/s]


In [14]:
#Try map with batch to see if it's faster
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

Map: 100%|██████████| 138514/138514 [00:00<00:00, 378983.41 examples/s]
Map: 100%|██████████| 46108/46108 [00:00<00:00, 415259.61 examples/s]


In [15]:
#It's way faster with batch, lets see the difference when we tokenize our dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

Map: 100%|██████████| 138514/138514 [00:15<00:00, 8988.67 examples/s]
Map: 100%|██████████| 46108/46108 [00:06<00:00, 7629.61 examples/s]


CPU times: total: 1min 17s
Wall time: 21.5 s


Map: 100%|██████████| 138514/138514 [01:06<00:00, 2077.35 examples/s]
Map: 100%|██████████| 46108/46108 [00:21<00:00, 2124.44 examples/s]

CPU times: total: 43 s
Wall time: 1min 28s





In [23]:
#Convert the dataset to Dataframe
drug_dataset.set_format("pandas")

In [24]:
#Now we can access it as a DF
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89
