In [1]:
import numpy as np
import pandas as pd

# Read the reviews CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../../res/train.csv')
df.head(n=5)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [2]:
# Data cleaning: drop the identifier/date columns, switch the remaining
# column names to snake_case and lower-case the two categorical text columns.
#
# Columns are renamed by name (not by assigning df.columns positionally) so a
# change in the CSV's column order cannot silently mislabel data, and
# errors='ignore' on the drop keeps this cell safe to re-run after the
# columns have already been removed.
df = (
    df.drop(columns=['uniqueID', 'date'], errors='ignore')
      .rename(columns={'drugName': 'drug_name', 'usefulCount': 'useful_count'})
)

# Fail loudly if the input does not have the schema the rest of the notebook expects.
expected_columns = {"drug_name", "condition", "review", "rating", "useful_count"}
assert expected_columns <= set(df.columns), (
    f"missing columns: {sorted(expected_columns - set(df.columns))}"
)

# Normalise case so the same drug/condition spelled with different casing
# collapses to a single value.
df['drug_name'] = df['drug_name'].str.lower()
df['condition'] = df['condition'].str.lower()

print(f"There are {len(df)} reviews.")
df.head()

There are 161297 reviews.


Unnamed: 0,drug_name,condition,review,rating,useful_count
0,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,27
1,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,192
2,lybrel,birth control,"""I used to take another oral contraceptive, wh...",5,17
3,ortho evra,birth control,"""This is my first time using any form of birth...",8,10
4,buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9,37


In [3]:
# Inspect the dtype pandas inferred for each column of the cleaned frame.
df.dtypes

drug_name       object
condition       object
review          object
rating           int64
useful_count     int64
dtype: object

In [4]:
# How many distinct drug treatments appear in the reviews
# (dropna=False so a missing value, if any, is counted as its own entry).
df["drug_name"].nunique(dropna=False)

3436

In [5]:
# How many distinct conditions appear in the reviews.
# Note: missing values are counted as one entry (dropna=False), so NaN is
# included in this total.
df["condition"].nunique(dropna=False)

885

In [6]:
# List every distinct drug name with a running index.
# Combination treatments show up as a single name with the drugs separated by "/".
for i, drug_name in enumerate(df["drug_name"].unique()):
    print(f"{i} {drug_name}")

0 valsartan
1 guanfacine
2 lybrel
3 ortho evra
4 buprenorphine / naloxone
5 cialis
6 levonorgestrel
7 aripiprazole
8 keppra
9 ethinyl estradiol / levonorgestrel
10 topiramate
11 l-methylfolate
12 pentasa
13 dextromethorphan
14 nexplanon
15 liraglutide
16 trimethoprim
17 amitriptyline
18 lamotrigine
19 nilotinib
20 atripla
21 trazodone
22 etonogestrel
23 etanercept
24 tioconazole
25 azithromycin
26 eflornithine
27 daytrana
28 ativan
29 imitrex
30 sertraline
31 toradol
32 viberzi
33 mobic
34 dulcolax
35 morphine
36 moviprep
37 trilafon
38 fluconazole
39 contrave
40 clonazepam
41 metaxalone
42 venlafaxine
43 ledipasvir / sofosbuvir
44 symbyax
45 tamsulosin
46 doxycycline
47 dulaglutide
48 intuniv
49 buprenorphine
50 qvar
51 opdivo
52 pyridium
53 latuda
54 bupropion
55 implanon
56 effexor xr
57 drospirenone / ethinyl estradiol
58 nuvaring
59 prepopik
60 tretinoin
61 gildess fe 1 / 20
62 ethinyl estradiol / norgestimate
63 elbasvir / grazoprevir
64 clomiphene
65 docusate / senna
66 amitiza


In [7]:
# List every distinct condition with a running index.
# Some condition values look garbled, e.g. "</span> users found this comment helpful."
for i, condition in enumerate(df["condition"].unique()):
    print(f"{i} {condition}")

0 left ventricular dysfunction
1 adhd
2 birth control
3 opiate dependence
4 benign prostatic hyperplasia
5 emergency contraception
6 bipolar disorde
7 epilepsy
8 migraine prevention
9 depression
10 crohn's disease
11 cough
12 obesity
13 urinary tract infection
14 ibromyalgia
15 chronic myelogenous leukemia
16 hiv infection
17 insomnia
18 rheumatoid arthritis
19 vaginal yeast infection
20 chlamydia infection
21 hirsutism
22 panic disorde
23 migraine
24 nan
25 pain
26 irritable bowel syndrome
27 osteoarthritis
28 constipation
29 bowel preparation
30 psychosis
31 muscle spasm
32 hepatitis c
33 overactive bladde
34 diabetes, type 2
35 asthma, maintenance
36 non-small cell lung cance
37 schizophrenia
38 dysuria
39 smoking cessation
40 anxiety
41 acne
42 emale infertility
43 constipation, acute
44 constipation, drug induced
45 erectile dysfunction
46 trigeminal neuralgia
47 underactive thyroid
48 chronic pain
49 atrophic vaginitis
50 skin and structure infection
51 tinnitus
52 major depressi

In [8]:
# Per-column completeness check: True means the column has no missing values.
# Only the condition column contains NaN entries.
df.notna().all(axis=0)

drug_name        True
condition       False
review           True
rating           True
useful_count     True
dtype: bool