In [1]:
import pandas as pd
import numpy as np
import csv
import re

In [2]:
# Location of the raw dataset, relative to the notebook's working directory.
# The preview printed below shows ARFF-style text: '%' comment lines followed by
# @RELATION / @ATTRIBUTE declarations.
DATASET_PATH = './Assets/Datasets/dataset_'

In [3]:
# Peek at the first 31 lines (idx 0-30) of the raw file to see how its header is laid out.
with open(DATASET_PATH, 'r') as dataset_file:
    for line_no, raw_line in zip(range(31), dataset_file):
        print(f'{raw_line.strip()} --> idx: {line_no}')

% Context --> idx: 0
% The dataset provides user reviews on specific drugs along with related conditions, side effects, age, sex, and ratings reflecting overall patient satisfaction. --> idx: 1
% Content --> idx: 2
% Data was acquired by scraping WebMD site. There are around 0.36 million rows of unique reviews and is updated till Mar 2020. --> idx: 3
% Inspiration --> idx: 4
% This dataset intended to answer following questions: --> idx: 5
% I. Identifying the condition of the patient based on drug reviews? --> idx: 6
% II. How to predict drug rating based on patients reviews? --> idx: 7
% III. How to visualize drug rating, kind of drugs, types of conditions a patient can have, sentiments based on reviews --> idx: 8
@RELATION WebMD-Drug-Reviews-Dataset --> idx: 9
 --> idx: 10
@ATTRIBUTE Age STRING --> idx: 11
@ATTRIBUTE Condition STRING --> idx: 12
@ATTRIBUTE Date STRING --> idx: 13
@ATTRIBUTE Drug STRING --> idx: 14
@ATTRIBUTE DrugId INTEGER --> idx: 15
@ATTRIBUTE EaseofUse INTEGER --

In [145]:
# Accumulators filled by the parsing loop further down in this cell:
#   final_data - rows whose Sex field is blank or male/female
#   exper_data - every other row, set aside for later inspection
final_data, exper_data = [], []

# Column names applied to each parsed row. The first six follow the @ATTRIBUTE
# order shown in the file preview; the remaining names presumably do as well
# (the preview output is cut off) - TODO confirm against the file header.
columns = [
    'Age', 'Condition', 'Date', 'Drug',
    'DrugId', 'EaseofUse', 'Effectiveness', 'Reviews',
    'Satisfaction', 'Sex', 'Sides', 'UsefulCount',
]

def extract_sex(description):
    """Normalise a raw Sex field.

    Args:
        description: Raw field text. Leading and trailing whitespace is ignored.

    Returns:
        ``'male'`` or ``'female'`` (lower-cased) when the stripped text is exactly
        one of those words in any letter case; ``np.nan`` when the stripped text
        is empty; ``None`` for any other text.
    """
    # Trim whitespace at both ends before classifying.
    description = description.strip()
    # Accept the value only if the whole field is "male" or "female".
    if re.fullmatch(r'male|female', description, flags=re.I):
        return description.lower()
    # An empty (or whitespace-only) field is reported as missing.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    if description == "":
        return np.nan
    # Anything else is unrecognised; the original fell through to an implicit
    # None here, which is now spelled out.
    return None

print(columns[-3:])
with open(DATASET_PATH, 'r') as file:
    # Skip the header, up to and including the '@DATA' marker. The comparison
    # ignores letter case because ARFF keywords are case-insensitive.
    for line in file:
        if line.strip().upper() == '@DATA':
            break

    # Everything after the marker is comma-separated with single-quoted strings.
    # NOTE(review): the recorded outputs contain stray backslashes and trailing
    # quotes (e.g. "I\m", "DON\T REALISE IT'"), which suggests the file escapes
    # apostrophes as \' and that this reader, having no escapechar, splits such
    # fields in the wrong place. Consider escapechar='\\' - confirm against the
    # raw file before changing, since the repair cells below depend on the
    # current behaviour.
    reader = csv.reader(file, delimiter=',', quotechar="'", doublequote=True)
    for row in reader:
        # csv.reader yields [] for a blank line; indexing data_prep[-3] on such
        # a row would raise IndexError, so skip it.
        if not row:
            continue

        # First 7 fields and last 4 fields are taken as-is; whatever lies
        # between them is glued back into a single Reviews value. The pieces
        # are joined with '' so commas that split the review are not restored.
        data_prep = row[:7] + [''.join(row[7:-4])] + row[-4:]
        sex_value = data_prep[-3].strip()

        # A blank or male/female Sex field means the row lined up with the
        # expected columns; anything else is set aside for repair.
        if sex_value == '' or re.fullmatch(r'male|female', sex_value, flags=re.I):
            final_data.append(data_prep)
        else:
            exper_data.append(data_prep)


['Sex', 'Sides', 'UsefulCount']


In [146]:
# Build the main frame from the rows that parsed cleanly. Every cell is still a
# string here because csv.reader yields text only.
df = pd.DataFrame(final_data, columns=columns)

In [147]:
# Preview the first five parsed rows.
df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I\m a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 powder,144731,2,2,FALLING AND DON\T REALISE IT',1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1


In [148]:
# Dtypes and non-null counts; memory_usage='deep' makes pandas measure the real
# size of the object (string) columns instead of estimating it.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362639 entries, 0 to 362638
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Age            362639 non-null  object
 1   Condition      362639 non-null  object
 2   Date           362639 non-null  object
 3   Drug           362639 non-null  object
 4   DrugId         362639 non-null  object
 5   EaseofUse      362639 non-null  object
 6   Effectiveness  362639 non-null  object
 7   Reviews        362639 non-null  object
 8   Satisfaction   362639 non-null  object
 9   Sex            362639 non-null  object
 10  Sides          362639 non-null  object
 11  UsefulCount    362639 non-null  object
dtypes: object(12)
memory usage: 399.5 MB


In [163]:
# Distinct values per column, as a quick sanity check on the parse.
# NOTE(review): the rating-like columns presumably should only take a few
# values; a higher count would point at rows whose fields were shifted.
df.nunique()

Age                  12
Condition          1806
Date               4525
Drug               7096
DrugId             6573
EaseofUse            13
Effectiveness         7
Reviews          250042
Satisfaction          7
Sex                   3
Sides              1651
UsefulCount         148
dtype: int64

In [165]:
# Frequency table for each low-cardinality column, shown one after another.
for column_name in ['Age', 'EaseofUse', 'Effectiveness', 'Satisfaction', 'Sex']:
    display(df[column_name].value_counts())

Age
45-54         80007
55-64         75075
35-44         54982
25-34         49707
65-74         41201
19-24         24229
75 or over    15218
              12201
13-18          7040
7-12           1644
3-6             838
0-2             497
Name: count, dtype: int64

EaseofUse
5         192527
4          74708
3          41286
1          35906
2          18179
62261         18
154215         6
11147          3
6              2
10149          1
10             1
169930         1
57948          1
Name: count, dtype: int64

Effectiveness
5     130321
4      81785
3      60386
1      59354
2      30790
6          2
10         1
Name: count, dtype: int64

Satisfaction
5     111487
1     100856
4      63128
3      51837
2      35328
6          2
10         1
Name: count, dtype: int64

Sex
Female    238119
Male       98000
           26520
Name: count, dtype: int64

In [161]:
# Null count per column. Blank fields were read as empty strings, which isna()
# does not treat as missing, so they are not counted here.
df.isna().sum()

Age              0
Condition        0
Date             0
Drug             0
DrugId           0
EaseofUse        0
Effectiveness    0
Reviews          0
Satisfaction     0
Sex              0
Sides            0
UsefulCount      0
dtype: int64

In [149]:
# Turn blank / whitespace-only strings into NaN so pandas counts them as missing.
# np.nan replaces np.NaN (that alias was removed in NumPy 2.0), and the
# isinstance guard leaves any non-string cell untouched instead of letting
# re.search raise TypeError on it.
cleaned_df = df.map(
    lambda x: np.nan if isinstance(x, str) and re.search(r'^\s*$', x) else x
)

In [150]:
# Preview the cleaned frame; cells that were blank strings are NaN now.
cleaned_df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I\m a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 powder,144731,2,2,FALLING AND DON\T REALISE IT',1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1


In [151]:
# The blank-to-NaN mapping is element-wise, so the shape should match df.
cleaned_df.shape

(362639, 12)

In [152]:
# Non-null counts now show which columns contained blank values.
cleaned_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362639 entries, 0 to 362638
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Age            350438 non-null  object
 1   Condition      362596 non-null  object
 2   Date           362639 non-null  object
 3   Drug           362639 non-null  object
 4   DrugId         362639 non-null  object
 5   EaseofUse      362639 non-null  object
 6   Effectiveness  362639 non-null  object
 7   Reviews        320809 non-null  object
 8   Satisfaction   362639 non-null  object
 9   Sex            336119 non-null  object
 10  Sides          345184 non-null  object
 11  UsefulCount    362639 non-null  object
dtypes: object(12)
memory usage: 397.1 MB


In [153]:
# Missing values per column after the blank-to-NaN conversion.
cleaned_df.isna().sum()

Age              12201
Condition           43
Date                 0
Drug                 0
DrugId               0
EaseofUse            0
Effectiveness        0
Reviews          41830
Satisfaction         0
Sex              26520
Sides            17455
UsefulCount          0
dtype: int64

In [154]:
# Distinct values per column. nunique() ignores NaN by default, so a column
# that used to contain blank strings has one level fewer than in df.
cleaned_df.nunique()

Age                  11
Condition          1805
Date               4525
Drug               7096
DrugId             6573
EaseofUse            13
Effectiveness         7
Reviews          250039
Satisfaction          7
Sex                   2
Sides              1650
UsefulCount         148
dtype: int64

In [157]:
# Sex distribution; value_counts() drops NaN by default, so the former blank
# category no longer appears.
cleaned_df['Sex'].value_counts()

Sex
Female    238119
Male       98000
Name: count, dtype: int64

In [94]:
# Effectiveness distribution of the cleaned frame.
# NOTE(review): the saved output of this cell carries an older execution count
# than the parsing cell and its totals differ from the Effectiveness table shown
# for df - re-run after Restart & Run All to refresh it.
cleaned_df['Effectiveness'].value_counts()

Effectiveness
5    120829
4     76425
3     56096
1     54300
2     28467
6         2
Name: count, dtype: int64

In [97]:
# Condition frequencies, most common first.
# NOTE(review): the saved output comes from an earlier execution than the
# parsing cell - re-run to make sure it reflects the current cleaned_df.
cleaned_df['Condition'].value_counts()

Condition
Other                                                                             47648
Pain                                                                              23624
High Blood Pressure                                                               21627
Depression                                                                        13807
Birth Control                                                                     11192
                                                                                  ...  
Hospital-Acquired Pseudomonas Aeruginosa Pneumonia Treated with Multiple Drugs        1
Bacterial Blood Infection caused by Pseudomonas Aeruginosa                            1
Defect of Connective Tissue - Noonan\s Syndrome'                                      1
Infection of Female Pelvic Organs caused by Klebsiella                                1
Complicated Skin Infection due to Peptostreptococcus Bacteria                         1
Name: count, Length: 1

In [166]:
# Frame of the rows set aside by the parsing loop, i.e. those whose Sex field
# was neither blank nor male/female.
exper_df = pd.DataFrame(exper_data, columns=columns)

In [168]:
# How many rows were set aside for repair.
exper_df.shape

(167, 12)

In [167]:
# Preview the set-aside rows to see what ended up in their trailing columns.
exper_df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,45-54,"Disorder characterized by Stiff, Tender Painf...",2/18/2010,lyrica,93965,5,5,I hope I never stop taking this med. It takes ...,constipation,difficulty concentrating,or weight gain may occur.',1
1,55-64,Neuropathic Pain,3/12/2008,lyrica,93965,5,5,I have compressed nerve in my lower back due t...,constipation,difficulty concentrating,or weight gain may occur.',21
2,65-74,High Amount of Triglyceride in the Blood,12/8/2008,lovaza,148529,5,2,2 years ago I took lovaza and my colestrol wen...,",2,Female,Upset stomach",burping,and strange taste in mouth may occur.',6
3,55-64,High Blood Pressure,3/16/2009,lisinopril solution,6873,5,3,The first couple of months Lisinopril worked f...,lightheadedness,tiredness,or headache may occur as your body adjusts ...,12
4,55-64,High Blood Pressure,2/21/2010,lisinopril solution,6873,4,4,Really don\t know if the issues I have is a si...,lightheadedness,tiredness,or headache may occur as your body adjusts ...,0


In [176]:
# Distinct values per column of the set-aside rows.
exper_df.nunique()

Age                9
Condition         60
Date             120
Drug             136
DrugId           103
EaseofUse          5
Effectiveness      5
Reviews          120
Satisfaction      67
Sex               63
Sides             88
UsefulCount       27
dtype: int64

In [179]:
# Frequency tables for the three columns that look wrong in the set-aside rows.
for column_name in ['Satisfaction', 'Sex', 'Sides']:
    display(exper_df[column_name].value_counts())

Satisfaction
  lightheadedness                          27
  constipation                             12
5                                          11
  dizziness                                10
  breast  tenderness                        8
                                           ..
  muscle pain                               1
4                                           1
 it causes my blood sugar to bottom out     1
  heartburn                                 1
2                                           1
Name: count, Length: 67, dtype: int64

Sex
  dizziness                                                                                                                                        24
  headache                                                                                                                                         14
 drowsiness                                                                                                                                         8
 tiredness                                                                                                                                          7
  weight  gain                                                                                                                                      6
                                                                                                                                                   ..
 I have been on Theragran more than six months and it is the longest I\ve gone without a urinary

Sides
 or drowsiness may occur.'                                              19
 or  headache  may occur.'                                               6
 or  weight  changes may occur.'                                         5
 or  trouble sleeping  may occur.'                                       5
 or change in  sex drive /ability may occur.'                            4
                                                                        ..
,5,Female,Dizziness  may occur.'                                         1
,4,Male,Dizziness  may occur.'                                           1
,5,Female, '                                                             1
 or  dizziness  during and after placement of the device may occur.'     1
 or changes in taste may occur.'                                         1
Name: count, Length: 88, dtype: int64

In [270]:
# Rebuild Reviews / Satisfaction / Sex / Sides for the set-aside rows.
# The four columns are glued back into one string, then split on the first
# ",<digits>,<word>," occurrence: text before it is the review, the digits are
# the satisfaction score, the word is the sex, and the remainder is the
# side-effect text.
pattern = r'^(.+?),(\d+),(\w+),(.+)$'
test_data = []
for value in exper_df.loc[:, ['Reviews', 'Satisfaction', 'Sex', 'Sides']].values:
    # Joined with '' (not ','), so only commas already inside a field survive.
    new_value = ''.join(value)
    match = re.match(pattern, new_value)
    # Always append exactly one entry per exper_df row. Dropping non-matching
    # rows (as before) made test_data shorter than exper_df, so a later
    # side-by-side concat paired rebuilt values with the wrong rows after the
    # first non-match. Non-matches now get an all-missing placeholder.
    test_data.append(match.groups() if match else (None, None, None, None))

In [272]:
# List the column names before choosing which ones to keep.
exper_df.columns

Index(['Age', 'Condition', 'Date', 'Drug', 'DrugId', 'EaseofUse',
       'Effectiveness', 'Reviews', 'Satisfaction', 'Sex', 'Sides',
       'UsefulCount'],
      dtype='object')

In [278]:
# Keep every column except the three that are rebuilt separately
# (Satisfaction, Sex, Sides); the original column order is preserved.
kept_columns = [
    name for name in exper_df.columns
    if name not in ('Satisfaction', 'Sex', 'Sides')
]
fix_exper_df = exper_df[kept_columns]

In [280]:
# Preview the columns kept from the set-aside rows.
fix_exper_df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,UsefulCount
0,45-54,"Disorder characterized by Stiff, Tender Painf...",2/18/2010,lyrica,93965,5,5,I hope I never stop taking this med. It takes ...,1
1,55-64,Neuropathic Pain,3/12/2008,lyrica,93965,5,5,I have compressed nerve in my lower back due t...,21
2,65-74,High Amount of Triglyceride in the Blood,12/8/2008,lovaza,148529,5,2,2 years ago I took lovaza and my colestrol wen...,6
3,55-64,High Blood Pressure,3/16/2009,lisinopril solution,6873,5,3,The first couple of months Lisinopril worked f...,12
4,55-64,High Blood Pressure,2/21/2010,lisinopril solution,6873,4,4,Really don\t know if the issues I have is a si...,0


In [279]:
# One row per entry in test_data. The review column is named Reviews_2 so it
# does not collide with the existing Reviews column when the frames are joined.
fix_test_df = pd.DataFrame(test_data, columns=['Reviews_2', 'Satisfaction', 'Sex', 'Sides'])

In [281]:
# Preview the rebuilt review / satisfaction / sex / side-effect values.
fix_test_df[:5]

Unnamed: 0,Reviews_2,Satisfaction,Sex,Sides
0,I hope I never stop taking this med. It takes ...,5,Female,Drowsiness dizziness dry mouth constipati...
1,I have compressed nerve in my lower back due t...,5,Female,Drowsiness dizziness dry mouth constipati...
2,2 years ago I took lovaza and my colestrol wen...,2,Female,Upset stomach burping and strange taste in m...
3,The first couple of months Lisinopril worked f...,1,Female,Dizziness lightheadedness tiredness or hea...
4,Really don\t know if the issues I have is a si...,4,Male,Dizziness lightheadedness tiredness or hea...


In [285]:
# Put the kept columns and the rebuilt columns side by side.
# NOTE(review): axis=1 pairs rows by index label, so the result is only correct
# if row i of fix_test_df was derived from row i of exper_df - verify.
final_exper_df = pd.concat([fix_exper_df, fix_test_df], axis=1)
final_exper_df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,UsefulCount,Reviews_2,Satisfaction,Sex,Sides
0,45-54,"Disorder characterized by Stiff, Tender Painf...",2/18/2010,lyrica,93965,5,5,I hope I never stop taking this med. It takes ...,1,I hope I never stop taking this med. It takes ...,5,Female,Drowsiness dizziness dry mouth constipati...
1,55-64,Neuropathic Pain,3/12/2008,lyrica,93965,5,5,I have compressed nerve in my lower back due t...,21,I have compressed nerve in my lower back due t...,5,Female,Drowsiness dizziness dry mouth constipati...
2,65-74,High Amount of Triglyceride in the Blood,12/8/2008,lovaza,148529,5,2,2 years ago I took lovaza and my colestrol wen...,6,2 years ago I took lovaza and my colestrol wen...,2,Female,Upset stomach burping and strange taste in m...
3,55-64,High Blood Pressure,3/16/2009,lisinopril solution,6873,5,3,The first couple of months Lisinopril worked f...,12,The first couple of months Lisinopril worked f...,1,Female,Dizziness lightheadedness tiredness or hea...
4,55-64,High Blood Pressure,2/21/2010,lisinopril solution,6873,4,4,Really don\t know if the issues I have is a si...,0,Really don\t know if the issues I have is a si...,4,Male,Dizziness lightheadedness tiredness or hea...


In [292]:
# Count rows that have no rebuilt values (NaN in the columns taken from fix_test_df).
final_exper_df.isna().sum()

Age               0
Condition         0
Date              0
Drug              0
DrugId            0
EaseofUse         0
Effectiveness     0
Reviews           0
UsefulCount       0
Reviews_2        17
Satisfaction     17
Sex              17
Sides            17
dtype: int64

In [322]:
# Show the original and the rebuilt review text of one row next to each other,
# separated by a visible marker, to eyeball whether the rows are paired correctly.
separator = '|============================|'
a = final_exper_df['Reviews'] + separator + final_exper_df['Reviews_2']
a[50]

