In [2]:
import pandas as pd
import numpy as np
import csv
import re

In [3]:
DATASET_PATH = './Assets/Datasets/dataset_'

In [4]:
# Peek at the first 31 raw lines (idx 0..30) of the dataset file to inspect
# its header before parsing. zip() with a range bounds the iteration, so no
# explicit break is needed.
with open(DATASET_PATH, 'r') as file:
    for idx, raw_line in zip(range(31), file):
        line = raw_line.strip()
        print(f'{line} --> idx: {idx}')

% Context --> idx: 0
% The dataset provides user reviews on specific drugs along with related conditions, side effects, age, sex, and ratings reflecting overall patient satisfaction. --> idx: 1
% Content --> idx: 2
% Data was acquired by scraping WebMD site. There are around 0.36 million rows of unique reviews and is updated till Mar 2020. --> idx: 3
% Inspiration --> idx: 4
% This dataset intended to answer following questions: --> idx: 5
% I. Identifying the condition of the patient based on drug reviews? --> idx: 6
% II. How to predict drug rating based on patients reviews? --> idx: 7
% III. How to visualize drug rating, kind of drugs, types of conditions a patient can have, sentiments based on reviews --> idx: 8
@RELATION WebMD-Drug-Reviews-Dataset --> idx: 9
 --> idx: 10
@ATTRIBUTE Age STRING --> idx: 11
@ATTRIBUTE Condition STRING --> idx: 12
@ATTRIBUTE Date STRING --> idx: 13
@ATTRIBUTE Drug STRING --> idx: 14
@ATTRIBUTE DrugId INTEGER --> idx: 15
@ATTRIBUTE EaseofUse INTEGER --

In [6]:
# Accumulators filled by the parsing loop: rows whose Sex field looks valid
# go to final_data, every other row goes to exper_data.
final_data, exper_data = [], []

# Column names, in the order the fields are assembled for each parsed row.
columns = ['Age', 'Condition', 'Date', 'Drug', 'DrugId']
columns += ['EaseofUse', 'Effectiveness', 'Reviews', 'Satisfaction']
columns += ['Sex', 'Sides', 'UsefulCount']

def extract_sex(description):
    """Normalise a raw ``Sex`` field value.

    Parameters
    ----------
    description : str
        Raw field text. Leading and trailing whitespace is ignored.

    Returns
    -------
    str, float or None
        * ``'male'`` or ``'female'`` (lower-cased) when the stripped text is
          exactly one of those words, compared case-insensitively;
        * ``np.nan`` when the stripped text is empty;
        * ``None`` for any other text (same as the original implicit
          fall-through, now spelled out).
    """
    # Strip leading and trailing whitespace.
    description = description.strip()
    # Accept only the bare words "male" / "female".
    if re.fullmatch(r'male|female', description, flags=re.I):
        return description.lower()
    # Empty or whitespace-only input is treated as a missing value.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    if description == "":
        return np.nan
    # Anything else is not a recognisable sex value.
    return None

# print(columns[-3:])
with open(DATASET_PATH, 'r') as file:
    # Consume the header: advance the file iterator past the '@DATA' marker.
    for header_line in file:
        if header_line.strip() == '@DATA':
            break

    # Everything after the marker is parsed as comma-separated records with
    # single-quote quoting.
    for row in csv.reader(file, delimiter=',', quotechar="'", doublequote=True):
        # The first 7 and last 4 fields are taken positionally; whatever lies
        # in between is concatenated (no separator) into the Reviews field.
        leading, overflow, trailing = row[:7], row[7:-4], row[-4:]
        data_prep = leading + [''.join(overflow)] + trailing
        sex_value = data_prep[-3].strip()

        # A row is considered well-formed when its Sex slot is empty or is
        # exactly "male"/"female" (any case); other rows are set aside.
        sex_is_plausible = sex_value == '' or bool(
            re.fullmatch(r'male|female', sex_value, flags=re.I)
        )
        if sex_is_plausible:
            final_data.append(data_prep)
        else:
            exper_data.append(data_prep)


In [7]:
df = pd.DataFrame(final_data, columns=columns)

In [8]:
df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I\m a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 powder,144731,2,2,FALLING AND DON\T REALISE IT',1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1


In [9]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362639 entries, 0 to 362638
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Age            362639 non-null  object
 1   Condition      362639 non-null  object
 2   Date           362639 non-null  object
 3   Drug           362639 non-null  object
 4   DrugId         362639 non-null  object
 5   EaseofUse      362639 non-null  object
 6   Effectiveness  362639 non-null  object
 7   Reviews        362639 non-null  object
 8   Satisfaction   362639 non-null  object
 9   Sex            362639 non-null  object
 10  Sides          362639 non-null  object
 11  UsefulCount    362639 non-null  object
dtypes: object(12)
memory usage: 399.5 MB


In [10]:
df.nunique()

Age                  12
Condition          1806
Date               4525
Drug               7096
DrugId             6573
EaseofUse            13
Effectiveness         7
Reviews          250042
Satisfaction          7
Sex                   3
Sides              1651
UsefulCount         148
dtype: int64

In [11]:
# Show the frequency table of each low-cardinality column, one after another.
for column_name in ('Age', 'EaseofUse', 'Effectiveness', 'Satisfaction', 'Sex'):
    display(df[column_name].value_counts())

Age
45-54         80007
55-64         75075
35-44         54982
25-34         49707
65-74         41201
19-24         24229
75 or over    15218
              12201
13-18          7040
7-12           1644
3-6             838
0-2             497
Name: count, dtype: int64

EaseofUse
5         192527
4          74708
3          41286
1          35906
2          18179
62261         18
154215         6
11147          3
6              2
10149          1
10             1
169930         1
57948          1
Name: count, dtype: int64

Effectiveness
5     130321
4      81785
3      60386
1      59354
2      30790
6          2
10         1
Name: count, dtype: int64

Satisfaction
5     111487
1     100856
4      63128
3      51837
2      35328
6          2
10         1
Name: count, dtype: int64

Sex
Female    238119
Male       98000
           26520
Name: count, dtype: int64

In [12]:
df.isna().sum()

Age              0
Condition        0
Date             0
Drug             0
DrugId           0
EaseofUse        0
Effectiveness    0
Reviews          0
Satisfaction     0
Sex              0
Sides            0
UsefulCount      0
dtype: int64

In [13]:
# df[['Condition', 'Date', 'Drug', 'DrugId', 'EaseofUse', 'Effectiveness']].sample(n=40)
df[['Drug']].sample(n=50)

Unnamed: 0,Drug
63455,nexplanon implant
262021,effexor xr
264420,gianvi
199713,terbinafine hcl
113725,prefera ob
46090,norvasc
126658,pepcid complete
312006,benicar
95247,metformin hcl
356799,celebrex


In [14]:
# df[df['Reviews'].str.match(r'^\d{1,}[a-z]')]

In [16]:
display(df['EaseofUse'].value_counts())

EaseofUse
5         192527
4          74708
3          41286
1          35906
2          18179
62261         18
154215         6
11147          3
6              2
10149          1
10             1
169930         1
57948          1
Name: count, dtype: int64

In [19]:
# Rows whose EaseofUse slot holds one of the out-of-range values seen in the
# value_counts() output above. These look like DrugId values that slid one
# column to the right — NOTE(review): confirm against the raw rows.
# .copy() makes prep_df an independent frame, so the in-place repairs done in
# the following cells are unambiguous and do not raise SettingWithCopyWarning.
prep_df = df[df['EaseofUse'].isin(['62261', '154215', '11147', '10149', '169930', '57948'])].copy()

In [20]:
prep_df[:10]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
127375,55-64,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',4/25/2016,pedialyte oral solution,11147,3,"3After small bowel resection, I had diarrhea a...",1,,Mild nausea and vomiting may occur.,0
127377,45-54,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',4/26/2014,pedialyte oral solution,11147,5,4Felt once I drank this liquid I felt more ene...,5,Female,Mild nausea and vomiting may occur.,1
127378,25-34,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',4/5/2014,pedialyte oral solution,11147,3,2my 2 yr old son has been sick with diarhea an...,1,Female,Mild nausea and vomiting may occur.,1
174145,75 or over,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',8/6/2009,yte,10149,4,"5I have low sodium and this has helped, altho ...",4,Female,Mild nausea and vomiting may occur.,1
307059,55-64,Plantar Warts,12/16/2016,dr scholl\s clear away adhesive patch,medicated',154215,1,1small discs did not stay in place - pads were...,1,Male,"Slight burning, skin redness, and peeling ma...",2
307060,25-34,Plantar Warts,3/6/2014,dr scholl\s clear away adhesive patch,medicated',154215,2,2I have used Dr. Scholl\s salicylic acid and t...,1,Female,"Slight burning, skin redness, and peeling ma...",3
307061,19-24,Corn,2/8/2013,dr scholl\s clear away adhesive patch,medicated',154215,5,4I\ve had a small hard corn on both of my pink...,3,Female,"Slight burning, skin redness, and peeling ma...",1
307062,19-24,Common Wart,1/29/2012,dr scholl\s clear away adhesive patch,medicated',154215,4,5I had two common warts on my hands and used D...,5,Female,"Slight burning, skin redness, and peeling ma...",0
307063,55-64,Plantar Warts,7/26/2011,dr scholl\s clear away adhesive patch,medicated',154215,2,1Medicated pads do not adhere well to skin and...,1,Male,"Slight burning, skin redness, and peeling ma...",2
307066,25-34,Common Wart,4/22/2011,dr scholl\s clear away adhesive patch,medicated',154215,3,2it will take the wart way for about 6 weeks t...,1,Female,"Slight burning, skin redness, and peeling ma...",1


In [21]:
# Repair the first four rows of prep_df, where the Condition text was split
# across the Condition and Date columns and every following field slid one
# column to the right.
# Index labels are addressed directly with a single .loc call: the previous
# chained form (prep_df.iloc[:4].loc[...] = ...) assigns into a temporary
# object and is not guaranteed to modify prep_df (it is a no-op under pandas
# Copy-on-Write).
# NOTE: this cell is not idempotent — running it twice shifts the columns twice.
cond_rows = prep_df.index[:4]
prep_df.loc[cond_rows, 'Condition'] = prep_df.loc[cond_rows, 'Condition'] + '' + prep_df.loc[cond_rows, 'Date']
prep_df.loc[cond_rows, 'Date'] = prep_df.loc[cond_rows, 'Drug']
prep_df.loc[cond_rows, 'Drug'] = prep_df.loc[cond_rows, 'DrugId']
prep_df.loc[cond_rows, 'DrugId'] = prep_df.loc[cond_rows, 'EaseofUse']

In [22]:
prep_df[:15]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
127375,55-64,Treatment of Depletion of Body\s Supply of Sod...,4/25/2016,pedialyte oral solution,11147,11147,3,"3After small bowel resection, I had diarrhea a...",1,,Mild nausea and vomiting may occur.,0
127377,45-54,Treatment of Depletion of Body\s Supply of Sod...,4/26/2014,pedialyte oral solution,11147,11147,5,4Felt once I drank this liquid I felt more ene...,5,Female,Mild nausea and vomiting may occur.,1
127378,25-34,Treatment of Depletion of Body\s Supply of Sod...,4/5/2014,pedialyte oral solution,11147,11147,3,2my 2 yr old son has been sick with diarhea an...,1,Female,Mild nausea and vomiting may occur.,1
174145,75 or over,Treatment of Depletion of Body\s Supply of Sod...,8/6/2009,yte,10149,10149,4,"5I have low sodium and this has helped, altho ...",4,Female,Mild nausea and vomiting may occur.,1
307059,55-64,Plantar Warts,12/16/2016,dr scholl\s clear away adhesive patch,medicated',154215,1,1small discs did not stay in place - pads were...,1,Male,"Slight burning, skin redness, and peeling ma...",2
307060,25-34,Plantar Warts,3/6/2014,dr scholl\s clear away adhesive patch,medicated',154215,2,2I have used Dr. Scholl\s salicylic acid and t...,1,Female,"Slight burning, skin redness, and peeling ma...",3
307061,19-24,Corn,2/8/2013,dr scholl\s clear away adhesive patch,medicated',154215,5,4I\ve had a small hard corn on both of my pink...,3,Female,"Slight burning, skin redness, and peeling ma...",1
307062,19-24,Common Wart,1/29/2012,dr scholl\s clear away adhesive patch,medicated',154215,4,5I had two common warts on my hands and used D...,5,Female,"Slight burning, skin redness, and peeling ma...",0
307063,55-64,Plantar Warts,7/26/2011,dr scholl\s clear away adhesive patch,medicated',154215,2,1Medicated pads do not adhere well to skin and...,1,Male,"Slight burning, skin redness, and peeling ma...",2
307066,25-34,Common Wart,4/22/2011,dr scholl\s clear away adhesive patch,medicated',154215,3,2it will take the wart way for about 6 weeks t...,1,Female,"Slight burning, skin redness, and peeling ma...",1


In [23]:
# In the shifted rows the Reviews text starts with a run of digits that
# belongs to the Effectiveness column. Split every review into
# (leading digits, rest of the text) and collect both parts.
split_parts = [re.match(r'(\d+)(.*)', text).groups() for text in prep_df['Reviews']]

Effectiveness = [digits for digits, _ in split_parts]
Reviews = [remainder for _, remainder in split_parts]

In [24]:
Effectiveness[:5], Reviews[:5]

(['3', '4', '2', '5', '1'],
 ["After small bowel resection, I had diarrhea and nausea. My doctor recommended Pedialyte but the taste is so nasty I can\\t drink it. Why do you make it so sweet and fake? It coats my mouth with sugar and red dye. Doesn\\'t sound very therapeutic to me. Ugh'",
  'Felt once I drank this liquid I felt more energized, my body was functioning better, and felt like my body was filling up in a goodway, like my pores were happy to be receiving such vitamins and help',
  'my 2 yr old son has been sick with diarhea and did not want to continue drinking this product. ive insisted that he keep drinking and during dinner he vomitted ALL of the liquid that was in his stomach. i will not be giving this to him again. he has experienced many of the side effects that i think should be CLEARLY printed on the label of this product. i only learned about the side effects from this website. i do not recommend this product for toddlers the side effects are too severe. side effec

In [25]:
# Repair the rows from position 4 onward, where the Drug name was split across
# the Drug and DrugId columns (assumes every such row has this layout — only
# a sample is displayed above; TODO confirm).
# Index labels are addressed directly with a single .loc call: the previous
# chained form (prep_df.iloc[4:].loc[...] = ...) assigns into a temporary
# object and is not guaranteed to modify prep_df (it is a no-op under pandas
# Copy-on-Write).
drug_rows = prep_df.index[4:]
prep_df.loc[drug_rows, 'Drug'] = prep_df.loc[drug_rows, 'Drug'] + '' + prep_df.loc[drug_rows, 'DrugId']
prep_df.loc[drug_rows, 'DrugId'] = prep_df.loc[drug_rows, 'EaseofUse']

# For all rows of prep_df: the value sitting in Effectiveness moves to
# EaseofUse, and the digit/text pair split out of Reviews fills
# Effectiveness and Reviews.
prep_df.loc[:, 'EaseofUse'] = prep_df['Effectiveness']
prep_df.loc[:, 'Effectiveness'] = Effectiveness
prep_df.loc[:, 'Reviews'] = Reviews

In [26]:
prep_df[:15]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
127375,55-64,Treatment of Depletion of Body\s Supply of Sod...,4/25/2016,pedialyte oral solution,11147,3,3,"After small bowel resection, I had diarrhea an...",1,,Mild nausea and vomiting may occur.,0
127377,45-54,Treatment of Depletion of Body\s Supply of Sod...,4/26/2014,pedialyte oral solution,11147,5,4,Felt once I drank this liquid I felt more ener...,5,Female,Mild nausea and vomiting may occur.,1
127378,25-34,Treatment of Depletion of Body\s Supply of Sod...,4/5/2014,pedialyte oral solution,11147,3,2,my 2 yr old son has been sick with diarhea and...,1,Female,Mild nausea and vomiting may occur.,1
174145,75 or over,Treatment of Depletion of Body\s Supply of Sod...,8/6/2009,yte,10149,4,5,"I have low sodium and this has helped, altho m...",4,Female,Mild nausea and vomiting may occur.,1
307059,55-64,Plantar Warts,12/16/2016,dr scholl\s clear away adhesive patch medicated',154215,1,1,small discs did not stay in place - pads were ...,1,Male,"Slight burning, skin redness, and peeling ma...",2
307060,25-34,Plantar Warts,3/6/2014,dr scholl\s clear away adhesive patch medicated',154215,2,2,I have used Dr. Scholl\s salicylic acid and th...,1,Female,"Slight burning, skin redness, and peeling ma...",3
307061,19-24,Corn,2/8/2013,dr scholl\s clear away adhesive patch medicated',154215,5,4,I\ve had a small hard corn on both of my pinky...,3,Female,"Slight burning, skin redness, and peeling ma...",1
307062,19-24,Common Wart,1/29/2012,dr scholl\s clear away adhesive patch medicated',154215,4,5,I had two common warts on my hands and used Dr...,5,Female,"Slight burning, skin redness, and peeling ma...",0
307063,55-64,Plantar Warts,7/26/2011,dr scholl\s clear away adhesive patch medicated',154215,2,1,Medicated pads do not adhere well to skin and ...,1,Male,"Slight burning, skin redness, and peeling ma...",2
307066,25-34,Common Wart,4/22/2011,dr scholl\s clear away adhesive patch medicated',154215,3,2,it will take the wart way for about 6 weeks th...,1,Female,"Slight burning, skin redness, and peeling ma...",1


In [27]:
df.loc[prep_df.index] = prep_df

In [28]:
# Re-check the frequency tables after the shifted rows were written back.
for column_name in ('Age', 'EaseofUse', 'Effectiveness', 'Satisfaction', 'Sex'):
    display(df[column_name].value_counts())

Age
45-54         80007
55-64         75075
35-44         54982
25-34         49707
65-74         41201
19-24         24229
75 or over    15218
              12201
13-18          7040
7-12           1644
3-6             838
0-2             497
Name: count, dtype: int64

EaseofUse
5     192545
4      74712
3      41290
1      35908
2      18181
6          2
10         1
Name: count, dtype: int64

Effectiveness
5     130320
4      81785
3      60383
1      59357
2      30791
6          2
10         1
Name: count, dtype: int64

Satisfaction
5     111487
1     100856
4      63128
3      51837
2      35328
6          2
10         1
Name: count, dtype: int64

Sex
Female    238119
Male       98000
           26520
Name: count, dtype: int64

In [29]:
df.nunique()

Age                  12
Condition          1806
Date               4524
Drug               7093
DrugId             6572
EaseofUse             7
Effectiveness         7
Reviews          250041
Satisfaction          7
Sex                   3
Sides              1651
UsefulCount         148
dtype: int64

In [30]:
df.loc[362351]

Age                                                          35-44
Condition                                             Stop Smoking
Date                                                     3/31/2010
Drug                                                       chantix
DrugId                                                      144470
EaseofUse                                                        5
Effectiveness                                                    5
Reviews          25 years of smoking. Smoked the first week lik...
Satisfaction                                                     5
Sex                                                         Female
Sides            Nausea ,  headache ,  vomiting , drowsiness, g...
UsefulCount                                                     16
Name: 362351, dtype: object

In [31]:
# Replace empty / whitespace-only string cells with NaN so that pandas'
# missing-value tooling (isna, value_counts, ...) recognises them.
# - np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# - The isinstance guard leaves non-string cells untouched instead of letting
#   re.search raise TypeError on them.
cleaned_df = df.map(
    lambda x: np.nan if isinstance(x, str) and re.search(r'^\s*$', x) else x
)

In [32]:
cleaned_df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I\m a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 powder,144731,2,2,FALLING AND DON\T REALISE IT',1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1


In [33]:
cleaned_df.shape

(362639, 12)

In [34]:
cleaned_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362639 entries, 0 to 362638
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Age            350438 non-null  object
 1   Condition      362596 non-null  object
 2   Date           362639 non-null  object
 3   Drug           362639 non-null  object
 4   DrugId         362639 non-null  object
 5   EaseofUse      362639 non-null  object
 6   Effectiveness  362639 non-null  object
 7   Reviews        320808 non-null  object
 8   Satisfaction   362639 non-null  object
 9   Sex            336119 non-null  object
 10  Sides          345184 non-null  object
 11  UsefulCount    362639 non-null  object
dtypes: object(12)
memory usage: 397.1 MB


In [35]:
cleaned_df.isna().sum()

Age              12201
Condition           43
Date                 0
Drug                 0
DrugId               0
EaseofUse            0
Effectiveness        0
Reviews          41831
Satisfaction         0
Sex              26520
Sides            17455
UsefulCount          0
dtype: int64

In [39]:
cleaned_df[cleaned_df['Sides'].isna()]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
2,65-74,Other,7/16/2012,warfarin (bulk) 100 powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 powder,144731,2,2,FALLING AND DON\T REALISE IT',1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1
5,55-64,Other,7/19/2008,warfarin (bulk) 100 powder,144731,4,4,help heart condition operation well,4,Male,,0
10,65-74,Other,3/15/2016,pyrogallol crystals,12112,5,5,Excellent in reducing inlamation associated wi...,5,Male,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
361408,25-34,Poisoning from Swallowed Unknown Substance,7/15/2010,"charcoal, activated",948,3,3,It\s helped me before.',3,Male,,0
361409,,Other,3/3/2010,"charcoal, activated",948,2,2,,2,,,0
361410,75 or over,Gas,2/12/2009,"charcoal, activated",948,3,1,It leaves a bad metalic taste in your mouth th...,1,Male,,0
361411,45-54,Poisoning from Swallowed Unknown Substance,11/5/2007,"charcoal, activated",948,5,5,,5,Male,,3


In [41]:
cleaned_df[cleaned_df['Sex'].isna()]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
11,19-24,Birth Control,11/17/2018,lyza,164750,5,5,Taking Lyza made me break out HORRIBLY. I trie...,2,,"Nausea , vomiting , headache , bloating , ...",0
14,35-44,Birth Control,2/13/2018,lyza,164750,5,5,I was concerned about starting a bc pill due t...,5,,"Nausea , vomiting , headache , bloating , ...",0
15,25-34,Birth Control,12/9/2017,lyza,164750,2,2,The birth control was very easy to use and it ...,1,,"Nausea , vomiting , headache , bloating , ...",1
17,25-34,Birth Control,9/27/2017,lyza,164750,4,4,I??ve been taking Lyza for years now and I abs...,1,,"Nausea , vomiting , headache , bloating , ...",1
20,35-44,Birth Control,5/6/2016,lyza,164750,5,5,I took this pill for a month. It was so awful ...,1,,"Nausea , vomiting , headache , bloating , ...",3
...,...,...,...,...,...,...,...,...,...,...,...,...
362404,,Stop Smoking,1/19/2010,chantix,144470,5,5,,3,,"Nausea , headache , vomiting , drowsiness, g...",0
362478,,Stop Smoking,2/26/2009,chantix,144470,5,3,I have just started to take this medication. T...,3,,"Nausea , headache , vomiting , drowsiness, g...",9
362506,45-54,Stop Smoking,7/27/2009,chantix,144470,5,5,Was on Chantix for 3 months beginning Feb 07. ...,5,,"Nausea , headache , vomiting , drowsiness, g...",9
362633,55-64,Stop Smoking,11/14/2008,chantix,144470,5,5,"I goofed,it was june 16th of 2007 that I had m...",5,,"Nausea , headache , vomiting , drowsiness, g...",0


In [137]:
cleaned_df.nunique()

Age                  11
Condition          1805
Date               4524
Drug               7093
DrugId             6572
EaseofUse             7
Effectiveness         7
Reviews          250038
Satisfaction          7
Sex                   2
Sides              1650
UsefulCount         148
dtype: int64

In [141]:
# Frequency tables of the cleaned frame (blank cells are now NaN and therefore
# no longer appear as their own category).
for column_name in ('Age', 'EaseofUse', 'Effectiveness', 'Satisfaction', 'Sex'):
    display(cleaned_df[column_name].value_counts())

Age
45-54         80007
55-64         75075
35-44         54982
25-34         49707
65-74         41201
19-24         24229
75 or over    15218
13-18          7040
7-12           1644
3-6             838
0-2             497
Name: count, dtype: int64

EaseofUse
5     192545
4      74712
3      41290
1      35908
2      18181
6          2
10         1
Name: count, dtype: int64

Effectiveness
5     130320
4      81785
3      60383
1      59357
2      30791
6          2
10         1
Name: count, dtype: int64

Satisfaction
5     111487
1     100856
4      63128
3      51837
2      35328
6          2
10         1
Name: count, dtype: int64

Sex
Female    238119
Male       98000
Name: count, dtype: int64

In [97]:
cleaned_df['Condition'].value_counts()

Condition
Other                                                                             47648
Pain                                                                              23624
High Blood Pressure                                                               21627
Depression                                                                        13807
Birth Control                                                                     11192
                                                                                  ...  
Hospital-Acquired Pseudomonas Aeruginosa Pneumonia Treated with Multiple Drugs        1
Bacterial Blood Infection caused by Pseudomonas Aeruginosa                            1
Defect of Connective Tissue - Noonan\s Syndrome'                                      1
Infection of Female Pelvic Organs caused by Klebsiella                                1
Complicated Skin Infection due to Peptostreptococcus Bacteria                         1
Name: count, Length: 1

In [181]:
# NOTE(review): the variable is named "hipertension_condition" but the filter
# below selects rows whose Condition is 'Depression' — confirm which is meant.
hipertension_condition = cleaned_df.loc[
    cleaned_df['Condition'] == 'Depression',
    ['Age', 'Drug', 'Condition', 'Sides'],
]

# Count (Age, Drug) pairs, pivot to a Drug x Age table, total each drug across
# the age groups and return the drug with the largest total.
drug_by_age = hipertension_condition[['Age', 'Drug']].value_counts().unstack().T
drug_by_age.fillna(0).sum(axis=1).idxmax() # 'lexapro'
# hipertension_condition[hipertension_condition['Drug'] == 'lexapro']['Age'].value_counts()

'lexapro'

In [199]:
exper_df = pd.DataFrame(exper_data, columns=columns)

In [200]:
exper_df.shape

(167, 12)

In [201]:
exper_df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,45-54,"Disorder characterized by Stiff, Tender Painf...",2/18/2010,lyrica,93965,5,5,I hope I never stop taking this med. It takes ...,constipation,difficulty concentrating,or weight gain may occur.',1
1,55-64,Neuropathic Pain,3/12/2008,lyrica,93965,5,5,I have compressed nerve in my lower back due t...,constipation,difficulty concentrating,or weight gain may occur.',21
2,65-74,High Amount of Triglyceride in the Blood,12/8/2008,lovaza,148529,5,2,2 years ago I took lovaza and my colestrol wen...,",2,Female,Upset stomach",burping,and strange taste in mouth may occur.',6
3,55-64,High Blood Pressure,3/16/2009,lisinopril solution,6873,5,3,The first couple of months Lisinopril worked f...,lightheadedness,tiredness,or headache may occur as your body adjusts ...,12
4,55-64,High Blood Pressure,2/21/2010,lisinopril solution,6873,4,4,Really don\t know if the issues I have is a si...,lightheadedness,tiredness,or headache may occur as your body adjusts ...,0


In [202]:
exper_df.nunique()

Age                9
Condition         60
Date             120
Drug             136
DrugId           103
EaseofUse          5
Effectiveness      5
Reviews          120
Satisfaction      67
Sex               63
Sides             88
UsefulCount       27
dtype: int64

In [203]:
display(exper_df['Satisfaction'].value_counts())
display(exper_df['Sex'].value_counts())
display(exper_df['Sides'].value_counts())

Satisfaction
  lightheadedness                          27
  constipation                             12
5                                          11
  dizziness                                10
  breast  tenderness                        8
                                           ..
  muscle pain                               1
4                                           1
 it causes my blood sugar to bottom out     1
  heartburn                                 1
2                                           1
Name: count, Length: 67, dtype: int64

Sex
  dizziness                                                                                                                                        24
  headache                                                                                                                                         14
 drowsiness                                                                                                                                         8
 tiredness                                                                                                                                          7
  weight  gain                                                                                                                                      6
                                                                                                                                                   ..
 I have been on Theragran more than six months and it is the longest I\ve gone without a urinary

Sides
 or drowsiness may occur.'                                              19
 or  headache  may occur.'                                               6
 or  weight  changes may occur.'                                         5
 or  trouble sleeping  may occur.'                                       5
 or change in  sex drive /ability may occur.'                            4
                                                                        ..
,5,Female,Dizziness  may occur.'                                         1
,4,Male,Dizziness  may occur.'                                           1
,5,Female, '                                                             1
 or  dizziness  during and after placement of the device may occur.'     1
 or changes in taste may occur.'                                         1
Name: count, Length: 88, dtype: int64

In [83]:
# For the set-aside rows, glue the four trailing text columns back together
# and re-split them as "<review>,<digits>,<word>,<rest>".
# Rows that do not match the pattern are dropped, so test_data can be shorter
# than exper_df.
pattern = r'^(.+?),(\d+),(\w+),(.+)$'
shifted_columns = ['Reviews', 'Satisfaction', 'Sex', 'Sides']

joined_rows = (''.join(parts) for parts in exper_df.loc[:, shifted_columns].values)
row_matches = (re.match(pattern, text) for text in joined_rows)
test_data = [found.groups() for found in row_matches if found]

NameError: name 'exper_df' is not defined

In [240]:
len(test_data)

150

In [241]:
for value in test_data[:4]:
    print(value)

("I hope I never stop taking this med. It takes a way the crippling stiff muscles, with out it I can\\t roll over in bed without being in sever pain my skin doesn\\'t feel raw.I can get a hug without cringing from pain. I stoped taking it because I thought it wasn\\'t helping. I was quick to take it again to me it has been a god send. I have NO side effects from it. I just can\\'t tell you how much it has helped me with Fibral mialgie.The cramps that your body goes through is crushateing. Mucles get hard as rocks  gets huge bumps. I hope this has helped some one. It takes a little time to work. But it does work ", '5', 'Female', "Drowsiness  dizziness   dry mouth   constipation  difficulty concentrating or  weight  gain may occur.'")
('I have compressed nerve in my lower back due to a fall and I was having a hard time functioning and one morning I got up and the pain was so bad I could not move. I was really scared. I went to my family dr. and she presribed this and only 1 pill and I f

In [205]:
exper_df.columns

Index(['Age', 'Condition', 'Date', 'Drug', 'DrugId', 'EaseofUse',
       'Effectiveness', 'Reviews', 'Satisfaction', 'Sex', 'Sides',
       'UsefulCount'],
      dtype='object')

In [216]:
fix_exper_df = exper_df[['Age', 'Condition', 'Date', 'Drug', 'DrugId', 'EaseofUse', 'Effectiveness', 'UsefulCount']]

In [217]:
fix_exper_df[:5]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,UsefulCount
0,45-54,"Disorder characterized by Stiff, Tender Painf...",2/18/2010,lyrica,93965,5,5,1
1,55-64,Neuropathic Pain,3/12/2008,lyrica,93965,5,5,21
2,65-74,High Amount of Triglyceride in the Blood,12/8/2008,lovaza,148529,5,2,6
3,55-64,High Blood Pressure,3/16/2009,lisinopril solution,6873,5,3,12
4,55-64,High Blood Pressure,2/21/2010,lisinopril solution,6873,4,4,0


In [218]:
len(test_data[0])

4

In [219]:
fix_test_df = pd.DataFrame(test_data, columns=['Reviews', 'Satisfaction', 'Sex', 'Sides'])

In [225]:
fix_test_df[:5]

Unnamed: 0,Reviews,Satisfaction,Sex,Sides
0,I hope I never stop taking this med. It takes ...,5,Female,Drowsiness dizziness dry mouth constipati...
1,I have compressed nerve in my lower back due t...,5,Female,Drowsiness dizziness dry mouth constipati...
2,2 years ago I took lovaza and my colestrol wen...,2,Female,Upset stomach burping and strange taste in m...
3,The first couple of months Lisinopril worked f...,1,Female,Dizziness lightheadedness tiredness or hea...
4,Really don\t know if the issues I have is a si...,4,Male,Dizziness lightheadedness tiredness or hea...


In [235]:
for value_row in fix_test_df['Reviews']:
    print(value_row)

I hope I never stop taking this med. It takes a way the crippling stiff muscles, with out it I can\t roll over in bed without being in sever pain my skin doesn\'t feel raw.I can get a hug without cringing from pain. I stoped taking it because I thought it wasn\'t helping. I was quick to take it again to me it has been a god send. I have NO side effects from it. I just can\'t tell you how much it has helped me with Fibral mialgie.The cramps that your body goes through is crushateing. Mucles get hard as rocks  gets huge bumps. I hope this has helped some one. It takes a little time to work. But it does work 
I have compressed nerve in my lower back due to a fall and I was having a hard time functioning and one morning I got up and the pain was so bad I could not move. I was really scared. I went to my family dr. and she presribed this and only 1 pill and I felt such relief. I have been on this for about 1 and half years and probsbly will be on it for the rest of my life.I don\t want surg