In [1]:
import pandas as pd
import numpy as np
import csv
import re

In [3]:
# Path to the raw dataset file that the cells below open and parse.
DATASET_PATH = './Assets/Datasets/dataset_'

In [4]:
# Preview the raw file: echo the first 25 physical lines (idx 0-24) together
# with their position, then parse the lines that follow as CSV and show up to
# 10 parsed rows with their field counts.
with open(DATASET_PATH, 'r') as dataset_file:
    for line_no, raw_line in enumerate(dataset_file):
        print(f'{raw_line.strip()} --> idx: {line_no}')
        if line_no == 24:
            break

    # The reader continues from wherever the loop above stopped in the file.
    preview_reader = csv.reader(dataset_file, delimiter=',', quotechar="'", doublequote=True)
    for _, parsed_row in zip(range(10), preview_reader):
        print(f'length of data: {len(parsed_row)} --> value list: {parsed_row}')

% Context --> idx: 0
% The dataset provides user reviews on specific drugs along with related conditions, side effects, age, sex, and ratings reflecting overall patient satisfaction. --> idx: 1
% Content --> idx: 2
% Data was acquired by scraping WebMD site. There are around 0.36 million rows of unique reviews and is updated till Mar 2020. --> idx: 3
% Inspiration --> idx: 4
% This dataset intended to answer following questions: --> idx: 5
% I. Identifying the condition of the patient based on drug reviews? --> idx: 6
% II. How to predict drug rating based on patients reviews? --> idx: 7
% III. How to visualize drug rating, kind of drugs, types of conditions a patient can have, sentiments based on reviews --> idx: 8
@RELATION WebMD-Drug-Reviews-Dataset --> idx: 9
 --> idx: 10
@ATTRIBUTE Age STRING --> idx: 11
@ATTRIBUTE Condition STRING --> idx: 12
@ATTRIBUTE Date STRING --> idx: 13
@ATTRIBUTE Drug STRING --> idx: 14
@ATTRIBUTE DrugId INTEGER --> idx: 15
@ATTRIBUTE EaseofUse INTEGER --

In [15]:
# Names of the 12 columns used when building the DataFrames below; the first
# six match the order of the @ATTRIBUTE lines printed from the file header.
columns = [
    'Age', 'Condition', 'Date', 'Drug',
    'DrugId', 'EaseofUse', 'Effectiveness', 'Reviews',
    'Satisfaction', 'Sex', 'Sides', 'UsefulCount',
]

In [5]:
# Split the data section of the file into rows that parsed to exactly 12
# fields (true_data) and rows that did not (messy_data).
true_data, messy_data = [], []
with open(DATASET_PATH, 'r') as dataset_file:
    # Consume the header lines up to and including the '@DATA' marker.
    for header_line in dataset_file:
        if header_line.strip() == '@DATA':
            break

    record_reader = csv.reader(dataset_file, delimiter=',', quotechar="'", doublequote=True)
    for record in record_reader:
        # The number of attributes is 12, so any other field count means the
        # row was not split the way the header describes.
        if len(record) == 12:
            true_data.append(record)
        else:
            messy_data.append(record)

In [6]:
# Report how many rows parsed to 12 fields and how many did not.
print(f'Number of True data: {len(true_data)}')
print(f'Number of Messy data: {len(messy_data)}')

Number of True data: 297967
Number of Messy data: 64839


In [7]:
# Show the first five well-formed rows with their field counts.
for record in true_data[:5]:
    print(f'length of data: {len(record)} --> value list: {record}')

length of data: 12 --> value list: ['75 or over', 'Stuffy Nose', '9/21/2014', '25dph-7.5peh', '146724', '5', '5', "I\\m a retired physician and of all the meds I have tried for my allergies (seasonal and not) - this one is the most effective for me.  When I first began using this drug some years ago - tiredness as a problem but is not currently.'", '5', 'Male', 'Drowsiness,  dizziness ,  dry mouth /nose/throat,  headache ,  upset stomach ,  constipation , or  trouble sleeping  may occur.', '0']
length of data: 12 --> value list: ['25-34', 'Cold Symptoms', '1/13/2011', '25dph-7.5peh', '146724', '5', '5', 'cleared me right up even with my throat hurting it went away after taking the medicine', '5', 'Female', 'Drowsiness,  dizziness ,  dry mouth /nose/throat,  headache ,  upset stomach ,  constipation , or  trouble sleeping  may occur.', '1']
length of data: 12 --> value list: ['65-74', 'Other', '7/16/2012', 'warfarin (bulk) 100  powder', '144731', '2', '3', 'why did my PTINR go from a no

In [8]:
# Show the first five rows whose field count is not 12.
for record in messy_data[:5]:
    print(f'length of data: {len(record)} --> value list: {record}')

length of data: 13 --> value list: ['25-34', 'Birth Control', '6/15/2017', 'wymzya fe', '163180', '5', '5', "Haven\\t gotten pregnant so it does it\\'s job. I was switched to this brand from another generic. I get nauseous and generally feel tired on this", " also more headaches.'", '2', 'Female', 'Nausea ,  vomiting ,  headache ,  bloating ,  breast  tenderness, swelling of the  ankles /feet (fluid retention), or  weight  change may occur.', '0']
length of data: 17 --> value list: ['25-34', 'Birth Control', '10/7/2017', 'lyza', '164750', '1', '1', "LYZA BIRTH CONTROL\\nThese are the WORST birth control pills I have ever taken!  I\\ve only been taking them for a couple of weeks and I feel like a crazy person.  I\\'ve been crying", ' (for no reason)', ' been super depressed', ' my anxiety is terrible', " along with daily headaches and break outs.   I realize everyone\\'s body reacts differently to medication", "  but I\\'ve read nothing but negative reviews.  I can surely attest to that

In [9]:
# Try the repair on five messy rows: keep the first 7 and last 4 fields and
# glue everything in between back into a single review field.
# NOTE(review): the pieces are joined with '' so the commas the reader split on
# are not restored; a later cell's r'(\d+)(.*)' extract works on this fused text.
for raw_row in messy_data[:5]:
    leading, review_parts, trailing = raw_row[:7], raw_row[7:-4], raw_row[-4:]
    fix_length_data = leading + [''.join(review_parts)] + trailing
    print(f'length of data: {len(raw_row)} --> value list: {raw_row}')
    print(f'length of fix data: {len(fix_length_data)} --> value list: {fix_length_data}')
    print()

length of data: 13 --> value list: ['25-34', 'Birth Control', '6/15/2017', 'wymzya fe', '163180', '5', '5', "Haven\\t gotten pregnant so it does it\\'s job. I was switched to this brand from another generic. I get nauseous and generally feel tired on this", " also more headaches.'", '2', 'Female', 'Nausea ,  vomiting ,  headache ,  bloating ,  breast  tenderness, swelling of the  ankles /feet (fluid retention), or  weight  change may occur.', '0']
length of fix data: 12 --> value list: ['25-34', 'Birth Control', '6/15/2017', 'wymzya fe', '163180', '5', '5', "Haven\\t gotten pregnant so it does it\\'s job. I was switched to this brand from another generic. I get nauseous and generally feel tired on this also more headaches.'", '2', 'Female', 'Nausea ,  vomiting ,  headache ,  bloating ,  breast  tenderness, swelling of the  ankles /feet (fluid retention), or  weight  change may occur.', '0']

length of data: 17 --> value list: ['25-34', 'Birth Control', '10/7/2017', 'lyza', '164750', '1

In [10]:
# Apply the same repair to every messy row: first 7 fields, the middle fields
# fused into one review string (joined with ''), then the last 4 fields.
fix_data = [
    raw_row[:7] + [''.join(raw_row[7:-4])] + raw_row[-4:]
    for raw_row in messy_data
]

In [19]:
# One DataFrame per row list, both labelled with the shared column names.
true_df = pd.DataFrame(data=true_data, columns=columns)
fix_df = pd.DataFrame(data=fix_data, columns=columns)

In [22]:
# Confirm both frames have 12 columns and the expected row counts.
print(f'shape of true_df: {true_df.shape}, shape of fix_df: {fix_df.shape}')

shape of true_df: (297967, 12), shape of fix_df: (64839, 12)


In [25]:
# Peek at the first five well-formed rows.
true_df.head(5)

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I\m a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 powder,144731,2,2,FALLING AND DON\T REALISE IT',1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1


In [23]:
# Column dtypes, non-null counts and deep memory usage of true_df.
true_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297967 entries, 0 to 297966
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Age            297967 non-null  object
 1   Condition      297967 non-null  object
 2   Date           297967 non-null  object
 3   Drug           297967 non-null  object
 4   DrugId         297967 non-null  object
 5   EaseofUse      297967 non-null  object
 6   Effectiveness  297967 non-null  object
 7   Reviews        297967 non-null  object
 8   Satisfaction   297967 non-null  object
 9   Sex            297967 non-null  object
 10  Sides          297967 non-null  object
 11  UsefulCount    297967 non-null  object
dtypes: object(12)
memory usage: 306.8 MB


In [26]:
# Peek at the first five repaired rows.
fix_df.head(5)

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,25-34,Birth Control,6/15/2017,wymzya fe,163180,5,5,Haven\t gotten pregnant so it does it\'s job. ...,2,Female,"Nausea , vomiting , headache , bloating , ...",0
1,25-34,Birth Control,10/7/2017,lyza,164750,1,1,LYZA BIRTH CONTROL\nThese are the WORST birth ...,1,Female,"Nausea , vomiting , headache , bloating , ...",1
2,25-34,Birth Control,1/30/2017,lyza,164750,5,5,I have been taking Lyza for two months now. I ...,4,Female,"Nausea , vomiting , headache , bloating , ...",1
3,35-44,Birth Control,5/6/2016,lyza,164750,5,5,I took this pill for a month. It was so awful ...,1,,"Nausea , vomiting , headache , bloating , ...",3
4,25-34,Birth Control,4/29/2016,lyza,164750,5,5,My OB/GYN placed me on this pill because I was...,4,Female,"Nausea , vomiting , headache , bloating , ...",10


In [24]:
# Column dtypes, non-null counts and deep memory usage of fix_df.
fix_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64839 entries, 0 to 64838
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            64839 non-null  object
 1   Condition      64839 non-null  object
 2   Date           64839 non-null  object
 3   Drug           64839 non-null  object
 4   DrugId         64839 non-null  object
 5   EaseofUse      64839 non-null  object
 6   Effectiveness  64839 non-null  object
 7   Reviews        64839 non-null  object
 8   Satisfaction   64839 non-null  object
 9   Sex            64839 non-null  object
 10  Sides          64839 non-null  object
 11  UsefulCount    64839 non-null  object
dtypes: object(12)
memory usage: 92.9 MB


In [354]:
# Stack the well-formed rows on top of the repaired rows and renumber the index
# from 0 so every row has a unique label.
frames_to_stack = [true_df, fix_df]
concat_df = pd.concat(frames_to_stack, ignore_index=True)

In [355]:
# Peek at the first fifteen rows of the combined frame.
concat_df.head(15)

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I\m a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 powder,144731,2,2,FALLING AND DON\T REALISE IT',1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1
5,55-64,Other,7/19/2008,warfarin (bulk) 100 powder,144731,4,4,help heart condition operation well,4,Male,,0
6,45-54,Disease of Ovaries with Cysts,1/30/2017,wymzya fe,163180,5,5,I have take this for 5 years age 45-50 to prev...,5,Female,"Nausea , vomiting , headache , bloating , ...",0
7,25-34,Acne,4/27/2016,wymzya fe,163180,4,2,,2,Female,"Nausea , vomiting , headache , bloating , ...",1
8,55-64,Stuffy Nose,10/29/2012,"12 hour nasal relief spray, non-aerosol",9800,4,2,The 12 hour spray only works for me for 6 hours.,2,Male,"Temporary burning, stinging, dryness in the no...",0
9,65-74,Other,3/15/2016,pyrogallol crystals,12112,5,5,Excellent in reducing inlamation associated wi...,5,Male,,0


In [293]:
# The row count should equal the rows of true_df plus the rows of fix_df.
print(f'shape of concat_df: {concat_df.shape}')

shape of concat_df: (362806, 12)


In [294]:
# Column dtypes, non-null counts and deep memory usage of the combined frame.
concat_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362806 entries, 0 to 362805
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Age            362806 non-null  object
 1   Condition      362806 non-null  object
 2   Date           362806 non-null  object
 3   Drug           362806 non-null  object
 4   DrugId         362806 non-null  object
 5   EaseofUse      362806 non-null  object
 6   Effectiveness  362806 non-null  object
 7   Reviews        362806 non-null  object
 8   Satisfaction   362806 non-null  object
 9   Sex            362806 non-null  object
 10  Sides          362806 non-null  object
 11  UsefulCount    362806 non-null  object
dtypes: object(12)
memory usage: 399.7 MB


In [295]:
# Number of distinct values per column; used to spot columns whose cardinality
# is higher than their meaning would suggest.
concat_df.nunique()

Age                  12
Condition          1806
Date               4525
Drug               7096
DrugId             6573
EaseofUse            13
Effectiveness         7
Reviews          250162
Satisfaction         70
Sex                  66
Sides              1726
UsefulCount         148
dtype: int64

In [296]:
# Frequency of each distinct value in the Sex column.
concat_df['Sex'].value_counts()

Sex
Female                                238119
Male                                   98000
                                       26520
  dizziness                               24
  headache                                14
                                       ...  
,3, '                                      1
 irritability                              1
 bleeding                                  1
 peeling/flaking/scabbing/crusting         1
 unusual  dreams                           1
Name: count, Length: 66, dtype: int64

In [297]:
# Frequency of each distinct value in the Satisfaction column.
concat_df['Satisfaction'].value_counts()

Satisfaction
5                                                                                                                                                                                                     111498
1                                                                                                                                                                                                     100859
4                                                                                                                                                                                                      63129
3                                                                                                                                                                                                      51837
2                                                                                                                                                                      

In [298]:
# Frequency of each distinct value in the Effectiveness column.
concat_df['Effectiveness'].value_counts()

Effectiveness
5     130389
4      81821
3      60409
1      59384
2      30800
6          2
10         1
Name: count, dtype: int64

In [299]:
# Frequency of each distinct value in the EaseofUse column.
concat_df['EaseofUse'].value_counts()

EaseofUse
5         192632
4          74728
3          41299
1          35925
2          18189
62261         18
154215         6
11147          3
6              2
10             1
10149          1
169930         1
57948          1
Name: count, dtype: int64

In [300]:
# Frequency of each distinct value in the Age column.
concat_df['Age'].value_counts()

Age
45-54         80043
55-64         75136
35-44         55011
25-34         49718
65-74         41216
19-24         24230
75 or over    15226
              12202
13-18          7045
7-12           1644
3-6             838
0-2             497
Name: count, dtype: int64

Candidate problematic columns:

| column name   | number of unique values |
| ------------- | ----------------------- |
| EaseofUse     | 13                      |
| Satisfaction  | 70                      |
| Sex           | 66                      |

In [356]:
# EaseofUse values taken from the value_counts output that are clearly not
# ratings; select every row carrying one of them.
suspicious_easeofuse_values = ['62261', '154215', '11147', '10149', '169930', '57948']
has_suspicious_easeofuse = concat_df['EaseofUse'].isin(suspicious_easeofuse_values)
invalid_easeofuse_df = concat_df[has_suspicious_easeofuse]

In [357]:
# Number of rows (and columns) with a non-rating EaseofUse value.
invalid_easeofuse_df.shape

(30, 12)

In [361]:
# Inspect the first ten affected rows to see how the fields are shifted.
invalid_easeofuse_df.head(10)

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
320440,55-64,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',4/25/2016,pedialyte oral solution,11147,3,"3After small bowel resection, I had diarrhea a...",1,,Mild nausea and vomiting may occur.,0
320441,45-54,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',4/26/2014,pedialyte oral solution,11147,5,4Felt once I drank this liquid I felt more ene...,5,Female,Mild nausea and vomiting may occur.,1
320442,25-34,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',4/5/2014,pedialyte oral solution,11147,3,2my 2 yr old son has been sick with diarhea an...,1,Female,Mild nausea and vomiting may occur.,1
330057,75 or over,Treatment of Depletion of Body\s Supply of Sodium,Potassium and Chloride',8/6/2009,yte,10149,4,"5I have low sodium and this has helped, altho ...",4,Female,Mild nausea and vomiting may occur.,1
353216,55-64,Plantar Warts,12/16/2016,dr scholl\s clear away adhesive patch,medicated',154215,1,1small discs did not stay in place - pads were...,1,Male,"Slight burning, skin redness, and peeling ma...",2
353217,25-34,Plantar Warts,3/6/2014,dr scholl\s clear away adhesive patch,medicated',154215,2,2I have used Dr. Scholl\s salicylic acid and t...,1,Female,"Slight burning, skin redness, and peeling ma...",3
353218,19-24,Corn,2/8/2013,dr scholl\s clear away adhesive patch,medicated',154215,5,4I\ve had a small hard corn on both of my pink...,3,Female,"Slight burning, skin redness, and peeling ma...",1
353219,19-24,Common Wart,1/29/2012,dr scholl\s clear away adhesive patch,medicated',154215,4,5I had two common warts on my hands and used D...,5,Female,"Slight burning, skin redness, and peeling ma...",0
353220,55-64,Plantar Warts,7/26/2011,dr scholl\s clear away adhesive patch,medicated',154215,2,1Medicated pads do not adhere well to skin and...,1,Male,"Slight burning, skin redness, and peeling ma...",2
353223,25-34,Common Wart,4/22/2011,dr scholl\s clear away adhesive patch,medicated',154215,3,2it will take the wart way for about 6 weeks t...,1,Female,"Slight burning, skin redness, and peeling ma...",1


In [358]:
# Repair the rows whose fields are shifted one column to the right, working on
# a deep copy so invalid_easeofuse_df itself is left untouched.
#
# Every assignment goes through a single `.loc[row_labels, column]` call.
# The previous `df.iloc[...].loc[:, col] = ...` form was chained assignment: it
# writes into a temporary object and only reached the frame when that temporary
# happened to be a view. With pandas Copy-on-Write it modifies nothing.
invalid_easeofuse_df_copy = invalid_easeofuse_df.copy(deep=True)

# Assumes, as in the preview of invalid_easeofuse_df, that the first 4 rows are
# the ones whose Condition was split in two and that all remaining rows are the
# ones whose Drug name was split in two -- TODO confirm if the data changes.
condition_split_rows = invalid_easeofuse_df_copy.index[:4]
drug_split_rows = invalid_easeofuse_df_copy.index[4:]

# Condition-split rows: glue the two Condition halves together, then shift
# Drug -> Date, DrugId -> Drug and EaseofUse -> DrugId. Each step reads a
# column that has not been overwritten yet, so the order matters.
invalid_easeofuse_df_copy.loc[condition_split_rows, 'Condition'] = (
    invalid_easeofuse_df_copy.loc[condition_split_rows, 'Condition']
    + invalid_easeofuse_df_copy.loc[condition_split_rows, 'Date']
)
invalid_easeofuse_df_copy.loc[condition_split_rows, 'Date'] = invalid_easeofuse_df_copy.loc[condition_split_rows, 'Drug']
invalid_easeofuse_df_copy.loc[condition_split_rows, 'Drug'] = invalid_easeofuse_df_copy.loc[condition_split_rows, 'DrugId']
invalid_easeofuse_df_copy.loc[condition_split_rows, 'DrugId'] = invalid_easeofuse_df_copy.loc[condition_split_rows, 'EaseofUse']

# Drug-split rows: glue the two Drug halves together, then shift
# EaseofUse -> DrugId.
invalid_easeofuse_df_copy.loc[drug_split_rows, 'Drug'] = (
    invalid_easeofuse_df_copy.loc[drug_split_rows, 'Drug']
    + ''
    + invalid_easeofuse_df_copy.loc[drug_split_rows, 'DrugId']
)
invalid_easeofuse_df_copy.loc[drug_split_rows, 'DrugId'] = invalid_easeofuse_df_copy.loc[drug_split_rows, 'EaseofUse']

# All affected rows: the Effectiveness column currently holds the EaseofUse
# rating, and Reviews starts with the Effectiveness digits fused to the text.
invalid_easeofuse_df_copy['EaseofUse'] = invalid_easeofuse_df_copy['Effectiveness']
# NOTE(review): (\d+) is greedy, so a review whose own text starts with a digit
# would have that digit absorbed into Effectiveness -- confirm against the data.
invalid_easeofuse_df_copy[['Effectiveness', 'Reviews']] = invalid_easeofuse_df_copy['Reviews'].str.extract(r'(\d+)(.*)', expand=True)

In [365]:
# Inspect the first ten rows after the repair.
invalid_easeofuse_df_copy.head(10)

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
320440,55-64,Treatment of Depletion of Body\s Supply of Sod...,4/25/2016,pedialyte oral solution,11147,3,3,"After small bowel resection, I had diarrhea an...",1,,Mild nausea and vomiting may occur.,0
320441,45-54,Treatment of Depletion of Body\s Supply of Sod...,4/26/2014,pedialyte oral solution,11147,5,4,Felt once I drank this liquid I felt more ener...,5,Female,Mild nausea and vomiting may occur.,1
320442,25-34,Treatment of Depletion of Body\s Supply of Sod...,4/5/2014,pedialyte oral solution,11147,3,2,my 2 yr old son has been sick with diarhea and...,1,Female,Mild nausea and vomiting may occur.,1
330057,75 or over,Treatment of Depletion of Body\s Supply of Sod...,8/6/2009,yte,10149,4,5,"I have low sodium and this has helped, altho m...",4,Female,Mild nausea and vomiting may occur.,1
353216,55-64,Plantar Warts,12/16/2016,dr scholl\s clear away adhesive patch medicated',154215,1,1,small discs did not stay in place - pads were ...,1,Male,"Slight burning, skin redness, and peeling ma...",2
353217,25-34,Plantar Warts,3/6/2014,dr scholl\s clear away adhesive patch medicated',154215,2,2,I have used Dr. Scholl\s salicylic acid and th...,1,Female,"Slight burning, skin redness, and peeling ma...",3
353218,19-24,Corn,2/8/2013,dr scholl\s clear away adhesive patch medicated',154215,5,4,I\ve had a small hard corn on both of my pinky...,3,Female,"Slight burning, skin redness, and peeling ma...",1
353219,19-24,Common Wart,1/29/2012,dr scholl\s clear away adhesive patch medicated',154215,4,5,I had two common warts on my hands and used Dr...,5,Female,"Slight burning, skin redness, and peeling ma...",0
353220,55-64,Plantar Warts,7/26/2011,dr scholl\s clear away adhesive patch medicated',154215,2,1,Medicated pads do not adhere well to skin and ...,1,Male,"Slight burning, skin redness, and peeling ma...",2
353223,25-34,Common Wart,4/22/2011,dr scholl\s clear away adhesive patch medicated',154215,3,2,it will take the wart way for about 6 weeks th...,1,Female,"Slight burning, skin redness, and peeling ma...",1


In [366]:
# Write the repaired rows back into concat_df at the same index labels.
# This mutates concat_df in place.
concat_df.loc[invalid_easeofuse_df_copy.index] = invalid_easeofuse_df_copy

In [309]:
# Re-check the EaseofUse distribution after writing the repaired rows back.
concat_df['EaseofUse'].value_counts()

EaseofUse
5     192650
4      74732
3      41303
1      35927
2      18191
6          2
10         1
Name: count, dtype: int64

In [310]:
# Look at the Sex distribution again before repairing that column.
concat_df['Sex'].value_counts()

Sex
Female                                238119
Male                                   98000
                                       26520
  dizziness                               24
  headache                                14
                                       ...  
,3, '                                      1
 irritability                              1
 bleeding                                  1
 peeling/flaking/scabbing/crusting         1
 unusual  dreams                           1
Name: count, Length: 66, dtype: int64

In [311]:
# A Sex entry is accepted when it matches "male" or "female" (case-insensitive)
# or is empty/whitespace only; every other row is collected for repair.
sex_is_acceptable = concat_df['Sex'].str.contains(r'^male$|^female$|^\s*$', case=False)
invalid_sex = concat_df[~sex_is_acceptable]

In [312]:
# Number of rows (and columns) with an unexpected Sex value.
invalid_sex.shape

(167, 12)

In [313]:
# Inspect the first five rows with an unexpected Sex value.
invalid_sex.head(5)

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
31058,65-74,Inability to have an Erection,7/29/2009,levitra,76765,5,4,I find that on ehalf hour is when it starts to...,flushing,stuffy/ runny nose,or dizziness may occur.',11
51324,45-54,Allergic Rhinitis Prevention,5/1/2014,"nasonex aerosol, spray with pump",4820,5,5,This drug literally changed my life. Before I...,",5,Female,Nose/throat dryness or irritation",blood -tinged mucus /phlegm,and nosebleeds may occur.',14
163162,45-54,Multiple Sclerosis Symptoms Return then Become...,10/31/2013,tecfidera,163868,5,5,I have been taking the pill for about 3 months...,redness,itching,and burning feeling of the skin may occur.',14
215722,65-74,Type 2 Diabetes Mellitus,4/12/2014,glimepiride,12271,5,2,I can\t take it at night,it causes my blood sugar to bottom out,",2, '",Nausea and upset stomach may occur.,17
219503,35-44,Neuropathic Pain,9/29/2009,"gabapentin tablet, extended release 24 hr",14208,5,3,just started. I think I have felt a calming of...,",5,Female,Drowsiness",loss of coordination,and dizziness may occur.',20


In [314]:
# Columns across which the misplaced text is spread in the affected rows.
columns_invalid_sex = ['Reviews', 'Satisfaction', 'Sex', 'Sides']
# Concatenate those four fields (as strings, no separator) into one string per
# row so the original values can be re-extracted with a regex.
affected_fields_as_text = invalid_sex[columns_invalid_sex].astype(str)
joined_strings = affected_fields_as_text.apply(''.join, axis=1)

In [315]:
# One joined string per affected row.
joined_strings.shape

(167,)

In [316]:
# Peek at the first five joined strings.
joined_strings.head(5)

31058     I find that on ehalf hour is when it starts to...
51324     This drug literally changed my life.  Before I...
163162    I have been taking the pill for about 3 months...
215722    I can\t take it at night it causes my blood su...
219503    just started. I think I have felt a calming of...
dtype: object

In [317]:
# Print the first ten joined strings in full (the table view truncates them).
for joined_text in joined_strings.iloc[:10]:
    print(joined_text)

I find that on ehalf hour is when it starts to work for me and my erection is fairly solid and it stay that waay untill orgasm.  Also with in reasonable space of time I can have another erection though softer but works well.  I f not the night before the morning after when my erections are like my old pee erections but dont go away after peeing in I engage in sexualy activity\',4,Male,Headache  flushing stuffy/ runny nose  or  dizziness  may occur.'
This drug literally changed my life.  Before I started taking it, I had 2-3 sinus infections/yr - I was always tired - always had sinus pressure.  A couple of times it progressed to pneumonia.  Now - I haven\t been sick in 2 yrs.  I have an occasional nose bleed but then I just make sure I\'m getting enough moisture in the air and it stops.  I use it year round,5,Female,Nose/throat dryness or irritation  blood -tinged  mucus /phlegm and  nosebleeds  may occur.'
I have been taking the pill for about 3 months and is very happy with it\',5,Fem

In [318]:
# Split each joined string into four groups: the shortest leading text (lazy
# match), a run of digits, a single word, and the remainder of the string.
# Rows that do not match the pattern come back as NaN in every group.
review_rating_sex_sides_pattern = r'^(.+?),(\d+),(\w+),(.+)$'
invalid_sex_fixed = joined_strings.str.extract(review_rating_sex_sides_pattern)

In [319]:
# Inspect the first fifteen extraction results (columns are still 0..3 here).
invalid_sex_fixed.head(15)

Unnamed: 0,0,1,2,3
31058,I find that on ehalf hour is when it starts to...,4.0,Male,Headache flushing stuffy/ runny nose or diz...
51324,This drug literally changed my life. Before I...,5.0,Female,Nose/throat dryness or irritation blood -ting...
163162,I have been taking the pill for about 3 months...,5.0,Female,Flushing/warmth redness itching and burning ...
215722,,,,
219503,just started. I think I have felt a calming of...,5.0,Female,Drowsiness loss of coordination and dizziness...
294488,The one strange side effect is - it has colore...,5.0,Female,Diarrhea headache or nausea may occur.'
294734,so far i believe the drug is working but my mo...,3.0,Female,Diarrhea headache or nausea may occur.'
298207,I hope I never stop taking this med. It takes ...,5.0,Female,Drowsiness dizziness dry mouth constipati...
298438,I have compressed nerve in my lower back due t...,5.0,Female,Drowsiness dizziness dry mouth constipati...
299110,2 years ago I took lovaza and my colestrol wen...,2.0,Female,Upset stomach burping and strange taste in m...


In [320]:
# Give the positional capture-group columns (0..3) the names of the columns
# they are meant to replace, in the same order as columns_invalid_sex.
group_number_to_column = dict(enumerate(columns_invalid_sex))
invalid_sex_fixed = invalid_sex_fixed.rename(columns=group_number_to_column)

In [322]:
# Inspect the first fifteen extraction results after renaming the columns.
invalid_sex_fixed.head(15)

Unnamed: 0,Reviews,Satisfaction,Sex,Sides
31058,I find that on ehalf hour is when it starts to...,4.0,Male,Headache flushing stuffy/ runny nose or diz...
51324,This drug literally changed my life. Before I...,5.0,Female,Nose/throat dryness or irritation blood -ting...
163162,I have been taking the pill for about 3 months...,5.0,Female,Flushing/warmth redness itching and burning ...
215722,,,,
219503,just started. I think I have felt a calming of...,5.0,Female,Drowsiness loss of coordination and dizziness...
294488,The one strange side effect is - it has colore...,5.0,Female,Diarrhea headache or nausea may occur.'
294734,so far i believe the drug is working but my mo...,3.0,Female,Diarrhea headache or nausea may occur.'
298207,I hope I never stop taking this med. It takes ...,5.0,Female,Drowsiness dizziness dry mouth constipati...
298438,I have compressed nerve in my lower back due t...,5.0,Female,Drowsiness dizziness dry mouth constipati...
299110,2 years ago I took lovaza and my colestrol wen...,2.0,Female,Upset stomach burping and strange taste in m...


In [323]:
# Index labels of the rows where the extraction produced no Sex value.
sex_is_missing = invalid_sex_fixed['Sex'].isna()
get_nan_index_in_invalid_sex_fixed = invalid_sex_fixed.loc[sex_is_missing].index
get_nan_index_in_invalid_sex_fixed

Index([215722, 306577, 306579, 307538, 307540, 308658, 324376, 324446, 326417,
       337005, 347550, 350319, 351953, 353926, 355615, 355720, 355729],
      dtype='int64')

In [324]:
# Print the joined strings of the rows without an extracted Sex value in full.
unmatched_joined_strings = joined_strings.loc[get_nan_index_in_invalid_sex_fixed]
for joined_text in unmatched_joined_strings:
    print(joined_text)

I can\t take it at night it causes my blood sugar to bottom out,2, 'Nausea  and  upset stomach  may occur.
I have severe Scoliosis,I\m 61yrs I was diagnosed whenot I was 11yrs old.I never had surgery iwore a backbrace 4yrs after all I went threw finally ended up in a pain clinic.Been on 3 different medsnow I take a 15mg it gives me good relief.I can now do things I couldnt.I am thankful for the relief,4, 'Nausea ,  vomiting ,  constipation ,  lightheadedness ,  dizziness , or drowsiness may occur.
I have severe Scoliosis,I\m 61yrs I was diagnosed whenot I was 11yrs old.I never had surgery iwore a backbrace 4yrs after all I went threw finally ended up in a pain clinic.Been on 3 different medsnow I take a 15mg it gives me good relief.I can now do things I couldnt.I am thankful for the relief,4, 'Nausea ,  vomiting ,  constipation ,  lightheadedness ,  dizziness , or drowsiness may occur.
I have severe Scoliosis,I\m 61yrs I was diagnosed whenot I was 11yrs old.I never had surgery iwore a 

In [325]:
# Second pass over the rows without a Sex value, using a three-group pattern
# with no Sex group: shortest leading text, a run of digits, and the remainder.
review_rating_sides_pattern = r'^(.+?),(\d+),(.+)$'
strings_without_sex = joined_strings.loc[get_nan_index_in_invalid_sex_fixed]
extracted_nan_index_in_invalid_sex_fixed = strings_without_sex.str.extract(review_rating_sides_pattern)

In [326]:
# Display the three extracted groups for the rows without a Sex value.
extracted_nan_index_in_invalid_sex_fixed

Unnamed: 0,0,1,2
215722,I can\t take it at night it causes my blood su...,2,'Nausea and upset stomach may occur.
306577,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,"'Nausea , vomiting , constipation , lighth..."
306579,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,"'Nausea , vomiting , constipation , lighth..."
307538,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,"'Nausea , vomiting , constipation , lighth..."
307540,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,"'Nausea , vomiting , constipation , dry mo..."
308658,when I first had the nexplanon put in my arm I...,1,"'Nausea , stomach cramping/ bloating , diz..."
324376,"5This medication is a God send, it totally con...",4,"'Constipation , drowsiness, upset stomach , ..."
324446,"5This medication is a God send, it totally con...",4,"'Constipation , drowsiness, upset stomach , ..."
326417,It has burnt my 2 leg\s to the bone on April 2...,1,'
337005,"Hi everyone, \nI just started using this cream...",3,"'Burning, stinging, tingling, and redness of ..."


In [327]:
# Copy the second-pass groups into the matching rows. The three groups map
# positionally onto Reviews, Satisfaction and Sides; Sex is left as it is.
columns_from_second_pass = ['Reviews', 'Satisfaction', 'Sides']
rows_from_second_pass = extracted_nan_index_in_invalid_sex_fixed.index
invalid_sex_fixed.loc[rows_from_second_pass, columns_from_second_pass] = (
    extracted_nan_index_in_invalid_sex_fixed.values
)

In [328]:
# Display the rows that were just filled from the second-pass extraction.
invalid_sex_fixed.loc[extracted_nan_index_in_invalid_sex_fixed.index]

Unnamed: 0,Reviews,Satisfaction,Sex,Sides
215722,I can\t take it at night it causes my blood su...,2,,'Nausea and upset stomach may occur.
306577,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,,"'Nausea , vomiting , constipation , lighth..."
306579,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,,"'Nausea , vomiting , constipation , lighth..."
307538,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,,"'Nausea , vomiting , constipation , lighth..."
307540,"I have severe Scoliosis,I\m 61yrs I was diagno...",4,,"'Nausea , vomiting , constipation , dry mo..."
308658,when I first had the nexplanon put in my arm I...,1,,"'Nausea , stomach cramping/ bloating , diz..."
324376,"5This medication is a God send, it totally con...",4,,"'Constipation , drowsiness, upset stomach , ..."
324446,"5This medication is a God send, it totally con...",4,,"'Constipation , drowsiness, upset stomach , ..."
326417,It has burnt my 2 leg\s to the bone on April 2...,1,,'
337005,"Hi everyone, \nI just started using this cream...",3,,"'Burning, stinging, tingling, and redness of ..."


In [329]:
# The repaired frame should still have one row per affected row and 4 columns.
invalid_sex_fixed.shape

(167, 4)

In [330]:
# Display the repaired Reviews/Satisfaction/Sex/Sides values.
invalid_sex_fixed

Unnamed: 0,Reviews,Satisfaction,Sex,Sides
31058,I find that on ehalf hour is when it starts to...,4,Male,Headache flushing stuffy/ runny nose or diz...
51324,This drug literally changed my life. Before I...,5,Female,Nose/throat dryness or irritation blood -ting...
163162,I have been taking the pill for about 3 months...,5,Female,Flushing/warmth redness itching and burning ...
215722,I can\t take it at night it causes my blood su...,2,,'Nausea and upset stomach may occur.
219503,just started. I think I have felt a calming of...,5,Female,Drowsiness loss of coordination and dizziness...
...,...,...,...,...
361380,I to like the others applied it to my shoulder...,1,Female,Warmth stinging or burning on the application ...
361851,Can\t sleep at night and difficult to urinate,1,Female,Nausea dry mouth loss of appetite tiredness...
361952,so far i believe the drug is working but my mo...,3,Female,Diarrhea nausea vomiting headache or d...
361967,The one strange side effect is - it has colore...,5,Female,Diarrhea nausea vomiting headache or d...


In [332]:
# Display the current (not yet repaired) values of the same rows and columns in
# concat_df for comparison.
concat_df.loc[invalid_sex_fixed.index, invalid_sex_fixed.columns]

Unnamed: 0,Reviews,Satisfaction,Sex,Sides
31058,I find that on ehalf hour is when it starts to...,flushing,stuffy/ runny nose,or dizziness may occur.'
51324,This drug literally changed my life. Before I...,",5,Female,Nose/throat dryness or irritation",blood -tinged mucus /phlegm,and nosebleeds may occur.'
163162,I have been taking the pill for about 3 months...,redness,itching,and burning feeling of the skin may occur.'
215722,I can\t take it at night,it causes my blood sugar to bottom out,",2, '",Nausea and upset stomach may occur.
219503,just started. I think I have felt a calming of...,",5,Female,Drowsiness",loss of coordination,and dizziness may occur.'
...,...,...,...,...
361380,I to like the others applied it to my shoulder...,",1,Female,Warmth",stinging,or burning on the application site may occur.'
361851,"Can\t sleep at night and difficult to urinate,...",sweating,blurred vision,and yawning may occur.'
361952,so far i believe the drug is working but my mo...,vomiting,headache,or diaper rash in young children may occur.'
361967,The one strange side effect is - it has colore...,vomiting,headache,or diaper rash in young children may occur.'


In [333]:
# Overwrite the affected cells of concat_df with the repaired values. The raw
# value array is assigned, so cells are matched by position within the selected
# rows and columns. This mutates concat_df in place.
rows_to_repair = invalid_sex_fixed.index
columns_to_repair = invalid_sex_fixed.columns
concat_df.loc[rows_to_repair, columns_to_repair] = invalid_sex_fixed.values

In [335]:
# Display the full repaired rows in concat_df.
concat_df.loc[invalid_sex_fixed.index]

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
31058,65-74,Inability to have an Erection,7/29/2009,levitra,76765,5,4,I find that on ehalf hour is when it starts to...,4,Male,Headache flushing stuffy/ runny nose or diz...,11
51324,45-54,Allergic Rhinitis Prevention,5/1/2014,"nasonex aerosol, spray with pump",4820,5,5,This drug literally changed my life. Before I...,5,Female,Nose/throat dryness or irritation blood -ting...,14
163162,45-54,Multiple Sclerosis Symptoms Return then Become...,10/31/2013,tecfidera,163868,5,5,I have been taking the pill for about 3 months...,5,Female,Flushing/warmth redness itching and burning ...,14
215722,65-74,Type 2 Diabetes Mellitus,4/12/2014,glimepiride,12271,5,2,I can\t take it at night it causes my blood su...,2,,'Nausea and upset stomach may occur.,17
219503,35-44,Neuropathic Pain,9/29/2009,"gabapentin tablet, extended release 24 hr",14208,5,3,just started. I think I have felt a calming of...,5,Female,Drowsiness loss of coordination and dizziness...,20
...,...,...,...,...,...,...,...,...,...,...,...,...
361380,45-54,Backache,3/1/2011,capzasin-hp cream,61857,1,1,I to like the others applied it to my shoulder...,1,Female,Warmth stinging or burning on the application ...,1
361851,55-64,Depression,1/7/2012,celexa,8603,1,1,Can\t sleep at night and difficult to urinate,1,Female,Nausea dry mouth loss of appetite tiredness...,5
361952,55-64,Acute Maxillary Sinus S. Pneumoniae Bacteria I...,12/8/2007,cefdinir,5543,5,3,so far i believe the drug is working but my mo...,3,Female,Diarrhea nausea vomiting headache or d...,9
361967,75 or over,Acute Maxillary Sinus S. Pneumoniae Bacteria I...,1/27/2012,cefdinir,5543,5,4,The one strange side effect is - it has colore...,5,Female,Diarrhea nausea vomiting headache or d...,1


In [337]:
# Re-check the Sex distribution after the repair (value_counts omits NaN by
# default, so rows whose Sex was left missing are not listed).
concat_df['Sex'].value_counts()

Sex
Female    238226
Male       98043
           26520
Name: count, dtype: int64

In [338]:
# Re-check the Satisfaction distribution after the repair.
concat_df['Satisfaction'].value_counts()

Satisfaction
5     111550
1     100901
4      63158
3      51852
2      35342
6          2
10         1
Name: count, dtype: int64

In [144]:
# empty_string_columns = concat_df.columns[concat_df.apply(lambda col: col.str.match(r'^\s*$').any())]

# print(empty_string_columns)

# concat_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)