In [1]:
from faker import Faker
import pandas as pd
import numpy as np
import datetime
exp = Faker()
Faker.seed(400000)

# Using faker creating the profile

In [2]:
data = [exp.profile() for i in range(400000)]
df = pd.DataFrame(data)

# Dropping

In [3]:
df=df.drop(['job', 'company', 'ssn','job', 'company', 'ssn','address', 'mail', 'residence', 'current_location','website', 'username','birthdate'],axis=1)

# Changing the index

In [4]:
df.index = np.arange(1, len(df)+1)
df

Unnamed: 0,blood_group,name,sex
1,A+,Jennifer Hayes,F
2,B-,George Schultz,M
3,O-,Michael Roach,M
4,A-,Jennifer Bryant,F
5,A-,David Morris,M
...,...,...,...
399996,B+,Sean Bennett,M
399997,O+,Timothy Mckay,M
399998,A-,Cassandra Frank,F
399999,AB+,Craig Cortez,M


# Reading the 1st dataset

In [5]:
data=pd.read_csv(r'C:\Users\elsap\Downloads\1614141926_drugEffects.csv')

In [6]:
data.columns

Index(['Unnamed: 0', 'urlDrugName', 'rating', 'effectiveness', 'condition',
       'sideEffects'],
      dtype='object')

# Dropping

In [7]:
data=data.drop(['Unnamed: 0','rating','effectiveness'],axis=1)

In [8]:
data

Unnamed: 0,urlDrugName,condition,sideEffects
0,enalapril,management of congestive heart failure,Mild Side Effects
1,ortho-tri-cyclen,birth prevention,Severe Side Effects
2,ponstel,menstrual cramps,No Side Effects
3,prilosec,acid reflux,Mild Side Effects
4,lyrica,fibromyalgia,Severe Side Effects
...,...,...,...
3102,vyvanse,adhd,Mild Side Effects
3103,zoloft,depression,Extremely Severe Side Effects
3104,climara,total hysterctomy,Moderate Side Effects
3105,trileptal,epilepsy,Mild Side Effects


In [9]:
data[ 'urlDrugName'].value_counts()

lexapro           63
prozac            46
retin-a           45
zoloft            45
propecia          38
                  ..
flovent            1
metoclopramide     1
antivert           1
bactroban          1
levetiracetam      1
Name: urlDrugName, Length: 502, dtype: int64

# Drug- lexapro

In [10]:
data=data[data['urlDrugName'] == 'lexapro'] # Since more in number

# Index changing 

In [11]:
data.index = np.arange(1, len(data)+1)

In [12]:
data

Unnamed: 0,urlDrugName,condition,sideEffects
1,lexapro,depression,Mild Side Effects
2,lexapro,depression and generalized anxiety,Moderate Side Effects
3,lexapro,major depression/anxiety,Mild Side Effects
4,lexapro,depression,Severe Side Effects
5,lexapro,depression/anxiety,Moderate Side Effects
...,...,...,...
59,lexapro,"anxiety, stress",Extremely Severe Side Effects
60,lexapro,nerves,No Side Effects
61,lexapro,depression,No Side Effects
62,lexapro,depression,Severe Side Effects


# Concating

In [13]:
df=pd.concat([df,data],axis=1)

In [14]:
df

Unnamed: 0,blood_group,name,sex,urlDrugName,condition,sideEffects
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects
...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,,,
399997,O+,Timothy Mckay,M,,,
399998,A-,Cassandra Frank,F,,,
399999,AB+,Craig Cortez,M,,,


# Filling null values

In [15]:
df['urlDrugName'].fillna(df['urlDrugName'].mode()[0],inplace=True)

In [16]:
df.columns=['Blood_group','Name','Sex','Drug','condition','SideEffects']

In [17]:
df

Unnamed: 0,Blood_group,Name,Sex,Drug,condition,SideEffects
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects
...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,lexapro,,
399997,O+,Timothy Mckay,M,lexapro,,
399998,A-,Cassandra Frank,F,lexapro,,
399999,AB+,Craig Cortez,M,lexapro,,


# Generating the values for side effect

In [18]:
df['SideEffects'].value_counts()

Mild Side Effects                26
No Side Effects                  14
Moderate Side Effects            12
Severe Side Effects               6
Extremely Severe Side Effects     5
Name: SideEffects, dtype: int64

In [19]:
d=[26,14,12,6,5]
p=np.divide(d,63)

In [20]:
np.random.seed(444)
df1=np.random.choice(a=['Mild Side Effects','No Side Effects','Moderate Side Effects',
                        'Severe Side Effects','Extremely Severe Side Effects'],size=399937,p=p)

In [21]:
df.loc[df['SideEffects'].isnull(),'SideEffects']=df1

In [22]:
df1=pd.DataFrame(df1)

In [23]:
df1.index = np.arange(1, len(df1)+1)

In [24]:
df

Unnamed: 0,Blood_group,Name,Sex,Drug,condition,SideEffects
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects
...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,lexapro,,Mild Side Effects
399997,O+,Timothy Mckay,M,lexapro,,Mild Side Effects
399998,A-,Cassandra Frank,F,lexapro,,No Side Effects
399999,AB+,Craig Cortez,M,lexapro,,No Side Effects


# Generating condition

In [25]:
df['condition'].value_counts()

depression                            29
anxiety                                5
depression/anxiety                     4
anxiety/depression                     2
depression and anxiety                 2
anxiety and depression                 2
anxiety, stress                        1
major depression/anxiety               1
depression & anxiety                   1
depression, lack of motivation         1
anxiety depression                     1
stressfull live change                 1
nerves                                 1
anxiety, depression                    1
depression, ptsd                       1
depression and generalized anxiety     1
panic attacks                          1
depression, anxiety                    1
major depression                       1
anxiety, depresion                     1
clinical depression                    1
general depression/perimenopause       1
panic attacks and depression           1
peri-menopausal syptoms                1
ocd             

In [26]:
df['condition'].unique()

array(['depression', 'depression and generalized anxiety',
       'major depression/anxiety', 'depression/anxiety',
       'depression, lack of motivation', 'peri-menopausal syptoms',
       'anxiety and depression', 'clinical depression',
       'anxiety depression', 'depression and anxiety',
       'anxiety/depression', 'anxiety', 'depression & anxiety',
       'depression, anxiety', 'panic attacks', 'anxiety, depression',
       'major depression', 'general depression/perimenopause', 'ocd',
       'panic attacks and depression', 'depression, ptsd',
       'anxiety, depresion', 'stressfull live change', 'anxiety, stress',
       'nerves', nan], dtype=object)

In [27]:
d11=[29,5,4,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]                               
p11=np.divide(d11,63)

In [28]:
np.random.seed(444)
df11=np.random.choice(a=['depression', 'anxiety','depression/anxiety','depression & anxiety','anxiety/depression','anxiety and depression','depression, anxiety','major depression/anxiety','nerves','depression, ptsd','anxiety, depression','stressfull live change','major depression','anxiety, depresion','depression & anxiety','general depression/perimenopause','panic attacks','peri-menopausal syptoms','anxiety, stress','depression, lack of motivation','anxiety depression','panic attacks and depression','depression and generalized anxiety','ocd','clinical depression'],size=399937,p=p11)

In [29]:
df.loc[df['condition'].isnull(),'condition']=df11

In [30]:
df11=pd.DataFrame(df11)

In [31]:
df11.index = np.arange(1, len(df11)+1)

In [32]:
df

Unnamed: 0,Blood_group,Name,Sex,Drug,condition,SideEffects
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects
...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,lexapro,depression,Mild Side Effects
399997,O+,Timothy Mckay,M,lexapro,depression,Mild Side Effects
399998,A-,Cassandra Frank,F,lexapro,depression,No Side Effects
399999,AB+,Craig Cortez,M,lexapro,depression/anxiety,No Side Effects


# Reading 2nd dataset

In [33]:
data1=pd.read_csv(r'C:\Users\elsap\Downloads\drug200.csv')

In [34]:
data1.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [35]:
data1=data1.drop(['Sex', 'Drug','Na_to_K'],axis=1)

In [36]:
data1

Unnamed: 0,Age,BP,Cholesterol
0,23,HIGH,HIGH
1,47,LOW,HIGH
2,47,LOW,HIGH
3,28,NORMAL,HIGH
4,61,LOW,HIGH
...,...,...,...
195,56,LOW,HIGH
196,16,LOW,HIGH
197,52,NORMAL,HIGH
198,23,NORMAL,NORMAL


In [37]:
data1.index = np.arange(1, len(data1)+1)

In [38]:
data1

Unnamed: 0,Age,BP,Cholesterol
1,23,HIGH,HIGH
2,47,LOW,HIGH
3,47,LOW,HIGH
4,28,NORMAL,HIGH
5,61,LOW,HIGH
...,...,...,...
196,56,LOW,HIGH
197,16,LOW,HIGH
198,52,NORMAL,HIGH
199,23,NORMAL,NORMAL


In [39]:
df=pd.concat([df,data1],axis=1)

In [40]:
df

Unnamed: 0,Blood_group,Name,Sex,Drug,condition,SideEffects,Age,BP,Cholesterol
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects,23.0,HIGH,HIGH
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects,47.0,LOW,HIGH
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects,47.0,LOW,HIGH
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects,28.0,NORMAL,HIGH
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects,61.0,LOW,HIGH
...,...,...,...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,lexapro,depression,Mild Side Effects,,,
399997,O+,Timothy Mckay,M,lexapro,depression,Mild Side Effects,,,
399998,A-,Cassandra Frank,F,lexapro,depression,No Side Effects,,,
399999,AB+,Craig Cortez,M,lexapro,depression/anxiety,No Side Effects,,,


# Generating Age

In [41]:
df['Age'].value_counts()

47.0    8
23.0    7
49.0    7
28.0    7
32.0    6
39.0    6
50.0    5
58.0    5
60.0    5
22.0    5
37.0    5
74.0    4
45.0    4
68.0    4
41.0    4
56.0    4
51.0    4
26.0    4
36.0    4
42.0    4
57.0    4
24.0    4
31.0    4
65.0    4
61.0    4
67.0    4
34.0    4
72.0    4
20.0    4
43.0    4
18.0    3
15.0    3
38.0    3
69.0    3
40.0    3
70.0    3
16.0    3
35.0    3
64.0    3
59.0    3
53.0    3
46.0    2
52.0    2
48.0    2
73.0    2
19.0    2
55.0    2
29.0    2
62.0    2
66.0    2
25.0    1
21.0    1
30.0    1
54.0    1
17.0    1
33.0    1
63.0    1
Name: Age, dtype: int64

In [42]:
r1=[8,7,7,7,6,6,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4
    ,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,
    1,1,1,1,1,1,1]
q1=np.divide(r1,200)

In [43]:
np.random.seed(444)
df4=np.random.choice(a=[47,23,49,28,32,39,50,58,60,22,37,74,45,68,41,56,51,26,
                        36,42,57,24,31,65,61,67,34,72,20,43,18,15,38,69,40,70,
                        16,35,64,59,53,46,52,48,73,19,55,29,62,66,25,21,30,54,17,33,63],size=399800,p=q1)

In [44]:
df.loc[df['Age'].isnull(),'Age']=df4

# Making dataframe

In [45]:
df4=pd.DataFrame(df4)

# Changing the index

In [46]:
df4.index = np.arange(1, len(df4)+1)

# Generating BP,Cholesrol and Na to K

In [47]:
df['BP'].value_counts()

HIGH      77
LOW       64
NORMAL    59
Name: BP, dtype: int64

In [48]:
r2=[77,64,59]
q2=np.divide(r2,200)

In [49]:
np.random.seed(444)
df6=np.random.choice(a=['HIGH','LOW','NORMAL'],size=399800,p=q2)

In [50]:
df.loc[df['BP'].isnull(),'BP']=df6

In [51]:
df6=pd.DataFrame(df6)
df6.index = np.arange(1, len(df6)+1)

In [52]:
df['Cholesterol'].value_counts()

HIGH      103
NORMAL     97
Name: Cholesterol, dtype: int64

In [53]:
r3=[103,97]
q3=np.divide(r3,200)

In [54]:
np.random.seed(444)
df7=np.random.choice(a=['HIGH','NORMAL'],size=399800,p=q3)

In [55]:
df.loc[df['Cholesterol'].isnull(),'Cholesterol']=df7

In [56]:
df7=pd.DataFrame(df7)
df7.index = np.arange(1, len(df7)+1)

In [57]:
df

Unnamed: 0,Blood_group,Name,Sex,Drug,condition,SideEffects,Age,BP,Cholesterol
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects,23.0,HIGH,HIGH
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects,47.0,LOW,HIGH
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects,47.0,LOW,HIGH
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects,28.0,NORMAL,HIGH
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects,61.0,LOW,HIGH
...,...,...,...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,lexapro,depression,Mild Side Effects,47.0,HIGH,HIGH
399997,O+,Timothy Mckay,M,lexapro,depression,Mild Side Effects,36.0,LOW,HIGH
399998,A-,Cassandra Frank,F,lexapro,depression,No Side Effects,34.0,LOW,NORMAL
399999,AB+,Craig Cortez,M,lexapro,depression/anxiety,No Side Effects,26.0,LOW,HIGH


# 3rd dataset

In [60]:
df_1=pd.read_csv(r'webmd.csv.zip')

In [61]:
df_1=df_1.drop(['Age','Condition','Date','Drug','DrugId','Reviews','Sex','Sides','UsefulCount'],axis=1)

In [62]:
df_1.index = np.arange(1, len(df_1)+1)

In [63]:
df=pd.concat([df,df_1],axis=1)

In [64]:
df

Unnamed: 0,Blood_group,Name,Sex,Drug,condition,SideEffects,Age,BP,Cholesterol,EaseofUse,Effectiveness,Satisfaction
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects,23.0,HIGH,HIGH,5.0,5.0,5.0
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects,47.0,LOW,HIGH,5.0,5.0,5.0
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects,47.0,LOW,HIGH,2.0,3.0,3.0
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects,28.0,NORMAL,HIGH,2.0,2.0,1.0
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects,61.0,LOW,HIGH,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,lexapro,depression,Mild Side Effects,47.0,HIGH,HIGH,,,
399997,O+,Timothy Mckay,M,lexapro,depression,Mild Side Effects,36.0,LOW,HIGH,,,
399998,A-,Cassandra Frank,F,lexapro,depression,No Side Effects,34.0,LOW,NORMAL,,,
399999,AB+,Craig Cortez,M,lexapro,depression/anxiety,No Side Effects,26.0,LOW,HIGH,,,


In [65]:
df_1['EaseofUse'].value_counts()

5     192650
4      74732
3      41303
1      35927
2      18191
6          2
10         1
Name: EaseofUse, dtype: int64

In [66]:
r_1=[192650,74732,41303,35927,18191,2,1]
q_1=np.divide(r_1,362806)

In [67]:
np.random.seed(444)
df_11=np.random.choice(a=[5,4,3,1,2,6,10],size=37194,p=q_1)

In [68]:
df.loc[df['EaseofUse'].isnull(),'EaseofUse']=df_11

In [69]:
df_11=pd.DataFrame(df_11)

# Effectiveness         

In [70]:
df_1['Effectiveness'].value_counts()

5     130388
4      81821
3      60406
1      59387
2      30801
6          2
10         1
Name: Effectiveness, dtype: int64

In [71]:
r_2=[130388,81821,60406,59387,30801,2,1]
q_2=np.divide(r_2,362806)

In [72]:
np.random.seed(444)
df_12=np.random.choice(a=[5,4,3,1,2,6,10],size=37194,p=q_2)

In [73]:
df.loc[df['Effectiveness'].isnull(),'Effectiveness']=df_12

In [74]:
df_12=pd.DataFrame(df_12)

# Satisfaction

In [75]:
df_1['Satisfaction'].value_counts()

5     111550
1     100901
4      63158
3      51852
2      35342
6          2
10         1
Name: Satisfaction, dtype: int64

In [76]:
r_3=[111550,100901,63158,51852,35342,2,1]
q_3=np.divide(r_3,362806)

In [77]:
np.random.seed(444)
df_13=np.random.choice(a=[5,1,4,3,2,6,10],size=37194,p=q_3)

In [78]:
df.loc[df['Satisfaction'].isnull(),'Satisfaction']=df_13

In [79]:
df_13=pd.DataFrame(df_13)

# Reading the 4th dataset

In [80]:
df3=pd.read_csv(r'C:\Users\elsap\Desktop\quantile_health.csv')

In [81]:
df3

Unnamed: 0,dupersid,totexp,ltotexp,suppins,totchr,age,female,white
0,93193020,3,1.098612,1,0,69,0,1
1,72072017,6,1.791759,1,0,65,1,1
2,25296013,9,2.197225,0,0,85,1,1
3,23628011,14,2.639057,0,0,76,1,1
4,95041014,18,2.890372,0,1,71,1,1
...,...,...,...,...,...,...,...,...
2950,26147015,102303,11.535690,1,4,80,1,1
2951,21857010,104823,11.560030,0,2,69,1,1
2952,93171014,108256,11.592250,1,1,65,0,1
2953,90613022,123611,11.724890,1,3,66,1,1


# Dropping

In [82]:
df3=df3.drop(['dupersid', 'totexp', 'ltotexp', 'suppins', 'totchr', 'female','age'],axis=1)

# Index from one

In [83]:
df3.index = np.arange(1, len(df3)+1)

In [84]:
df=pd.concat([df,df3],axis=1)

In [85]:
df3

Unnamed: 0,white
1,1
2,1
3,1
4,1
5,1
...,...
2951,1
2952,1
2953,1
2954,1


# Generating for race

In [86]:
df3['white'].value_counts()

1    2877
0      78
Name: white, dtype: int64

In [87]:
r8=[2877,78]
q8=np.divide(r8,2955)

In [88]:
np.random.seed(444)
df10=np.random.choice(a=['black','white'],size=400000,p=q8)

# Making dataframe

In [89]:
df10=pd.DataFrame(df10)

# Changing the index

In [90]:
df10.index = np.arange(1, len(df10)+1)

In [91]:
df.loc[df['white'].isnull(),'white']=df10

# Concating the datasets

In [92]:
df=pd.concat([df,df10],axis=1)

In [93]:
df=df.drop(['white'],axis=1)

In [94]:
df.columns=['Blood_group','Name','Sex','Drug','condition','SideEffects','Age','BP','Cholestrol','EaseofUse','Effectiveness','Satisfaction','Race']

In [95]:
df

Unnamed: 0,Blood_group,Name,Sex,Drug,condition,SideEffects,Age,BP,Cholestrol,EaseofUse,Effectiveness,Satisfaction,Race
1,A+,Jennifer Hayes,F,lexapro,depression,Mild Side Effects,23.0,HIGH,HIGH,5.0,5.0,5.0,black
2,B-,George Schultz,M,lexapro,depression and generalized anxiety,Moderate Side Effects,47.0,LOW,HIGH,5.0,5.0,5.0,black
3,O-,Michael Roach,M,lexapro,major depression/anxiety,Mild Side Effects,47.0,LOW,HIGH,2.0,3.0,3.0,black
4,A-,Jennifer Bryant,F,lexapro,depression,Severe Side Effects,28.0,NORMAL,HIGH,2.0,2.0,1.0,black
5,A-,David Morris,M,lexapro,depression/anxiety,Moderate Side Effects,61.0,LOW,HIGH,1.0,1.0,1.0,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...
399996,B+,Sean Bennett,M,lexapro,depression,Mild Side Effects,47.0,HIGH,HIGH,5.0,4.0,1.0,black
399997,O+,Timothy Mckay,M,lexapro,depression,Mild Side Effects,36.0,LOW,HIGH,4.0,4.0,1.0,black
399998,A-,Cassandra Frank,F,lexapro,depression,No Side Effects,34.0,LOW,NORMAL,1.0,2.0,2.0,black
399999,AB+,Craig Cortez,M,lexapro,depression/anxiety,No Side Effects,26.0,LOW,HIGH,5.0,5.0,5.0,black


In [96]:
df.to_csv("Rio-125_classification3.csv",index=False)