In [34]:
# Dependencies
import pandas as pd # type: ignore
from pathlib import Path
import pyreadstat  # type: ignore
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from scipy.stats import linregress

## Reading in Datasets & Basic Data Exploration:

### Prescription Medications: 

In [35]:
# Load the prescription-medications SAS transport (XPT) file.
# A forward slash is used so the path works on every OS and so the string no
# longer contains the unrecognised "\P" escape; it also matches the
# "Datasets/..." style used by the CSV cells further down.
# NOTE(review): text columns come back as raw bytes (b'' in the preview);
# read_sas(..., encoding=...) would decode them -- left unchanged here in case
# later cells rely on the bytes form.
Prescription_Meds = pd.read_sas("Datasets/Prescription_meds.xpt")
Prescription_Meds.head()

Unnamed: 0,SEQN,RXDUSE,RXDDRUG,RXDDRGID,RXQSEEN,RXDDAYS,RXDRSC1,RXDRSC2,RXDRSC3,RXDRSD1,RXDRSD2,RXDRSD3,RXDCOUNT
0,109263.0,2.0,b'',b'',,,b'',b'',b'',b'',b'',b'',
1,109264.0,2.0,b'',b'',,,b'',b'',b'',b'',b'',b'',
2,109265.0,2.0,b'',b'',,,b'',b'',b'',b'',b'',b'',
3,109266.0,2.0,b'',b'',,,b'',b'',b'',b'',b'',b'',
4,109267.0,1.0,b'99999',b'',,,b'',b'',b'',b'',b'',b'',1.0


In [36]:
# Summarise the prescription table: per-column dtype and non-null counts,
# followed by its (rows, columns) shape.
Prescription_Meds.info()
print(f"Rows, Columns: {Prescription_Meds.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32962 entries, 0 to 32961
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      32962 non-null  float64
 1   RXDUSE    32962 non-null  float64
 2   RXDDRUG   32962 non-null  object 
 3   RXDDRGID  32962 non-null  object 
 4   RXQSEEN   24037 non-null  float64
 5   RXDDAYS   24031 non-null  float64
 6   RXDRSC1   32962 non-null  object 
 7   RXDRSC2   32962 non-null  object 
 8   RXDRSC3   32962 non-null  object 
 9   RXDRSD1   32962 non-null  object 
 10  RXDRSD2   32962 non-null  object 
 11  RXDRSD3   32962 non-null  object 
 12  RXDCOUNT  24246 non-null  float64
dtypes: float64(5), object(8)
memory usage: 3.3+ MB
Rows, Columns: (32962, 13)


### Physical Activity:

In [37]:
# Load the physical-activity questionnaire (XPT file).
# Forward slash instead of a backslash: portable across operating systems and
# free of the unrecognised "\P" escape the original literal contained.
Physical_Activity = pd.read_sas("Datasets/Phys_Activity.xpt")
Physical_Activity.head()

Unnamed: 0,SEQN,PAQ605,PAQ610,PAD615,PAQ620,PAQ625,PAD630,PAQ635,PAQ640,PAD645,PAQ650,PAQ655,PAD660,PAQ665,PAQ670,PAD675,PAD680
0,109266.0,2.0,,,2.0,,,2.0,,,1.0,5.0,60.0,1.0,4.0,30.0,480.0
1,109267.0,2.0,,,2.0,,,2.0,,,1.0,6.0,90.0,2.0,,,540.0
2,109268.0,1.0,5.0,540.0,1.0,5.0,300.0,2.0,,,2.0,,,2.0,,,540.0
3,109271.0,2.0,,,1.0,2.0,120.0,2.0,,,2.0,,,2.0,,,60.0
4,109273.0,1.0,3.0,240.0,2.0,,,2.0,,,2.0,,,1.0,4.0,120.0,180.0


In [38]:
# Per-column dtype / non-null counts for the physical-activity table, then its
# (rows, columns) shape. Many columns are sparsely populated (see counts below).
Physical_Activity.info()
print(f"Rows, Columns: {Physical_Activity.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9693 entries, 0 to 9692
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SEQN    9693 non-null   float64
 1   PAQ605  9693 non-null   float64
 2   PAQ610  2418 non-null   float64
 3   PAD615  2402 non-null   float64
 4   PAQ620  9693 non-null   float64
 5   PAQ625  4187 non-null   float64
 6   PAD630  4164 non-null   float64
 7   PAQ635  9693 non-null   float64
 8   PAQ640  2253 non-null   float64
 9   PAD645  2241 non-null   float64
 10  PAQ650  9693 non-null   float64
 11  PAQ655  2422 non-null   float64
 12  PAD660  2417 non-null   float64
 13  PAQ665  9693 non-null   float64
 14  PAQ670  3904 non-null   float64
 15  PAD675  3893 non-null   float64
 16  PAD680  9676 non-null   float64
dtypes: float64(17)
memory usage: 1.3 MB
Rows, Columns: (9693, 17)


### Demographics Data:

In [39]:
# Load the demographics table (XPT file).
# Forward slash instead of a backslash: portable across operating systems and
# free of the unrecognised "\D" escape the original literal contained.
# The file name spelling ("Demograpics") is kept as-is -- presumably it matches
# the file on disk; verify before renaming.
Demographics = pd.read_sas("Datasets/Demograpics.xpt")
Demographics.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,DMDBORN4,...,FIAINTRP,MIALANG,MIAPROXY,MIAINTRP,AIALANGA,WTINTPRP,WTMECPRP,SDMVPSU,SDMVSTRA,INDFMPIR
0,109263.0,66.0,2.0,1.0,2.0,,5.0,6.0,2.0,1.0,...,2.0,,,,,7891.762435,8951.816,3.0,156.0,4.66
1,109264.0,66.0,2.0,2.0,13.0,,1.0,1.0,2.0,1.0,...,2.0,1.0,2.0,2.0,1.0,11689.747264,12271.16,1.0,155.0,0.83
2,109265.0,66.0,2.0,1.0,2.0,,3.0,3.0,2.0,1.0,...,2.0,,,,,16273.825939,16658.76,1.0,157.0,3.06
3,109266.0,66.0,2.0,2.0,29.0,,5.0,6.0,2.0,2.0,...,2.0,1.0,2.0,2.0,1.0,7825.646112,8154.968,2.0,168.0,5.0
4,109267.0,66.0,1.0,2.0,21.0,,2.0,2.0,,2.0,...,2.0,,,,,26379.991724,5.397605e-79,1.0,156.0,5.0


In [40]:
# Per-column dtype / non-null counts for the demographics table, then its
# (rows, columns) shape.
Demographics.info()
print(f"Rows, Columns: {Demographics.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15560 entries, 0 to 15559
Data columns (total 29 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      15560 non-null  float64
 1   SDDSRVYR  15560 non-null  float64
 2   RIDSTATR  15560 non-null  float64
 3   RIAGENDR  15560 non-null  float64
 4   RIDAGEYR  15560 non-null  float64
 5   RIDAGEMN  987 non-null    float64
 6   RIDRETH1  15560 non-null  float64
 7   RIDRETH3  15560 non-null  float64
 8   RIDEXMON  14300 non-null  float64
 9   DMDBORN4  15560 non-null  float64
 10  DMDYRUSZ  3028 non-null   float64
 11  DMDEDUC2  9232 non-null   float64
 12  DMDMARTZ  9232 non-null   float64
 13  RIDEXPRG  1874 non-null   float64
 14  SIALANG   15560 non-null  float64
 15  SIAPROXY  15560 non-null  float64
 16  SIAINTRP  15560 non-null  float64
 17  FIALANG   14481 non-null  float64
 18  FIAPROXY  14481 non-null  float64
 19  FIAINTRP  14481 non-null  float64
 20  MIALANG   11000 non-null  fl

### Sleep Disorders: 

In [41]:
# Load the sleep-disorders questionnaire (XPT file).
# Forward slash instead of a backslash: portable across operating systems and
# free of the unrecognised "\S" escape the original literal contained.
# NOTE(review): the time-of-day columns load as raw bytes (e.g. b'22:00' in the
# preview); read_sas(..., encoding=...) would decode them -- left unchanged in
# case later cells rely on the bytes form.
Sleep_Disorders = pd.read_sas("Datasets/Sleep_Disorders.xpt")
Sleep_Disorders.head()

Unnamed: 0,SEQN,SLQ300,SLQ310,SLD012,SLQ320,SLQ330,SLD013,SLQ030,SLQ040,SLQ050,SLQ120
0,109266.0,b'22:00',b'05:30',7.5,b'23:00',b'07:00',8.0,1.0,5.397605e-79,2.0,5.397605e-79
1,109267.0,b'00:00',b'08:00',8.0,b'03:00',b'11:00',8.0,5.397605e-79,5.397605e-79,2.0,2.0
2,109268.0,b'22:00',b'06:30',8.5,b'23:00',b'07:00',8.0,5.397605e-79,5.397605e-79,2.0,1.0
3,109271.0,b'23:00',b'09:00',10.0,b'23:00',b'12:00',13.0,5.397605e-79,5.397605e-79,1.0,3.0
4,109273.0,b'08:00',b'14:35',6.5,b'21:00',b'05:00',8.0,5.397605e-79,5.397605e-79,1.0,2.0


In [42]:
# List the column names of the sleep-disorders table.
print(Sleep_Disorders.columns)

Index(['SEQN', 'SLQ300', 'SLQ310', 'SLD012', 'SLQ320', 'SLQ330', 'SLD013',
       'SLQ030', 'SLQ040', 'SLQ050', 'SLQ120'],
      dtype='object')


In [43]:
# Per-column dtype / non-null counts for the sleep-disorders table, then its
# (rows, columns) shape.
Sleep_Disorders.info()
print(f"Rows, Columns: {Sleep_Disorders.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10195 entries, 0 to 10194
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SEQN    10195 non-null  float64
 1   SLQ300  10195 non-null  object 
 2   SLQ310  10195 non-null  object 
 3   SLD012  10105 non-null  float64
 4   SLQ320  10195 non-null  object 
 5   SLQ330  10195 non-null  object 
 6   SLD013  10099 non-null  float64
 7   SLQ030  10195 non-null  float64
 8   SLQ040  10195 non-null  float64
 9   SLQ050  10195 non-null  float64
 10  SLQ120  10195 non-null  float64
dtypes: float64(7), object(4)
memory usage: 876.3+ KB
Rows, Columns: (10195, 11)


## Initial Observations and concerns:

The datasets vary widely in the number of rows/observations they contain, ranging from 32962 rows (Prescription_Meds) down to 9693 rows (Physical_Activity). Things are further complicated by the fact that we plan to merge the datasets on the SEQN column (respondent sequence numbers) in order to match up specific respondents across datasets. Thus, if we want to analyze data from a column in Physical_Activity alongside data from the larger datasets, we will either be left with many missing values or lose quite a bit of our sample size. A partial remedy could be to create several merged datasets, each tailored to a specific analysis we want to perform. Similarly, we could pull out just the columns of interest and create a separate dataframe for each analysis. Lastly, a number of variables are coded in a convoluted way, as is usually the case with questionnaire data. In light of these considerations, it is clear that we will have quite a bit of data wrangling and cleaning to do before we can begin our analyses.

I am going to merge all of the datasets together below, using a left join onto Prescription_Meds to retain the maximum number of rows (32962), even though many rows will have missing values in the columns that come from the smaller datasets.



## Initial Merge of Prescription_Meds, Demographics, Physical_Activity, Sleep_Disorders:

### Left Join 

In [44]:
# Left-join the other three tables onto Prescription_Meds, one at a time, using
# the respondent sequence number (SEQN) as the key so that every
# Prescription_Meds row is retained.

# First merge: Prescription_Meds + Demographics
First_Merge = pd.merge(Prescription_Meds, Demographics, on=["SEQN"], how="left")
print(f"First merge: {First_Merge.shape}")

# Second merge: add Physical_Activity
Second_Merge = pd.merge(First_Merge, Physical_Activity, on=["SEQN"], how="left")
print(f"Second merge: {Second_Merge.shape}")

# Final merge: add Sleep_Disorders
Merged_Data = pd.merge(Second_Merge, Sleep_Disorders, on=["SEQN"], how="left")
print(f"Final merge: {Merged_Data.shape}")

# Guard the invariant this section relies on: a left join never drops left rows,
# but it silently multiplies them if a right-hand table repeats a SEQN. Checking
# the final row count covers all three merges, since none of them can shrink it.
assert len(Merged_Data) == len(Prescription_Meds), (
    "Left joins changed the row count: "
    f"{len(Prescription_Meds)} -> {len(Merged_Data)}; "
    "check the right-hand tables for duplicated SEQN values."
)

First merge: (32962, 41)
Second merge: (32962, 57)
Final merge: (32962, 67)


## Exploring the Merged Data

In [45]:
# Per-column dtype / non-null counts for the merged frame; the non-null counts
# show how sparse the columns from the smaller tables are after the left joins.
Merged_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32962 entries, 0 to 32961
Data columns (total 67 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      32962 non-null  float64
 1   RXDUSE    32962 non-null  float64
 2   RXDDRUG   32962 non-null  object 
 3   RXDDRGID  32962 non-null  object 
 4   RXQSEEN   24037 non-null  float64
 5   RXDDAYS   24031 non-null  float64
 6   RXDRSC1   32962 non-null  object 
 7   RXDRSC2   32962 non-null  object 
 8   RXDRSC3   32962 non-null  object 
 9   RXDRSD1   32962 non-null  object 
 10  RXDRSD2   32962 non-null  object 
 11  RXDRSD3   32962 non-null  object 
 12  RXDCOUNT  24246 non-null  float64
 13  SDDSRVYR  32962 non-null  float64
 14  RIDSTATR  32962 non-null  float64
 15  RIAGENDR  32962 non-null  float64
 16  RIDAGEYR  32962 non-null  float64
 17  RIDAGEMN  1057 non-null   float64
 18  RIDRETH1  32962 non-null  float64
 19  RIDRETH3  32962 non-null  float64
 20  RIDEXMON  30449 non-null  fl

#### As stated previously, we can always modify how we manipulate specific data/columns to preserve data later on.

## Exporting the Merged Dataframe into a CSV File

In [46]:
# List every column in the merged frame before writing it to disk.
print(Merged_Data.columns)

Index(['SEQN', 'RXDUSE', 'RXDDRUG', 'RXDDRGID', 'RXQSEEN', 'RXDDAYS',
       'RXDRSC1', 'RXDRSC2', 'RXDRSC3', 'RXDRSD1', 'RXDRSD2', 'RXDRSD3',
       'RXDCOUNT', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'DMDBORN4', 'DMDYRUSZ', 'DMDEDUC2',
       'DMDMARTZ', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG',
       'FIAPROXY', 'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA',
       'WTINTPRP', 'WTMECPRP', 'SDMVPSU', 'SDMVSTRA', 'INDFMPIR', 'PAQ605',
       'PAQ610', 'PAD615', 'PAQ620', 'PAQ625', 'PAD630', 'PAQ635', 'PAQ640',
       'PAD645', 'PAQ650', 'PAQ655', 'PAD660', 'PAQ665', 'PAQ670', 'PAD675',
       'PAD680', 'SLQ300', 'SLQ310', 'SLD012', 'SLQ320', 'SLQ330', 'SLD013',
       'SLQ030', 'SLQ040', 'SLQ050', 'SLQ120'],
      dtype='object')


In [47]:
# Write the merged DataFrame to disk as UTF-8 CSV text, with a header row and
# without the row index. Note the target path has no ".csv" extension; the
# read_csv call in the data-cleaning section uses this exact path.
Merged_Data.to_csv("Datasets/Merged_Dataset",
                  encoding="utf-8", index=False, header=True)

## Data Cleaning: 
### Importing Merged Dataset - Columns of Interest

In [48]:
# Columns of interest, grouped by the table each one originally came from.
rx_columns = ['RXDUSE', 'RXDDAYS', 'RXDCOUNT']
demographic_columns = [
    'RIAGENDR', 'RIDAGEYR', 'RIDRETH3', 'DMDEDUC2',
    'DMDMARTZ', 'WTINTPRP', 'WTMECPRP', 'INDFMPIR',
]
activity_columns = [
    'PAQ605', 'PAQ610', 'PAD615', 'PAQ620',
    'PAQ625', 'PAD630', 'PAQ635', 'PAQ640',
    'PAD645', 'PAQ650', 'PAQ655', 'PAD660',
    'PAQ665', 'PAQ670', 'PAD675', 'PAD680',
]
sleep_columns = [
    'SLQ300', 'SLQ310', 'SLD012', 'SLQ320', 'SLQ330',
    'SLD013', 'SLQ030', 'SLQ040', 'SLQ050', 'SLQ120',
]

# SEQN (the respondent key) first, then each group in turn.
columns_of_interest = (
    ['SEQN'] + rx_columns + demographic_columns + activity_columns + sleep_columns
)

# Re-read the merged CSV, loading only the selected columns.
Distilled_Data = pd.read_csv("Datasets/Merged_Dataset", usecols=columns_of_interest)

Distilled_Data.head()


Unnamed: 0,SEQN,RXDUSE,RXDDAYS,RXDCOUNT,RIAGENDR,RIDAGEYR,RIDRETH3,DMDEDUC2,DMDMARTZ,WTINTPRP,...,SLQ300,SLQ310,SLD012,SLQ320,SLQ330,SLD013,SLQ030,SLQ040,SLQ050,SLQ120
0,109263.0,2.0,,,1.0,2.0,6.0,,,7891.762435,...,,,,,,,,,,
1,109264.0,2.0,,,2.0,13.0,1.0,,,11689.747264,...,,,,,,,,,,
2,109265.0,2.0,,,1.0,2.0,3.0,,,16273.825939,...,,,,,,,,,,
3,109266.0,2.0,,,2.0,29.0,6.0,5.0,3.0,7825.646112,...,b'22:00',b'05:30',7.5,b'23:00',b'07:00',8.0,1.0,5.397605e-79,2.0,5.397605e-79
4,109267.0,1.0,,1.0,2.0,21.0,2.0,4.0,3.0,26379.991724,...,b'00:00',b'08:00',8.0,b'03:00',b'11:00',8.0,5.397605e-79,5.397605e-79,2.0,2.0


## Renaming Variables 

In [49]:
# Map each coded survey column name to a human-readable label.
column_labels = {
    # Respondent key
    'SEQN': "Respondent ID",
    # Prescription medications
    'RXDUSE': "Taken Rx Past Month",
    'RXDDAYS': "Number Days Taken Rx",
    'RXDCOUNT': "Number of Rx Meds",
    # Demographics
    'RIAGENDR': "Gender",
    'RIDAGEYR': "Age at Screening",
    'RIDRETH3': "Race",
    'DMDEDUC2': "Educational Attainment",
    'DMDMARTZ': "Marital Status",
    'WTINTPRP': "Sample Interview Weight",
    'WTMECPRP': "MEC Sample Weight",
    'INDFMPIR': "Family Income to Poverty Ratio",
    # Physical activity
    'PAQ605': "Vigorous Work Activity",
    'PAQ610': "Number of Days Vigorous Work",
    'PAD615': "Minutes of Vigorous Work Daily",
    'PAQ620': "Moderate Work Activity",
    'PAQ625': "Number of Days Moderate Work",
    'PAD630': "Minutes of Moderate Work Daily",
    'PAQ635': "Walk or Bike Travel",
    'PAQ640': "Number of Days Walk or Bike Weekly",
    'PAD645': "Daily Minutes Walking or Biking",
    'PAQ650': "Vigorous Recreational Activity",
    'PAQ655': "Number of Days Vigorous Recreation",
    'PAD660': "Minutes of Vigorous Recreation Daily",
    'PAQ665': "Moderate Recreational Activity",
    'PAQ670': "Number of Days Moderate Recreation",
    'PAD675': "Minutes of Moderate Recreation Daily",
    'PAD680': "Minutes of Sedentary Activity Daily",
    # Sleep
    'SLQ300': "Bedtime Week/Workdays",
    'SLQ310': "Waketime Week/Workdays",
    'SLD012': "Sleep Hours Week/Workdays",
    'SLQ320': "Bedtime Weekends",
    'SLQ330': "Waketime Weekends",
    'SLD013': "Sleep Hours Weekends",
    'SLQ030': "Snoring Frequency Past Year",
    'SLQ040': "Frequency Snort/Stop Breathing Past Year",
    'SLQ050': "Told Doctor About Sleep Issues",
    'SLQ120': "Feeling Tired During Day Frequency",
}

# rename() returns a new frame; Distilled_Data itself is left untouched.
Distilled_Data_Renamed = Distilled_Data.rename(columns=column_labels)

Distilled_Data_Renamed.head()

Unnamed: 0,Respondent ID,Taken Rx Past Month,Number Days Taken Rx,Number of Rx Meds,Gender,Age at Screening,Race,Educational Attainment,Marital Status,Sample Interview Weight,...,Bedtime Week/Workdays,Waketime Week/Workdays,Sleep Hours Week/Workdays,Bedtime Weekends,Waketime Weekends,Sleep Hours Weekends,Snoring Frequency Past Year,Frequency Snort/Stop Breathing Past Year,Told Doctor About Sleep Issues,Feeling Tired During Day Frequency
0,109263.0,2.0,,,1.0,2.0,6.0,,,7891.762435,...,,,,,,,,,,
1,109264.0,2.0,,,2.0,13.0,1.0,,,11689.747264,...,,,,,,,,,,
2,109265.0,2.0,,,1.0,2.0,3.0,,,16273.825939,...,,,,,,,,,,
3,109266.0,2.0,,,2.0,29.0,6.0,5.0,3.0,7825.646112,...,b'22:00',b'05:30',7.5,b'23:00',b'07:00',8.0,1.0,5.397605e-79,2.0,5.397605e-79
4,109267.0,1.0,,1.0,2.0,21.0,2.0,4.0,3.0,26379.991724,...,b'00:00',b'08:00',8.0,b'03:00',b'11:00',8.0,5.397605e-79,5.397605e-79,2.0,2.0


In [50]:
# Overview of the renamed frame and its variables.
# info() prints its report directly, so it can run first; describe() only
# returns a frame, which the notebook renders solely when it is the cell's last
# expression. In the original order the describe() result was discarded.
Distilled_Data_Renamed.info()
Distilled_Data_Renamed.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32962 entries, 0 to 32961
Data columns (total 38 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Respondent ID                             32962 non-null  float64
 1   Taken Rx Past Month                       32962 non-null  float64
 2   Number Days Taken Rx                      24031 non-null  float64
 3   Number of Rx Meds                         24246 non-null  float64
 4   Gender                                    32962 non-null  float64
 5   Age at Screening                          32962 non-null  float64
 6   Race                                      32962 non-null  float64
 7   Educational Attainment                    25746 non-null  float64
 8   Marital Status                            25746 non-null  float64
 9   Sample Interview Weight                   32962 non-null  float64
 10  MEC Sample Weight                 

## Exporting Dataframe as CSV

In [51]:
# Save the filtered, renamed frame as UTF-8 CSV text (header row, no index).
# The path has no ".csv" extension; the next section reads this exact path.
Distilled_Data_Renamed.to_csv("Datasets/Filtered_Renamed_Data",
                  encoding="utf-8", index=False, header=True)

### Dropping Columns 
#### Melissa 

In [52]:
# Re-load the filtered, renamed data from disk into a DataFrame.
Dataset = pd.read_csv("Datasets/Filtered_Renamed_Data")

# Preview the first rows
Dataset.head()

Unnamed: 0,Respondent ID,Taken Rx Past Month,Number Days Taken Rx,Number of Rx Meds,Gender,Age at Screening,Race,Educational Attainment,Marital Status,Sample Interview Weight,...,Bedtime Week/Workdays,Waketime Week/Workdays,Sleep Hours Week/Workdays,Bedtime Weekends,Waketime Weekends,Sleep Hours Weekends,Snoring Frequency Past Year,Frequency Snort/Stop Breathing Past Year,Told Doctor About Sleep Issues,Feeling Tired During Day Frequency
0,109263.0,2.0,,,1.0,2.0,6.0,,,7891.762435,...,,,,,,,,,,
1,109264.0,2.0,,,2.0,13.0,1.0,,,11689.747264,...,,,,,,,,,,
2,109265.0,2.0,,,1.0,2.0,3.0,,,16273.825939,...,,,,,,,,,,
3,109266.0,2.0,,,2.0,29.0,6.0,5.0,3.0,7825.646112,...,b'22:00',b'05:30',7.5,b'23:00',b'07:00',8.0,1.0,5.397605e-79,2.0,5.397605e-79
4,109267.0,1.0,,1.0,2.0,21.0,2.0,4.0,3.0,26379.991724,...,b'00:00',b'08:00',8.0,b'03:00',b'11:00',8.0,5.397605e-79,5.397605e-79,2.0,2.0


In [53]:
# List the available columns before deciding which ones to drop.
Dataset.columns

Index(['Respondent ID', 'Taken Rx Past Month', 'Number Days Taken Rx',
       'Number of Rx Meds', 'Gender', 'Age at Screening', 'Race',
       'Educational Attainment', 'Marital Status', 'Sample Interview Weight',
       'MEC Sample Weight', 'Family Income to Poverty Ratio',
       'Vigorous Work Activity', 'Number of Days Vigorous Work',
       'Minutes of Vigorous Work Daily', 'Moderate Work Activity',
       'Number of Days Moderate Work', 'Minutes of Moderate Work Daily',
       'Walk or Bike Travel', 'Number of Days Walk or Bike Weekly',
       'Daily Minutes Walking or Biking', 'Vigorous Recreational Activity',
       'Number of Days Vigorous Recreation',
       'Minutes of Vigorous Recreation Daily',
       'Moderate Recreational Activity', 'Number of Days Moderate Recreation',
       'Minutes of Moderate Recreation Daily',
       'Minutes of Sedentary Activity Daily', 'Bedtime Week/Workdays',
       'Waketime Week/Workdays', 'Sleep Hours Week/Workdays',
       'Bedtime Weekend

In [54]:
# Columns not needed for the remaining analysis.
columns_to_drop = [
    'Taken Rx Past Month',
    'Number Days Taken Rx',
    'Sample Interview Weight',
    'MEC Sample Weight',
    'Vigorous Work Activity',
    'Number of Days Vigorous Work',
    'Moderate Work Activity',
    'Number of Days Moderate Work',
    'Walk or Bike Travel',
    'Number of Days Walk or Bike Weekly',
    'Daily Minutes Walking or Biking',
    'Marital Status',
    'Vigorous Recreational Activity',
    'Number of Days Vigorous Recreation',
    'Moderate Recreational Activity',
    'Number of Days Moderate Recreation',
    'Bedtime Week/Workdays',
    'Waketime Week/Workdays',
    'Bedtime Weekends',
    'Waketime Weekends',
    'Snoring Frequency Past Year',
    'Frequency Snort/Stop Breathing Past Year',
]

# drop() returns a new frame; Dataset itself keeps every column.
# NOTE(review): the preview shows identical rows repeated for one Respondent ID
# (e.g. 124821). Confirm whether duplicates should be removed before analysis.
Dataset_update = Dataset.drop(columns=columns_to_drop)
Dataset_update

Unnamed: 0,Respondent ID,Number of Rx Meds,Gender,Age at Screening,Race,Educational Attainment,Family Income to Poverty Ratio,Minutes of Vigorous Work Daily,Minutes of Moderate Work Daily,Minutes of Vigorous Recreation Daily,Minutes of Moderate Recreation Daily,Minutes of Sedentary Activity Daily,Sleep Hours Week/Workdays,Sleep Hours Weekends,Told Doctor About Sleep Issues,Feeling Tired During Day Frequency
0,109263.0,,1.0,2.0,6.0,,4.66,,,,,,,,,
1,109264.0,,2.0,13.0,1.0,,0.83,,,,,,,,,
2,109265.0,,1.0,2.0,3.0,,3.06,,,,,,,,,
3,109266.0,,2.0,29.0,6.0,5.0,5.00,,,60.0,30.0,480.0,7.5,8.0,2.0,5.397605e-79
4,109267.0,1.0,2.0,21.0,2.0,4.0,5.00,,,90.0,,540.0,8.0,8.0,2.0,2.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32957,124821.0,4.0,1.0,63.0,4.0,2.0,3.71,10.0,,,,60.0,8.0,9.0,2.0,2.000000e+00
32958,124821.0,4.0,1.0,63.0,4.0,2.0,3.71,10.0,,,,60.0,8.0,9.0,2.0,2.000000e+00
32959,124821.0,4.0,1.0,63.0,4.0,2.0,3.71,10.0,,,,60.0,8.0,9.0,2.0,2.000000e+00
32960,124822.0,2.0,1.0,74.0,2.0,3.0,,,180.0,45.0,60.0,240.0,5.5,8.0,2.0,4.000000e+00


In [55]:

# Save the reduced frame as UTF-8 CSV text (header row, no index).
# The target path has no ".csv" extension.
Dataset_update.to_csv("Datasets/Dataset_update",
                  encoding="utf-8", index=False, header=True)