In [2]:
# Dependencies
import pandas as pd # type: ignore
from pathlib import Path
import pyreadstat  # type: ignore

## Importing Merged Dataset - Columns of Interest

In [7]:
# Columns kept from the merged dataset, grouped by topic in this order:
# prescription meds, demographics, physical activity, sleep disorders.
prescription_cols = ['RXDUSE', 'RXDDAYS', 'RXDCOUNT']
demographic_cols = [
    'RIAGENDR', 'RIDAGEYR', 'RIDRETH3', 'DMDEDUC2',
    'DMDMARTZ', 'WTINTPRP', 'WTMECPRP', 'INDFMPIR',
]
physical_activity_cols = [
    'PAQ605', 'PAQ610', 'PAD615', 'PAQ620',
    'PAQ625', 'PAD630', 'PAQ635', 'PAQ640',
    'PAD645', 'PAQ650', 'PAQ655', 'PAD660',
    'PAQ665', 'PAQ670', 'PAD675', 'PAD680',
]
sleep_cols = [
    'SLQ300', 'SLQ310', 'SLD012', 'SLQ320', 'SLQ330',
    'SLD013', 'SLQ030', 'SLQ040', 'SLQ050', 'SLQ120',
]

# SEQN comes first, followed by the four topic groups.
columns_of_interest = (
    ['SEQN']
    + prescription_cols
    + demographic_cols
    + physical_activity_cols
    + sleep_cols
)

# Read only the selected columns from the merged file.
Distilled_Data = pd.read_csv("Datasets/Merged_Dataset", usecols=columns_of_interest)

Distilled_Data.head()


Unnamed: 0,SEQN,RXDUSE,RXDDAYS,RXDCOUNT,RIAGENDR,RIDAGEYR,RIDRETH3,DMDEDUC2,DMDMARTZ,WTINTPRP,...,SLQ300,SLQ310,SLD012,SLQ320,SLQ330,SLD013,SLQ030,SLQ040,SLQ050,SLQ120
0,109263.0,2.0,,,1.0,2.0,6.0,,,7891.762435,...,,,,,,,,,,
1,109264.0,2.0,,,2.0,13.0,1.0,,,11689.747264,...,,,,,,,,,,
2,109265.0,2.0,,,1.0,2.0,3.0,,,16273.825939,...,,,,,,,,,,
3,109266.0,2.0,,,2.0,29.0,6.0,5.0,3.0,7825.646112,...,b'22:00',b'05:30',7.5,b'23:00',b'07:00',8.0,1.0,5.397605e-79,2.0,5.397605e-79
4,109267.0,1.0,,1.0,2.0,21.0,2.0,4.0,3.0,26379.991724,...,b'00:00',b'08:00',8.0,b'03:00',b'11:00',8.0,5.397605e-79,5.397605e-79,2.0,2.0


## Renaming Variables 

In [9]:
# Readable label for every raw variable code, grouped by topic
# (identifier, prescription meds, demographics, physical activity, sleep).
column_labels = {
    # Identifier
    'SEQN': "Respondent ID",
    # Prescription medications
    'RXDUSE': "Taken Rx Past Month",
    'RXDDAYS': "Number Days Taken Rx",
    'RXDCOUNT': "Number of Rx Meds",
    # Demographics
    'RIAGENDR': "Gender",
    'RIDAGEYR': "Age at Screening",
    'RIDRETH3': "Race",
    'DMDEDUC2': "Educational Attainment",
    'DMDMARTZ': "Marital Status",
    'WTINTPRP': "Sample Interview Weight",
    'WTMECPRP': "MEC Sample Weight",
    'INDFMPIR': "Family Income to Poverty Ratio",
    # Physical activity
    'PAQ605': "Vigorous Work Activity",
    'PAQ610': "Number of Days Vigorous Work",
    'PAD615': "Minutes of Vigorous Work Daily",
    'PAQ620': "Moderate Work Activity",
    'PAQ625': "Number of Days Moderate Work",
    'PAD630': "Minutes of Moderate Work Daily",
    'PAQ635': "Walk or Bike Travel",
    'PAQ640': "Number of Days Walk or Bike Weekly",
    'PAD645': "Daily Minutes Walking or Biking",
    'PAQ650': "Vigorous Recreational Activity",
    'PAQ655': "Number of Days Vigorous Recreation",
    'PAD660': "Minutes of Vigorous Recreation Daily",
    'PAQ665': "Moderate Recreational Activity",
    'PAQ670': "Number of Days Moderate Recreation",
    'PAD675': "Minutes of Moderate Recreation Daily",
    'PAD680': "Minutes of Sedentary Activity Daily",
    # Sleep
    'SLQ300': "Bedtime Week/Workdays",
    'SLQ310': "Waketime Week/Workdays",
    'SLD012': "Sleep Hours Week/Workdays",
    'SLQ320': "Bedtime Weekends",
    'SLQ330': "Waketime Weekends",
    'SLD013': "Sleep Hours Weekends",
    'SLQ030': "Snoring Frequency Past Year",
    'SLQ040': "Frequency Snort/Stop Breathing Past Year",
    'SLQ050': "Told Doctor About Sleep Issues",
    'SLQ120': "Feeling Tired During Day Frequency",
}

# rename() returns a new frame, so Distilled_Data itself is left unchanged.
# errors="raise" turns a mistyped or missing variable code into a KeyError;
# with the pandas default (errors="ignore") such a column would silently keep
# its raw name and end up un-renamed in the exported file.
Distilled_Data_Renamed = Distilled_Data.rename(columns=column_labels, errors="raise")

Distilled_Data_Renamed.head()

Unnamed: 0,Respondent ID,Taken Rx Past Month,Number Days Taken Rx,Number of Rx Meds,Gender,Age at Screening,Race,Educational Attainment,Marital Status,Sample Interview Weight,...,Bedtime Week/Workdays,Waketime Week/Workdays,Sleep Hours Week/Workdays,Bedtime Weekends,Waketime Weekends,Sleep Hours Weekends,Snoring Frequency Past Year,Frequency Snort/Stop Breathing Past Year,Told Doctor About Sleep Issues,Feeling Tired During Day Frequency
0,109263.0,2.0,,,1.0,2.0,6.0,,,7891.762435,...,,,,,,,,,,
1,109264.0,2.0,,,2.0,13.0,1.0,,,11689.747264,...,,,,,,,,,,
2,109265.0,2.0,,,1.0,2.0,3.0,,,16273.825939,...,,,,,,,,,,
3,109266.0,2.0,,,2.0,29.0,6.0,5.0,3.0,7825.646112,...,b'22:00',b'05:30',7.5,b'23:00',b'07:00',8.0,1.0,5.397605e-79,2.0,5.397605e-79
4,109267.0,1.0,,1.0,2.0,21.0,2.0,4.0,3.0,26379.991724,...,b'00:00',b'08:00',8.0,b'03:00',b'11:00',8.0,5.397605e-79,5.397605e-79,2.0,2.0


In [12]:
# Overview of df and variables:
# info() prints its summary (column dtypes, non-null counts) directly, so it can
# run anywhere in the cell. describe() only returns a DataFrame, and a notebook
# renders a returned value only when it is the cell's final expression, so it
# has to come last for its summary statistics to appear.
Distilled_Data_Renamed.info()
Distilled_Data_Renamed.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32962 entries, 0 to 32961
Data columns (total 38 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Respondent ID                             32962 non-null  float64
 1   Taken Rx Past Month                       32962 non-null  float64
 2   Number Days Taken Rx                      24031 non-null  float64
 3   Number of Rx Meds                         24246 non-null  float64
 4   Gender                                    32962 non-null  float64
 5   Age at Screening                          32962 non-null  float64
 6   Race                                      32962 non-null  float64
 7   Educational Attainment                    25746 non-null  float64
 8   Marital Status                            25746 non-null  float64
 9   Sample Interview Weight                   32962 non-null  float64
 10  MEC Sample Weight                 

## Exporting Dataframe as CSV

In [10]:
# Save the renamed frame as UTF-8 CSV: header row written, row index left out.
Distilled_Data_Renamed.to_csv(
    path_or_buf="Datasets/Filtered_Renamed_Data",
    header=True,
    index=False,
    encoding="utf-8",
)