In [1]:
#Import the required packages
#Import package pandas for data analysis
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages

import scipy.stats as ss
import seaborn as sn


import matplotlib.patches as mpatches




In [2]:
# Reading from a csv file, into a data frame
# setting parameters to remove white space from the beginning and end of column names
df = pd.read_csv('covid19-cdc-20204883.csv', keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)

In [3]:
#Checking how many rows and columns the dataset has
df.shape

(10000, 12)

In [4]:
#printing first 5 rows in the dataset
df.head(5)

Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/12/30,2020/12/30,2021/01/02,2020/12/30,Laboratory-confirmed case,Male,40 - 49 Years,"White, Non-Hispanic",No,No,No,Yes
1,2020/12/21,2020/12/29,2020/12/22,2020/12/21,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,No
2,2020/11/20,2020/11/26,,,Probable Case,Female,30 - 39 Years,Unknown,No,Unknown,No,Missing
3,2020/12/17,,,,Laboratory-confirmed case,Male,10 - 19 Years,Unknown,Unknown,Missing,No,Missing
4,2020/04/24,2020/05/04,,2020/04/24,Laboratory-confirmed case,Female,50 - 59 Years,"White, Non-Hispanic",No,Unknown,No,Yes


In [5]:
#checking datatypes of the features
df.dtypes

cdc_case_earliest_dt       object
cdc_report_dt              object
pos_spec_dt                object
onset_dt                   object
current_status             object
sex                        object
age_group                  object
race_ethnicity_combined    object
hosp_yn                    object
icu_yn                     object
death_yn                   object
medcond_yn                 object
dtype: object

In [6]:
#changing the categorical columns to category type
df['current_status'] = df['current_status'].astype('category')
df['sex'] = df['sex'].astype('category')
df['age_group'] = df['age_group'].astype('category')
df['race_ethnicity_combined'] = df['race_ethnicity_combined'].astype('category')
df['hosp_yn'] = df['hosp_yn'].astype('category')
df['icu_yn'] = df['icu_yn'].astype('category')
df['death_yn'] = df['death_yn'].astype('category')
df['medcond_yn'] = df['medcond_yn'].astype('category')

In [7]:
#changing the slash so I can convert the format of the columns to datetime
df['cdc_case_earliest_dt'] = df['cdc_case_earliest_dt'].str.replace('/', '')
df['cdc_report_dt'] = df['cdc_report_dt'].str.replace('/', '')
df['pos_spec_dt'] = df['pos_spec_dt'].str.replace('/', '')
df['onset_dt'] = df['onset_dt'].str.replace('/', '')

In [8]:
#changing the dates columns format to datetime 
df['cdc_case_earliest_dt'] = pd.to_datetime(df['cdc_case_earliest_dt'], format='%Y%m%d')
df['cdc_report_dt'] = pd.to_datetime(df['cdc_report_dt'], format='%Y%m%d')
df['pos_spec_dt'] = pd.to_datetime(df['pos_spec_dt'], format='%Y%m%d')
df['onset_dt'] = pd.to_datetime(df['onset_dt'], format='%Y%m%d')

In [9]:
#checking how many duplicate rows there is in the dataaset.
print('Number of duplicate (excluding first) rows in the table is: ', df.duplicated().sum())

Number of duplicate (excluding first) rows in the table is:  438


In [10]:
#Print the number of duplicates, without the original rows that were duplicated
print('Number of duplicate (excluding first) rows in the table is: ', df.duplicated().sum())

# Check for duplicate rows. 
# Use "keep=False" to mark all duplicates as true, including the original rows that were duplicated.
print('Number of duplicate rows (including first) in the table is:', df[df.duplicated(keep=False)].shape[0])

Number of duplicate (excluding first) rows in the table is:  438
Number of duplicate rows (including first) in the table is: 790


In [11]:
#Look at the categorical features only
category_columns = df.select_dtypes(['category']).columns
df[category_columns].head()

Unnamed: 0,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,Laboratory-confirmed case,Male,40 - 49 Years,"White, Non-Hispanic",No,No,No,Yes
1,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,No
2,Probable Case,Female,30 - 39 Years,Unknown,No,Unknown,No,Missing
3,Laboratory-confirmed case,Male,10 - 19 Years,Unknown,Unknown,Missing,No,Missing
4,Laboratory-confirmed case,Female,50 - 59 Years,"White, Non-Hispanic",No,Unknown,No,Yes


In [12]:
#Printing table with categorical statistics
df[category_columns].describe().T

Unnamed: 0,count,unique,top,freq
current_status,10000,2,Laboratory-confirmed case,9307
sex,10000,4,Female,5213
age_group,10000,10,20 - 29 Years,1846
race_ethnicity_combined,10000,9,Unknown,4030
hosp_yn,10000,4,No,5221
icu_yn,10000,4,Missing,7637
death_yn,10000,2,No,9684
medcond_yn,10000,4,Missing,7431


In [13]:
# Look once again at the summary stats table for categorical features
# Care is needed with .describe() for categorical features, it excludes the NaN values for features 
df_table_categ = df[category_columns].describe().T
df_table_categ

Unnamed: 0,count,unique,top,freq
current_status,10000,2,Laboratory-confirmed case,9307
sex,10000,4,Female,5213
age_group,10000,10,20 - 29 Years,1846
race_ethnicity_combined,10000,9,Unknown,4030
hosp_yn,10000,4,No,5221
icu_yn,10000,4,Missing,7637
death_yn,10000,2,No,9684
medcond_yn,10000,4,Missing,7431


In [14]:
# store the 2ndmode and 2ndmode % values in a new dataframe
#df_category_mode = pd.DataFrame(index = df_table_categ.index, columns=['mode', 'freq_mode','%mode', '2ndmode', 'freq_2ndmode','%2ndmode'])
df_category_mode = pd.DataFrame(index = df_table_categ.index, columns=['mode','%mode', '2ndmode','%2ndmode'])
df_category_mode

Unnamed: 0,mode,%mode,2ndmode,%2ndmode
current_status,,,,
sex,,,,
age_group,,,,
race_ethnicity_combined,,,,
hosp_yn,,,,
icu_yn,,,,
death_yn,,,,
medcond_yn,,,,


In [15]:
# Look at the values taken by each categorical feature
for column in category_columns:
    print("\n" + column)
    print(df[column].value_counts())
    print(df[column].value_counts().keys())
    
    df_category_mode.loc[column]['mode'] = df[column].value_counts().keys()[0]
    #df_category_mode.loc[column]['freq_mode'] = df[column].value_counts()[df_category_mode.loc[column]['mode']]
    df_category_mode.loc[column]['%mode'] = df[column].value_counts(normalize=True)[df_category_mode.loc[column]['mode']]
 

    if df[column].value_counts().size > 1:
        df_category_mode.loc[column]['2ndmode'] = df[column].value_counts().keys()[1]
        #df_category_mode.loc[column]['freq_2ndmode'] = df[column].value_counts()[df_category_mode.loc[column]['2ndmode']]
        df_category_mode.loc[column]['%2ndmode'] = df[column].value_counts(normalize=True)[df_category_mode.loc[column]['2ndmode']]
    else: df_category_mode.loc[column] = '-'


current_status
Laboratory-confirmed case    9307
Probable Case                 693
Name: current_status, dtype: int64
CategoricalIndex(['Laboratory-confirmed case', 'Probable Case'], categories=['Laboratory-confirmed case', 'Probable Case'], ordered=False, dtype='category')

sex
Female     5213
Male       4706
Unknown      66
Missing      15
Name: sex, dtype: int64
CategoricalIndex(['Female', 'Male', 'Unknown', 'Missing'], categories=['Female', 'Male', 'Missing', 'Unknown'], ordered=False, dtype='category')

age_group
20 - 29 Years    1846
30 - 39 Years    1650
40 - 49 Years    1469
50 - 59 Years    1400
60 - 69 Years    1072
10 - 19 Years    1055
70 - 79 Years     580
80+ Years         477
0 - 9 Years       440
Missing            11
Name: age_group, dtype: int64
CategoricalIndex(['20 - 29 Years', '30 - 39 Years', '40 - 49 Years',
                  '50 - 59 Years', '60 - 69 Years', '10 - 19 Years',
                  '70 - 79 Years', '80+ Years', '0 - 9 Years', 'Missing'],
            

In [16]:
df_category_mode

Unnamed: 0,mode,%mode,2ndmode,%2ndmode
current_status,Laboratory-confirmed case,0.9307,Probable Case,0.0693
sex,Female,0.5213,Male,0.4706
age_group,20 - 29 Years,0.1846,30 - 39 Years,0.165
race_ethnicity_combined,Unknown,0.403,"White, Non-Hispanic",0.3344
hosp_yn,No,0.5221,Missing,0.2333
icu_yn,Missing,0.7637,Unknown,0.13
death_yn,No,0.9684,Yes,0.0316
medcond_yn,Missing,0.7431,No,0.0911


In [17]:
# Prepare %missing column
category_columns_perc_missing  = 100 * (df[category_columns] == 'Missing').sum()/df.shape[0] #(df[category_columns].isnull().sum()/df.shape[0])
#category_columns_perc_missing
category_columns_perc_missing
# Store the values in a dataframe
df_category_perc_missing = pd.DataFrame(category_columns_perc_missing, columns=['%missing'])
df_category_perc_missing

Unnamed: 0,%missing
current_status,0.0
sex,0.15
age_group,0.11
race_ethnicity_combined,0.99
hosp_yn,23.33
icu_yn,76.37
death_yn,0.0
medcond_yn,74.31


In [18]:
# Add cardinality column to the categorical features
# A simpler way to get the number of unique values per feature
categorical_columns_card = df[category_columns].nunique()
# store the values in a dataframe
df_categorical_card = pd.DataFrame(categorical_columns_card, columns=['card'])
df_categorical_card

Unnamed: 0,card
current_status,2
sex,4
age_group,10
race_ethnicity_combined,9
hosp_yn,4
icu_yn,4
death_yn,2
medcond_yn,4


In [19]:
# Prepare %Unknown column
category_columns_perc_unknown  = 100 * (df[category_columns] == 'Unknown').sum()/df.shape[0] 
category_columns_perc_unknown
# Store the values in a dataframe
df_category_perc_unknown = pd.DataFrame(category_columns_perc_unknown, columns=['%unknown'])
df_category_perc_unknown

Unnamed: 0,%unknown
current_status,0.0
sex,0.66
age_group,0.0
race_ethnicity_combined,40.3
hosp_yn,17.92
icu_yn,13.0
death_yn,0.0
medcond_yn,7.7


In [20]:
# Put the columns together to prepare the final table 
df_category_columns_data_quality_report_table = pd.concat([df_table_categ, df_category_mode, df_category_perc_missing, df_category_perc_unknown, df_categorical_card], axis=1)

# Print data quality report table for categorical features to a file.
df_category_columns_data_quality_report_table.to_csv("Homework-DataQualityReport-CategoricalFeatures-Table.csv", 
                                       index_label='Feature')

df_category_columns_data_quality_report_table

Unnamed: 0,count,unique,top,freq,mode,%mode,2ndmode,%2ndmode,%missing,%unknown,card
current_status,10000,2,Laboratory-confirmed case,9307,Laboratory-confirmed case,0.9307,Probable Case,0.0693,0.0,0.0,2
sex,10000,4,Female,5213,Female,0.5213,Male,0.4706,0.15,0.66,4
age_group,10000,10,20 - 29 Years,1846,20 - 29 Years,0.1846,30 - 39 Years,0.165,0.11,0.0,10
race_ethnicity_combined,10000,9,Unknown,4030,Unknown,0.403,"White, Non-Hispanic",0.3344,0.99,40.3,9
hosp_yn,10000,4,No,5221,No,0.5221,Missing,0.2333,23.33,17.92,4
icu_yn,10000,4,Missing,7637,Missing,0.7637,Unknown,0.13,76.37,13.0,4
death_yn,10000,2,No,9684,No,0.9684,Yes,0.0316,0.0,0.0,2
medcond_yn,10000,4,Missing,7431,Missing,0.7431,No,0.0911,74.31,7.7,4


In [21]:
#Keep only the numeric features.
numeric_columns = df.select_dtypes(['datetime64', 'timedelta64']).columns
numeric_columns

Index(['cdc_case_earliest_dt', 'cdc_report_dt', 'pos_spec_dt', 'onset_dt'], dtype='object')

In [22]:
# Select only the numeric features
df[numeric_columns].head()

Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt
0,2020-12-30,2020-12-30,2021-01-02,2020-12-30
1,2020-12-21,2020-12-29,2020-12-22,2020-12-21
2,2020-11-20,2020-11-26,NaT,NaT
3,2020-12-17,NaT,NaT,NaT
4,2020-04-24,2020-05-04,NaT,2020-04-24


In [23]:
#Missing data all features
df[numeric_columns].isnull().sum()

cdc_case_earliest_dt       0
cdc_report_dt           2327
pos_spec_dt             7163
onset_dt                4936
dtype: int64

In [24]:
# df.count() gives the number of rows with non-NAN feature value
# Instead of df.isnull().sum() we can use the functions df.shape and df.count()
# numeric_columns_missing = df.shape[0] - df[numeric_columns].count()

# Prepare %missing column
numeric_columns_missing  = 100 * (df[numeric_columns].isnull().sum()/df.shape[0])
# store the values in a dataframe
df_numeric_missing = pd.DataFrame(numeric_columns_missing, columns=['%missing'])
df_numeric_missing

Unnamed: 0,%missing
cdc_case_earliest_dt,0.0
cdc_report_dt,23.27
pos_spec_dt,71.63
onset_dt,49.36


In [25]:
# Add cardinality column
# A simpler way to get the number of unique values per feature
numeric_columns_card = df[numeric_columns].nunique()
# store the values in a dataframe
df_numeric_card = pd.DataFrame(numeric_columns_card, columns=['card'])
df_numeric_card

Unnamed: 0,card
cdc_case_earliest_dt,319
cdc_report_dt,322
pos_spec_dt,313
onset_dt,322


In [26]:
# Add missing values column
df_table_numeric = df[numeric_columns].describe(datetime_is_numeric=True).T
#final table or timedelta
#df_table_numeric_final = df[numeric_columns_final].describe(timedelta_is_numeric=True).T

# Put the columns together to prepare the final table for numeric_columns
df_numeric_columns_data_quality_report_table = pd.concat([df_table_numeric, df_numeric_missing, df_numeric_card], axis=1)

# Print data quality report table for numeric features to a file.
df_numeric_columns_data_quality_report_table.to_csv("Homework-DataQualityReport-NumericFeatures-Table.csv", 
                                        index_label='Feature')
df_numeric_columns_data_quality_report_table

Unnamed: 0,count,mean,min,25%,50%,75%,max,%missing,card
cdc_case_earliest_dt,10000,2020-10-04 16:25:58.080000000,2020-01-01,2020-07-25,2020-11-07,2020-12-15,2021-01-16,0.0,319
cdc_report_dt,7673,2020-10-16 05:30:29.323602176,2020-03-04,2020-08-14,2020-11-11,2020-12-21,2021-01-29,23.27,322
pos_spec_dt,2837,2020-09-17 07:27:10.595699712,2020-03-13,2020-07-03,2020-10-17,2020-12-04,2021-01-24,71.63,313
onset_dt,5064,2020-09-22 05:01:08.246445568,2020-01-01,2020-07-15,2020-10-21,2020-12-03,2021-01-27,49.36,322


In [27]:
# Keep a copy of the original data
df_raw = df.copy()
df_raw.columns

Index(['cdc_case_earliest_dt', 'cdc_report_dt', 'pos_spec_dt', 'onset_dt',
       'current_status', 'sex', 'age_group', 'race_ethnicity_combined',
       'hosp_yn', 'icu_yn', 'death_yn', 'medcond_yn'],
      dtype='object')

In [28]:
# Dropping the intended columns 
columns_to_drop = ['cdc_report_dt', 'pos_spec_dt', 'onset_dt']
for column in columns_to_drop:
    df_raw = df_raw.drop(column, 1)

In [29]:
df_raw

Unnamed: 0,cdc_case_earliest_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020-12-30,Laboratory-confirmed case,Male,40 - 49 Years,"White, Non-Hispanic",No,No,No,Yes
1,2020-12-21,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,No
2,2020-11-20,Probable Case,Female,30 - 39 Years,Unknown,No,Unknown,No,Missing
3,2020-12-17,Laboratory-confirmed case,Male,10 - 19 Years,Unknown,Unknown,Missing,No,Missing
4,2020-04-24,Laboratory-confirmed case,Female,50 - 59 Years,"White, Non-Hispanic",No,Unknown,No,Yes
...,...,...,...,...,...,...,...,...,...
9995,2020-11-01,Laboratory-confirmed case,Male,30 - 39 Years,"White, Non-Hispanic",No,Unknown,No,No
9996,2020-10-04,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,Yes
9997,2020-05-26,Laboratory-confirmed case,Male,30 - 39 Years,Unknown,Unknown,Missing,No,Missing
9998,2020-08-28,Probable Case,Female,30 - 39 Years,"Black, Non-Hispanic",No,Missing,No,Missing


**ICU_YN**

In [30]:
df_raw['icu_yn'].isnull().sum()

0

In [31]:
(df_raw['icu_yn'] == 'Missing').sum()

7637

In [32]:
df_raw['icu_yn'] = df_raw['icu_yn'].replace({'Missing' : 'Unknown'})

In [33]:
(df_raw['icu_yn'] == 'Missing').sum()

0

**Race_ethnicity_combined**

In [34]:
#df_raw['race_ethnicity_combined'] = df_raw['race_ethnicity_combined'].replace('Missing', 'Unknown' )
df_raw['race_ethnicity_combined'] = df_raw['race_ethnicity_combined'].replace({'Missing': 'Unknown'})

In [35]:
(df_raw['race_ethnicity_combined'] == 'Missing').sum()

0

In [36]:
df_raw['race_ethnicity_combined'].values

['White, Non-Hispanic', 'White, Non-Hispanic', 'Unknown', 'Unknown', 'White, Non-Hispanic', ..., 'White, Non-Hispanic', 'White, Non-Hispanic', 'Unknown', 'Black, Non-Hispanic', 'White, Non-Hispanic']
Length: 10000
Categories (8, object): ['American Indian/Alaska Native, Non-Hispanic', 'Asian, Non-Hispanic', 'Black, Non-Hispanic', 'Hispanic/Latino', 'Multiple/Other, Non-Hispanic', 'Native Hawaiian/Other Pacific Islander, Non-H..., 'Unknown', 'White, Non-Hispanic']

**Med_condition**

In [37]:
df_raw['medcond_yn'].isnull().sum()

0

In [38]:
(df_raw['medcond_yn'] == 'Missing').sum()

7431

In [39]:
df_raw['medcond_yn'] = df_raw['medcond_yn'].replace({'Missing' : 'Unknown'})

In [40]:
df_raw['medcond_yn'].values

['Yes', 'No', 'Unknown', 'Unknown', 'Yes', ..., 'No', 'Yes', 'Unknown', 'Unknown', 'Unknown']
Length: 10000
Categories (3, object): ['No', 'Unknown', 'Yes']

**HOSP_YN**

In [41]:
df_raw['hosp_yn'].isnull().sum()

0

In [42]:
(df_raw['hosp_yn'] == 'Missing').sum()

2333

In [43]:
df_raw['hosp_yn'] = df_raw['hosp_yn'].replace({'Missing' : 'Unknown'})

In [44]:
df_raw['medcond_yn'].values

['Yes', 'No', 'Unknown', 'Unknown', 'Yes', ..., 'No', 'Yes', 'Unknown', 'Unknown', 'Unknown']
Length: 10000
Categories (3, object): ['No', 'Unknown', 'Yes']

**AGE_GROUP**

In [45]:
(df_raw['age_group'] == 'Missing').sum()

11

In [46]:
(df_raw['age_group'] == 'Unknown').sum()

0

In [47]:
#df_raw['age_group'] = (df_raw['age_group'] == 'Missing').drop
df_raw = df_raw.drop(df_raw[df_raw["age_group"] == 'Missing'].index)

In [48]:
(df_raw['age_group'] == 'Missing').sum()

0

**SEX**

In [49]:
(df_raw['sex'] == 'Missing').sum()

15

In [50]:
#df_raw['sex'] = (df_raw['sex'] == 'Missing').drop
df_raw = df_raw.drop(df_raw[df_raw["sex"] == 'Missing'].index)

In [51]:
(df_raw['sex'] == 'Missing').sum()

0

In [52]:
df_raw

Unnamed: 0,cdc_case_earliest_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020-12-30,Laboratory-confirmed case,Male,40 - 49 Years,"White, Non-Hispanic",No,No,No,Yes
1,2020-12-21,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,No
2,2020-11-20,Probable Case,Female,30 - 39 Years,Unknown,No,Unknown,No,Unknown
3,2020-12-17,Laboratory-confirmed case,Male,10 - 19 Years,Unknown,Unknown,Unknown,No,Unknown
4,2020-04-24,Laboratory-confirmed case,Female,50 - 59 Years,"White, Non-Hispanic",No,Unknown,No,Yes
...,...,...,...,...,...,...,...,...,...
9995,2020-11-01,Laboratory-confirmed case,Male,30 - 39 Years,"White, Non-Hispanic",No,Unknown,No,No
9996,2020-10-04,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,Yes
9997,2020-05-26,Laboratory-confirmed case,Male,30 - 39 Years,Unknown,Unknown,Unknown,No,Unknown
9998,2020-08-28,Probable Case,Female,30 - 39 Years,"Black, Non-Hispanic",No,Unknown,No,Unknown


In [53]:
(df_raw['sex'] == 'Missing').sum()

0

In [54]:
df_raw.isnull().sum()

cdc_case_earliest_dt       0
current_status             0
sex                        0
age_group                  0
race_ethnicity_combined    0
hosp_yn                    0
icu_yn                     0
death_yn                   0
medcond_yn                 0
dtype: int64

In [55]:
df_raw

Unnamed: 0,cdc_case_earliest_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020-12-30,Laboratory-confirmed case,Male,40 - 49 Years,"White, Non-Hispanic",No,No,No,Yes
1,2020-12-21,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,No
2,2020-11-20,Probable Case,Female,30 - 39 Years,Unknown,No,Unknown,No,Unknown
3,2020-12-17,Laboratory-confirmed case,Male,10 - 19 Years,Unknown,Unknown,Unknown,No,Unknown
4,2020-04-24,Laboratory-confirmed case,Female,50 - 59 Years,"White, Non-Hispanic",No,Unknown,No,Yes
...,...,...,...,...,...,...,...,...,...
9995,2020-11-01,Laboratory-confirmed case,Male,30 - 39 Years,"White, Non-Hispanic",No,Unknown,No,No
9996,2020-10-04,Laboratory-confirmed case,Male,60 - 69 Years,"White, Non-Hispanic",No,No,No,Yes
9997,2020-05-26,Laboratory-confirmed case,Male,30 - 39 Years,Unknown,Unknown,Unknown,No,Unknown
9998,2020-08-28,Probable Case,Female,30 - 39 Years,"Black, Non-Hispanic",No,Unknown,No,Unknown


In [56]:
# Write the cleaned dataframe to a csv file
df_raw.to_csv('Homework_dataset_cleaned.csv', index=False)