In [15]:
import pandas as pd
import numpy as np
import seaborn as sns

# Load the raw survey export.  Column 0 is the submission timestamp, so parse it as a datetime.
df = pd.read_csv('COVID-19 Survey.csv', parse_dates=[0])

# DataFrame.info() prints its report itself and returns None, so call it directly;
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113 entries, 0 to 112
Data columns (total 29 columns):
Timestamp                                                                                                                                                                                                                                                              113 non-null datetime64[ns, pytz.FixedOffset(-540)]
What is your gender?                                                                                                                                                                                                                                                   112 non-null object
What is your age?                                                                                                                                                                                                                                                      104 non-null float64
Current count

In [16]:
# The base dataframe is loaded, but the full question text makes for unwieldy column names.
# Swap the column labels for plain numbers and keep the question text in a lookup
# dictionary so any question can be recalled by its number later.

#    Keep the original question text, in column order
questions = list(df.columns)

#    One number per column, counting from 1
colnum = np.arange(len(questions)) + 1

#    Lookup table: question number -> question text
question_dict = {number: text for number, text in zip(colnum, questions)}

#    Relabel the columns with their numbers
df.columns = colnum

# The frame is now much more manageable for analysis!

In [17]:
# Let's address missing data next.
# (DataFrame.info() prints its own report and returns None, so it is called directly;
# wrapping it in print() only adds a stray "None" line to the output.)
df.info()

# Before deciding whether to drop NaNs or take some other approach, we need to look at question 29.
# Question 29 was a feedback question and not directly part of the survey.
# I left the responses in my raw data in case anyone was interested in seeing them.
# Some of the feedback was constructive criticism on ways I could have improved the survey,
# so I thought it was worth keeping for anyone else who wanted to check it out.

# We could just drop question 29, but for curiosity's sake, I want to check whether there was any
# trend in the people who left feedback vs. those who didn't.  So let's change it to a boolean instead.

#    For now, turn 29 into a boolean flag: True if the respondent left feedback, False otherwise.
#    The dtype guard keeps this cell safe to re-run: once the column is boolean it has no NaNs,
#    so converting it a second time would mark every row True.
if df[29].dtype != bool:
    df[29] = df[29].notnull()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113 entries, 0 to 112
Data columns (total 29 columns):
1     113 non-null datetime64[ns, pytz.FixedOffset(-540)]
2     112 non-null object
3     104 non-null float64
4     111 non-null object
5     112 non-null float64
6     112 non-null float64
7     112 non-null float64
8     110 non-null object
9     111 non-null object
10    110 non-null object
11    111 non-null object
12    112 non-null object
13    112 non-null object
14    111 non-null object
15    108 non-null object
16    111 non-null object
17    112 non-null object
18    112 non-null object
19    112 non-null object
20    112 non-null object
21    111 non-null object
22    110 non-null object
23    109 non-null object
24    112 non-null object
25    111 non-null float64
26    108 non-null object
27    107 non-null object
28    109 non-null object
29    20 non-null object
dtypes: datetime64[ns, pytz.FixedOffset(-540)](1), float64(5), object(23)
memory usage: 25.7+ KB
None
<cl

In [26]:
# OK, now our lowest single column for non-null is 104.  That's not bad, given that I made every
# question optional and we have 113 total responses.  But how much do the gaps overlap?
# If we drop every row containing a NaN, will that shrink the data a lot?

dropped = df.dropna()
# info() prints its own report and returns None; calling it directly avoids a trailing "None".
dropped.info()

# This takes us down to 92 responses with every column filled out.  That's more of a drop than I'd like.
# Let's examine ways to improve this number column by column first.

# Let's look at just the rows with missing data first to see what we are working with.
#  Make a boolean frame of missing values, then reduce it with .any(axis=1) to get a
#  row mask that is True wherever a row has at least one missing answer.
df_null = df.isnull()
null_index = df_null.any(axis=1)
print(df[null_index])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92 entries, 0 to 112
Data columns (total 29 columns):
1     92 non-null datetime64[ns, pytz.FixedOffset(-540)]
2     92 non-null object
3     92 non-null float64
4     92 non-null object
5     92 non-null float64
6     92 non-null float64
7     92 non-null float64
8     92 non-null object
9     92 non-null object
10    92 non-null object
11    92 non-null object
12    92 non-null object
13    92 non-null object
14    92 non-null object
15    92 non-null object
16    92 non-null object
17    92 non-null object
18    92 non-null object
19    92 non-null object
20    92 non-null object
21    92 non-null object
22    92 non-null object
23    92 non-null object
24    92 non-null object
25    92 non-null float64
26    92 non-null object
27    92 non-null object
28    92 non-null object
29    92 non-null bool
dtypes: bool(1), datetime64[ns, pytz.FixedOffset(-540)](1), float64(5), object(22)
memory usage: 20.9+ KB
None
                         

In [None]:
# Number 55 stands out right away: they didn't answer a single question, just submitted a blank survey.
# Let's drop 55 entirely.
# DataFrame.drop returns a new frame and leaves the original untouched, so the result has to be
# assigned back for the row to actually disappear from df.  errors='ignore' keeps the cell safe to
# re-run once row 55 is already gone (otherwise a second run raises KeyError).
df = df.drop(index=55, errors='ignore')

# That brings a lot of our columns up to 112/112 responses now!  We can focus on the few columns still missing data.
