# First Name: Anh Duc
# Last Name: Dang

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the NESARC data set from the working directory
nesarc = pd.read_csv('nesarc.csv', low_memory=False)


def _fixed_point(value):
    """Format a float in fixed-point notation (six decimals)."""
    return '%f' % value


# Show floats as plain decimals rather than in scientific notation
pd.set_option('display.float_format', _fixed_point)

# Coding valid data

In [None]:
# Convert smoking status to numeric; values that cannot be parsed become NaN
smoking_status = pd.to_numeric(nesarc['CHECK321'], errors='coerce')
nesarc['CHECK321'] = smoking_status
nesarc['CHECK321'].head(n=25)  # show the first 25 values

In [None]:
# Code missing smoking status (NaN) as its own category, 11.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is a chained
# assignment, which pandas deprecates and which no longer updates the parent
# DataFrame once copy-on-write is enabled.
nesarc['CHECK321'] = nesarc['CHECK321'].fillna(11)
nesarc['CHECK321'].head(25)  # show the first 25 values

# Managing missing data

In [None]:
# Convert the smoking frequency variable to numeric; values that cannot be
# parsed become NaN
nesarc['S3AQ3B1'] = pd.to_numeric(nesarc['S3AQ3B1'], errors='coerce')
# S3AQ3C1 is later compared with the numeric code 99 and multiplied by
# USFREQMO, so it needs the same conversion.
# NOTE(review): assumes S3AQ3C1 may be read from the CSV as text, like the
# columns converted above; the call is a no-op if it is already numeric.
nesarc['S3AQ3C1'] = pd.to_numeric(nesarc['S3AQ3C1'], errors='coerce')
nesarc['S3AQ3B1'].head()

In [None]:
# Subset data to young adults age 18 to 25 who have smoked in the past 12 months
is_young_adult = (nesarc['AGE'] >= 18) & (nesarc['AGE'] <= 25)
is_smoker = nesarc['CHECK321'] == 1
sub1 = nesarc[is_young_adult & is_smoker]
sub1.head()

In [None]:
# Work on an independent copy so that later column assignments do not touch sub1
sub2 = sub1.copy(deep=True)
sub2.head()
sub2.shape[0]  # number of rows in the subset

In [None]:
# Frequency table for the raw S3AQ3B1 values, with NaN counted as a category
c_cig_feq = sub2['S3AQ3B1'].value_counts(dropna=False, sort=False)
print('counts for original S3AQ3B1', c_cig_feq, sep='\n')

In [None]:
# Treat the code 9 in S3AQ3B1 as missing
sub2['S3AQ3B1'] = sub2['S3AQ3B1'].replace({9: np.nan})

In [None]:
# Frequency table for S3AQ3B1 after 9 was set to NaN.
# dropna=False keeps the NaN row so the number of missing values is shown,
# as the printed label states.
c_cig_feq_nan = sub2['S3AQ3B1'].value_counts(sort=False, dropna=False)
print('counts for S3AQ3B1 with 9 set to NAN and number of missing requested')
print(c_cig_feq_nan)

In [None]:
# Frequency table for the raw S3AQ3C1 values, with NaN counted as a category
c_cig_quan = sub2['S3AQ3C1'].value_counts(dropna=False, sort=False)
print('counts for S3AQ3C1', c_cig_quan, sep='\n')

In [None]:
# Treat the code 99 in S3AQ3C1 as missing, then count the remaining values
sub2['S3AQ3C1'] = sub2['S3AQ3C1'].replace({99: np.nan})
c_cig_quan_nan = sub2['S3AQ3C1'].value_counts(sort=False)
print('counts for S3AQ3C1 with 99 set to NAN', c_cig_quan_nan, sep='\n')

# Recoding values

In [None]:
# Count of respondents in each S3AQ3B1 category (NaN excluded)
c_cig_feq = sub2['S3AQ3B1'].value_counts(sort=False)
print('counts for S3AQ3B1', c_cig_feq, sep='\n')

In [None]:
# Reverse the 1-6 coding (1 -> 6, 2 -> 5, ..., 6 -> 1) so that higher numbers
# mean more smoking frequency
recode1 = {code: 7 - code for code in range(1, 7)}
sub2['USFREQ'] = sub2['S3AQ3B1'].map(recode1)

# Count of respondents in each recoded category
recode_cig_feq = sub2['USFREQ'].value_counts(sort=False)
print('counts for USFREQ', recode_cig_feq, sep='\n')

In [None]:
# Recode S3AQ3B1 into a new quantitative variable, USFREQMO:
# categories 1-6 become 30, 22, 14, 5, 2.5 and 1 respectively
recode2 = dict(zip(range(1, 7), (30, 22, 14, 5, 2.5, 1)))
sub2['USFREQMO'] = sub2['S3AQ3B1'].map(recode2)

# Count of respondents at each USFREQMO value
recode_cig_feq_m = sub2['USFREQMO'].value_counts(sort=False)
print('counts for USFREQMO', recode_cig_feq_m, sep='\n')

# Creating secondary variable

In [None]:
# Secondary variable: estimated cigarettes smoked per month, obtained by
# multiplying days smoked per month by the approximate cigarettes smoked per day
days_smoked_per_month = sub2['USFREQMO']
cigs_per_day = sub2['S3AQ3C1']
sub2['NUMCIGMO_EST'] = days_smoked_per_month * cigs_per_day
sub2['NUMCIGMO_EST'].head()

# Grouping values within individual variables

In [None]:
# Frequency distribution of AGE in the subset
c_age = sub2['AGE'].value_counts(sort=False)
print('counts for AGE', c_age, sep='\n')

In [None]:
# Same distribution expressed as proportions (normalize=True)
p_age = sub2['AGE'].value_counts(normalize=True, sort=False)
print('percentages for AGE', p_age, sep='\n')

In [None]:
# Categorize the quantitative AGE variable into 3 custom groups with pd.cut.
# Each bin excludes its left edge and includes its right edge, so the first
# edge is 17 to make age 18 fall into the first group:
# (17, 20] -> 18-20, (20, 22] -> 21-22, (22, 25] -> 23-25
age_bin_edges = [17, 20, 22, 25]
sub2['AGEGROUP3'] = pd.cut(sub2['AGE'], age_bin_edges)

c_age_group = sub2['AGEGROUP3'].value_counts(dropna=True, sort=False)
print('counts for AGEGROUP3', c_age_group, sep='\n')

p_age_group = sub2['AGEGROUP3'].value_counts(normalize=True, sort=False)
print('percentages for AGEGROUP3', p_age_group, sep='\n')

In [None]:
# Cross-tabulate AGEGROUP3 against AGE to check which ages landed in which group
age_group_by_age = pd.crosstab(sub2['AGEGROUP3'], sub2['AGE'])
print(age_group_by_age)