# First Name: Christopher
# Last Name: Caferra

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
nesarc = pd.read_csv('nesarc.csv', low_memory=False)
pd.set_option('display.float_format', lambda x:'%f'%x)

# From Prac 1
# Columns/Data used in Prac 1

In [3]:
nesarc['S2AQ5B'] = pd.to_numeric(nesarc['S2AQ5B'], errors='coerce') #convert variable to numeric
nesarc['S2AQ5D'] = pd.to_numeric(nesarc['S2AQ5D'], errors='coerce') #convert variable to numeric
nesarc['S2AQ5A'] = pd.to_numeric(nesarc['S2AQ5A'], errors='coerce') #convert variable to numeric
nesarc['S2BQ1B1'] = pd.to_numeric(nesarc['S2BQ1B1'], errors='coerce') #convert variable to numeric
nesarc['AGE'] = pd.to_numeric(nesarc['AGE'], errors='coerce') #convert variable to numeric

# From Prac 2
# A subset of nesarc data, with the following criteria
# Age from 26 to 50
# Beer drinking status - S2AQ5A = Y

In [4]:
sub1=nesarc[(nesarc['AGE']>=26) & (nesarc['AGE']<=50) & (nesarc['S2AQ5A']==1)]
sub2=sub1.copy()

# From Prac 2
# SETTING MISSING DATA

In [None]:
sub2['S2AQ5D']=sub2['S2AQ5D'].replace(99, np.nan)

sub2['S2AQ5B']=sub2['S2AQ5B'].replace(8, np.nan)
sub2['S2AQ5B']=sub2['S2AQ5B'].replace(9, np.nan)
sub2['S2AQ5B']=sub2['S2AQ5B'].replace(10, np.nan)
sub2['S2AQ5B']=sub2['S2AQ5B'].replace(99, np.nan)

sub2['S2BQ1B1']=sub2['S2BQ1B1'].replace(9, np.nan)

# From Prac 2
# Recode data

In [5]:
recode2 = {1:30, 2:26, 3:14, 4:8, 5:4, 6:2.5, 7:1}
sub2['BEER_FEQMO']= sub2['S2AQ5B'].map(recode2)

recode3 = {2:0, 1:1}
sub2['S2BQ1B1']= sub2['S2BQ1B1'].map(recode3)

# From Prac 2
# Create secondary variables

In [6]:
# A secondary variable multiplying the number of days beer consumed/month and the approx number of 
# beer consumed/day
sub2['NUMBEERMO_EST']=sub2['BEER_FEQMO'] * sub2['S2AQ5D']

# Draw a Line chart
# Age vs Number of beer consumed per month (NUMBEERMO_EST)

# a) mean number of beer consumed
# var = mean number of beers consumed a month, grouped by age

In [7]:
var = sub2.groupby(['AGE']).NUMBEERMO_EST.mean()
print(var)

AGE
26   23.701357
27   23.854545
28   30.035270
29   23.994949
30   25.397690
31   25.763636
32   25.678994
33   24.761017
34   24.256716
35   28.668478
36   26.813272
37   29.542700
38   26.414773
39   28.397929
40   35.407932
41   23.233788
42   32.082317
43   38.296167
44   37.279762
45   38.409247
46   31.727799
47   33.473469
48   33.655303
49   28.177778
50   28.995614
Name: NUMBEERMO_EST, dtype: float64


In [8]:
%matplotlib notebook
#code for line chart
var.plot(kind='line')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a193e2710>

# b) total number of beer consumed
# var2 = sum number of beers consumed a month, grouped by age

In [9]:
var2 = sub2.groupby(['AGE']).NUMBEERMO_EST.sum()
print(var2)

AGE
26    5238.000000
27    6560.000000
28    7238.500000
29    7126.500000
30    7695.500000
31    8502.000000
32    8679.500000
33    7304.500000
34    8126.000000
35    7912.500000
36    8687.500000
37   10724.000000
38    9298.000000
39    9598.500000
40   12499.000000
41    6807.500000
42   10523.000000
43   10991.000000
44    9394.500000
45   11215.500000
46    8217.500000
47    8201.000000
48    8885.000000
49    6340.000000
50    6611.000000
Name: NUMBEERMO_EST, dtype: float64


In [10]:
fig = plt.figure()
# code for line chart
var2.plot(kind='line')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a39bd87f0>

# Draw a stacked Column Chart
# x = age (AGE)
# y = number of beers consumed per month (NUMBEERMO_EST)
# stack is based on depedency on beer (S2BQ1B1)

# var3 = mean number of beers consumed a month, grouped by age and beer depedency (S2BQ1B1)

In [11]:
var3 = sub2.groupby(['AGE', 'S2BQ1B1']).NUMBEERMO_EST.mean()
print(var3)

AGE  S2BQ1B1 
26   0.000000    21.449239
     1.000000    49.947368
27   0.000000    23.809524
     1.000000    24.347826
28   0.000000    26.021127
     1.000000    67.460000
29   0.000000    20.869650
     1.000000    44.078947
30   0.000000    22.882671
     1.000000    54.086957
31   0.000000    23.728013
     1.000000    55.113636
32   0.000000    23.871753
     1.000000    47.722222
33   0.000000    23.255556
     1.000000    45.075000
34   0.000000    22.948052
     1.000000    47.250000
35   0.000000    28.266537
     1.000000    32.375000
36   0.000000    24.372881
     1.000000    56.800000
37   0.000000    24.361194
     1.000000   101.240000
38   0.000000    24.274390
     1.000000    61.619048
39   0.000000    26.789308
     1.000000    41.718750
40   0.000000    34.700608
     1.000000    46.477273
41   0.000000    21.989091
     1.000000    44.441176
42   0.000000    30.786645
     1.000000    58.029412
43   0.000000    37.151852
     1.000000    49.642857
44   0.000000 

In [12]:
# code for vertical stack chart
var3.unstack().plot(kind='bar', stacked=True,  color=['red','blue'], grid=False)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a39b57b70>

# Draw a horizontal stacked Column Chart
# x = age (AGE)
# y = number of beers consumed per month (NUMBEERMO_EST)
# stack is based on depedency on  beer (S2BQ1B1)

In [13]:
# code for horizontal stack chart
var3.unstack().plot(kind='barh', stacked=True,  color=['red','blue'], grid=False)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1069419e8>

# Draw a Pie Chart showing age (AGE) and total beer consumed a month (NUMBEERMO_EST)
# hint use var2

In [14]:
print(var2)

AGE
26    5238.000000
27    6560.000000
28    7238.500000
29    7126.500000
30    7695.500000
31    8502.000000
32    8679.500000
33    7304.500000
34    8126.000000
35    7912.500000
36    8687.500000
37   10724.000000
38    9298.000000
39    9598.500000
40   12499.000000
41    6807.500000
42   10523.000000
43   10991.000000
44    9394.500000
45   11215.500000
46    8217.500000
47    8201.000000
48    8885.000000
49    6340.000000
50    6611.000000
Name: NUMBEERMO_EST, dtype: float64


In [15]:
fig = plt.figure()
# code for pie chart
var2.plot(kind='pie',autopct='%.2f')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a1c2e3ef0>

# Draw a Violin Plot for age (AGE) and income (S1Q10A)

# convert income (S1Q10A) to numeric

In [17]:
sub2['S1Q10A'] = pd.to_numeric(nesarc['S1Q10A'])
#convert variable to numeric
sub2['S1Q10A'].head()

1      11000
8      65000
12     24000
16    130000
24      6500
Name: S1Q10A, dtype: int64

# Plot violin plot

In [18]:
fig = plt.figure()
#code for violin chart
sns.violinplot(x='AGE', y='S1Q10A', data=sub2)
plt.xlabel('Age')
plt.ylabel('Income')

<IPython.core.display.Javascript object>

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


Text(0, 0.5, 'Income')

# Draw a HeatMap for Ethnicity and Carton of Beer consumed per month, based on depedency on beer

# Rename Race - From Module 4

In [19]:
# you can rename categorical variable values for graphing if original values are not informative 
# first change the variable format to categorical if you haven’t already done so
sub2['ETHRACE2A'] = sub2['ETHRACE2A'].astype('category')

sub2['ETHRACE2A']=sub2['ETHRACE2A'].cat.rename_categories(["White", "Black", "NatAm", "Asian", "Hispanic"])

# Create a new variable CARTON_ADAY using CARTON_ADAY function provided

In [21]:
def CARTON_ADAY (row):
   if row['BEER_FEQMO'] >= 30 :
      return 1
   elif row['BEER_FEQMO'] < 30 :
      return 0
    
sub2['CARTON_ADAY'] = sub2.apply (lambda row: CARTON_ADAY (row),axis=1)

# Print the size of CARTON_ADAY, grouped by category

In [22]:
c4= sub2.groupby('CARTON_ADAY').size()
print(c4)

CARTON_ADAY
0.000000    6897
1.000000     417
dtype: int64


# Draw bar chart to show relationship between race (ETHRACE2A) and CARTON_ADAY

In [23]:
# bivariate bar graph C->C
%matplotlib notebook
#code for bar chart
sns.factorplot(x='ETHRACE2A', y='CARTON_ADAY', data=sub2, kind="bar", ci=None)
plt.xlabel('Ethnic Group')
plt.ylabel('Proportion of consumed a carton a day Beer Drinkers')



<IPython.core.display.Javascript object>

Text(0.6944444444444446, 0.5, 'Proportion of consumed a carton a day Beer Drinkers')

# Make copy of just race (ETHRACE2A) and CARTON_ADAY

In [24]:
sub3 = sub2[['ETHRACE2A','CARTON_ADAY']].copy()
sub3.head()

Unnamed: 0,ETHRACE2A,CARTON_ADAY
1,Hispanic,
8,White,
12,Asian,0.0
16,White,
24,Hispanic,


# Create pivot table of race (ETHRACE2A) and CARTON_ADAY

In [25]:
table = table = pd.pivot_table(sub3, index=['ETHRACE2A'], columns=['_ADAY'], aggfunc=np.size)
print(table)

CARTON_ADAY  0.000000  1.000000
ETHRACE2A                      
White            8312       456
Black            1972       234
NatAm             222        14
Asian             374        16
Hispanic         2914       114


# Draw heat map

In [26]:
fig = plt.figure()
sns.heatmap(table)
# code for heat map

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a23b87780>

# Draw a bubble Chart

# Read in gapminder.csv

In [27]:
pd.set_option('display.float_format', lambda x:'%.2f'%x)

gapminder = pd.read_csv('gapminder.csv', low_memory=False)
gapminder.head()

Unnamed: 0,country,incomeperperson,alcconsumption,armedforcesrate,breastcancerper100th,co2emissions,femaleemployrate,hivrate,internetuserate,lifeexpectancy,oilperperson,polityscore,relectricperperson,suicideper100th,employrate,urbanrate
0,Afghanistan,,0.03,0.5696534,26.8,75944000.0,25.6000003814697,,3.65412162280064,48.673,,0.0,,6.68438529968262,55.7000007629394,24.04
1,Albania,1914.99655094922,7.29,1.0247361,57.4,223747333.333333,42.0999984741211,,44.9899469578783,76.918,,9.0,636.341383366604,7.69932985305786,51.4000015258789,46.72
2,Algeria,2231.99333515006,0.69,2.306817,23.5,2932108666.66667,31.7000007629394,0.1,12.5000733055148,73.131,0.42009452521537,2.0,590.509814347428,4.8487696647644,50.5,65.22
3,Andorra,21943.3398976022,10.17,,,,,,81.0,,,,,5.36217880249023,,88.92
4,Angola,1381.00426770244,5.57,1.4613288,23.1,248358000.0,69.4000015258789,2.0,9.99995388324075,51.093,,-2.0,172.999227388199,14.5546770095825,75.6999969482422,56.7


# Convert internetuserate, urbanrate and incomeperperson to numeric 

In [28]:
gapminder['internetuserate'] = pd.to_numeric(gapminder['internetuserate'],errors='coerce')
gapminder['urbanrate'] = pd.to_numeric(gapminder['urbanrate'],errors='coerce')
gapminder['incomeperperson'] = pd.to_numeric(gapminder['incomeperperson'],errors='coerce')

In [29]:
gapminder_clean=gapminder.dropna()

# Draw a bubble Chart
# x = urbanrate
# y = income per person
# bubble size = internetuserate

In [31]:
%matplotlib notebook
fig = plt.figure()
#bubble plot code
plt.scatter(gapminder_clean['incomeperperson'],gapminder_clean['urbanrate'], s=gapminder_clean['internetuserate'])
plt.xlabel('Ubran Rate')
plt.ylabel('Income Per Person')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Income Per Person')