# Import Libraries

In [3]:
import pandas
import numpy
import scipy.stats
import seaborn
import matplotlib.pyplot as plt

# Load datas

We are gonna check if SEX, S1Q1E, S1Q9A, S1Q238, S1Q239 are significant on CONSUMER DRINKING STATUS

In [10]:
data = pandas.read_csv('nesarc.csv', low_memory=False)
data = data.loc[data.index,["S2BQ1A9D", "S2AQ22", "AGE", "S2AQ12E", "S2AQ12C", "S2AQ8A", "S2AQ7I"]]
data = data[(data['AGE']>=18) & (data['AGE']<=25)]

S2AQ7I : WHERE USUALLY DRANK LIQUOR IN LAST 12 MONTHS
1. In own home
2. In homes of friends or relatives
3. In public places

S2AQ8A : HOW OFTEN DRANK ANY ALCOHOL IN LAST 12 MONTHS
1. Every day
2. Nearly every day
3. 3 to 4 times a week
4. 2 times a week
5. Once a week
6. 2 to 3 times a month
7. Once a month
8. 7 to 11 times in the last year
9. 3 to 6 times in the last year
10. 1 or 2 times in the last year
99. Unknown

S2AQ8A : HOW OFTEN DRANK ANY ALCOHOL IN LAST 12 MONTHS
1. Every day & Nearly every day
2. Each week 
3. Each month
4. Less than each month


In [11]:
data['S2BQ1A9D'] = pandas.to_numeric(data['S2BQ1A9D'], errors='coerce')
data['S2AQ22'] = pandas.to_numeric(data['S2AQ22'], errors='coerce')
data['AGE'] = pandas.to_numeric(data['AGE'], errors='coerce')
data['S2AQ12E'] = pandas.to_numeric(data['S2AQ12E'], errors='coerce')
data['S2AQ12C'] = pandas.to_numeric(data['S2AQ12C'], errors='coerce')
data['S2AQ8A'] = pandas.to_numeric(data['S2AQ8A'], errors='coerce')
data['S2AQ7I'] = pandas.to_numeric(data['S2AQ7I'], errors='coerce')

data['S2AQ7I'] = data['S2AQ7I'].replace(9, numpy.nan)
data['S2AQ7I'] = data['S2AQ7I'].replace(2, numpy.nan)
data["S2AQ8A"] = data['S2AQ8A'].replace(99.0, numpy.nan)
data["S2AQ8A"] = data['S2AQ8A'].replace(2, 1)
data["S2AQ8A"] = data['S2AQ8A'].replace(3, 2)
data["S2AQ8A"] = data['S2AQ8A'].replace(4, 2)
data["S2AQ8A"] = data['S2AQ8A'].replace(5, 2)
data["S2AQ8A"] = data['S2AQ8A'].replace(6, 3)
data["S2AQ8A"] = data['S2AQ8A'].replace(7, 3)
data["S2AQ8A"] = data['S2AQ8A'].replace(8, 4)
data["S2AQ8A"] = data['S2AQ8A'].replace(9, 4)
data["S2AQ8A"] = data['S2AQ8A'].replace(10, 4)

data = data.dropna()

data.head()

Unnamed: 0,S2BQ1A9D,S2AQ22,AGE,S2AQ12E,S2AQ12C,S2AQ8A,S2AQ7I
30,2.0,8.0,20,11.0,11.0,3.0,1.0
76,2.0,9.0,21,11.0,11.0,4.0,1.0
102,1.0,5.0,24,11.0,10.0,2.0,3.0
104,2.0,11.0,21,11.0,11.0,2.0,3.0
120,1.0,4.0,21,10.0,9.0,3.0,3.0


In [12]:
ct1=pandas.crosstab(data['S2AQ7I'], data['S2AQ8A'])
print (ct1)

# chi-square
print ('chi-square value, p value, expected counts')
cs1= scipy.stats.chi2_contingency(ct1)
print (cs1)

S2AQ8A  1.0  2.0  3.0  4.0
S2AQ7I                    
1.0      60  213  133   97
3.0      79  456  287  194
chi-square value, p value, expected counts
(7.235465269774268, 0.06475959854288472, 3, array([[ 46.0283081 , 221.5319289 , 139.07834101,  96.36142199],
       [ 92.9716919 , 447.4680711 , 280.92165899, 194.63857801]]))


The p-value is 0.06475959854288472 we can accept that there is a relationship between the place we drink and the frequency

In [13]:
recode2 = {1: 1, 2: 2}
sub1 = data
sub1['COMP1v2']= sub1['S2AQ8A'].map(recode2)

# contingency table of observed counts
ct2=pandas.crosstab(sub1['S2AQ7I'], sub1['COMP1v2'])
print (ct2)

# column percentages
colsum=ct2.sum(axis=0)
colpct=ct2/colsum
print(colpct)

print ('chi-square value, p value, expected counts')
cs2= scipy.stats.chi2_contingency(ct2)
print (cs2)

COMP1v2  1.0  2.0
S2AQ7I           
1.0       60  213
3.0       79  456
COMP1v2       1.0       2.0
S2AQ7I                     
1.0      0.431655  0.318386
3.0      0.568345  0.681614
chi-square value, p value, expected counts
(6.103615341536044, 0.01349056081380569, 1, array([[ 46.96410891, 226.03589109],
       [ 92.03589109, 442.96410891]]))


In [14]:
recode3 = {1: 1, 3: 3}
sub2 = data
sub2['COMP1v3']= sub2['S2AQ8A'].map(recode3)

# contingency table of observed counts
ct3=pandas.crosstab(sub2['S2AQ7I'], sub2['COMP1v3'])
print (ct3)

# column percentages
colsum=ct3.sum(axis=0)
colpct=ct3/colsum
print(colpct)

print ('chi-square value, p value, expected counts')
cs3= scipy.stats.chi2_contingency(ct3)
print (cs3)

COMP1v3  1.0  3.0
S2AQ7I           
1.0       60  133
3.0       79  287
COMP1v3       1.0       3.0
S2AQ7I                     
1.0      0.431655  0.316667
3.0      0.568345  0.683333
chi-square value, p value, expected counts
(5.61053057488484, 0.017852856557252925, 1, array([[ 47.99105546, 145.00894454],
       [ 91.00894454, 274.99105546]]))


In [15]:
recode4 = {1: 1, 4: 4}
sub3 = data
sub3['COMP1v4']= sub3['S2AQ8A'].map(recode4)

# contingency table of observed counts
ct4=pandas.crosstab(sub3['S2AQ7I'], sub3['COMP1v4'])
print (ct4)

# column percentages
colsum=ct4.sum(axis=0)
colpct=ct4/colsum
print(colpct)

print ('chi-square value, p value, expected counts')
cs4= scipy.stats.chi2_contingency(ct4)
print (cs4)

COMP1v4  1.0  4.0
S2AQ7I           
1.0       60   97
3.0       79  194
COMP1v4       1.0       4.0
S2AQ7I                     
1.0      0.431655  0.333333
3.0      0.568345  0.666667
chi-square value, p value, expected counts
(3.5102330369737156, 0.06099087608635307, 1, array([[ 50.75116279, 106.24883721],
       [ 88.24883721, 184.75116279]]))


In [16]:
recode5 = {2: 2, 3: 3}
sub4 = data
sub4['COMP2v3']= sub4['S2AQ8A'].map(recode5)

# contingency table of observed counts
ct5=pandas.crosstab(sub4['S2AQ7I'], sub4['COMP2v3'])
print (ct5)

# column percentages
colsum=ct5.sum(axis=0)
colpct=ct5/colsum
print(colpct)

print ('chi-square value, p value, expected counts')
cs5= scipy.stats.chi2_contingency(ct5)
print (cs5)

COMP2v3  2.0  3.0
S2AQ7I           
1.0      213  133
3.0      456  287
COMP2v3       2.0       3.0
S2AQ7I                     
1.0      0.318386  0.316667
3.0      0.681614  0.683333
chi-square value, p value, expected counts
(5.7021388787849715e-05, 0.9939750308230958, 1, array([[212.55647383, 133.44352617],
       [456.44352617, 286.55647383]]))


In [17]:
recode6 = {2: 2, 4: 4}
sub5 = data
sub5['COMP2v4']= sub5['S2AQ8A'].map(recode6)

# contingency table of observed counts
ct6=pandas.crosstab(sub5['S2AQ7I'], sub4['COMP2v4'])
print (ct6)

# column percentages
colsum=ct6.sum(axis=0)
colpct=ct6/colsum
print(colpct)

print ('chi-square value, p value, expected counts')
cs6= scipy.stats.chi2_contingency(ct6)
print (cs6)

COMP2v4  2.0  4.0
S2AQ7I           
1.0      213   97
3.0      456  194
COMP2v4       2.0       4.0
S2AQ7I                     
1.0      0.318386  0.333333
3.0      0.681614  0.666667
chi-square value, p value, expected counts
(0.14450724554055944, 0.7038406884581554, 1, array([[216.03125,  93.96875],
       [452.96875, 197.03125]]))


In [18]:
recode7 = {3: 3, 4: 4}
sub6 = data
sub6['COMP3v4']= sub6['S2AQ8A'].map(recode7)

# contingency table of observed counts
ct7=pandas.crosstab(sub6['S2AQ7I'], sub6['COMP3v4'])
print (ct7)

# column percentages
colsum=ct7.sum(axis=0)
colpct=ct7/colsum
print(colpct)

print ('chi-square value, p value, expected counts')
cs7= scipy.stats.chi2_contingency(ct7)
print (cs7)

COMP3v4  3.0  4.0
S2AQ7I           
1.0      133   97
3.0      287  194
COMP3v4       3.0       4.0
S2AQ7I                     
1.0      0.316667  0.333333
3.0      0.683333  0.666667
chi-square value, p value, expected counts
(0.14867849361742225, 0.6998014493089915, 1, array([[135.8649789,  94.1350211],
       [284.1350211, 196.8649789]]))


In [24]:
out = pandas.DataFrame(numpy.array([["X","","",""], [0.01349056081380569/4,"X","",""],[0.017852856557252925/4, 0.9939750308230958/4, "X","" ], [0.06099087608635307/4, 0.7038406884581554/4, 0.6998014493089915/4, "X"]]), columns=['1', '2', '3', '4'], index=['1', '2', '3', '4'])

In [25]:
out

Unnamed: 0,1,2,3,4
1,X,,,
2,0.0033726402034514225,X,,
3,0.004463214139313231,0.24849375770577395,X,
4,0.015247719021588267,0.17596017211453885,0.17495036232724787,X


Then we can compare between coupleThe place we chose to drink lequor depend on our frequence of drinking 