# Summary

Our research question is "How have gun ownership practices and social attitudes towards gun control changed over the past 50 years?"

# Data

In [302]:
import pandas as pd
import numpy as np

# Read the selected GSS variables into a single frame.
df = pd.read_csv('selected_gss_data.csv', low_memory=False)

# Show the frame's dimensions and per-column dtypes, each followed by a blank line.
for overview in (df.shape, df.dtypes):
    print(overview, '\n')

# Preview the first rows as the cell's rich output.
df.head()

(72390, 5) 

year       int64
id         int64
gunlaw    object
owngun    object
hunt      object
dtype: object 



Unnamed: 0,year,id,gunlaw,owngun,hunt
0,1972,1,favor,,
1,1972,2,favor,,
2,1972,3,favor,,
3,1972,4,favor,,
4,1972,5,favor,,


### Verify year/id are clean

In [303]:
# Keep a handle on the survey-year column; the next cells reuse `year`.
year = df['year']
# Distinct survey years present in the data.
year.unique()

array([1972, 1973, 1974, 1975, 1976, 1977, 1978, 1980, 1982, 1983, 1984,
       1985, 1986, 1987, 1988, 1989, 1990, 1991, 1993, 1994, 1996, 1998,
       2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2021,
       2022])

In [304]:
# Number of rows (respondents) recorded for each survey year, largest sample first.
year.value_counts()

year
2006    4510
2021    4032
2022    3544
1994    2992
1996    2904
2016    2867
1998    2832
2000    2817
2004    2812
2002    2765
2014    2538
2018    2348
2010    2044
2008    2023
2012    1974
1982    1860
1987    1819
1972    1613
1993    1606
1983    1599
1989    1537
1985    1534
1978    1532
1977    1530
1991    1517
1973    1504
1976    1499
1975    1490
1974    1484
1988    1481
1984    1473
1986    1470
1980    1468
1990    1372
Name: count, dtype: int64

In [305]:
# Count missing survey years with the vectorized Series reduction instead of
# Python's built-in sum(), which walks the Series one element at a time.
print('Total missing: ', year.isnull().sum())

Total missing:  0


In [306]:
# NOTE: this name shadows Python's built-in id() from here on; it is kept
# because the following cells read `id`.
id = df['id']
# Distinct id values present in the data.
id.unique()

array([   1,    2,    3, ..., 4508, 4509, 4510])

In [307]:
# Number of rows that share each id value across all survey years.
# NOTE(review): ids run from 1 to 4510 and the largest single-year sample is
# also 4510, so id looks like a within-year sequence number rather than a
# persistent respondent identifier -- confirm against the codebook before
# reading these counts as "years a respondent participated".
id.value_counts()

id
1       34
727     34
711     34
712     34
713     34
        ..
4278     1
3671     1
3669     1
3665     1
4510     1
Name: count, Length: 4510, dtype: int64

In [308]:
# Count missing ids with the vectorized Series reduction instead of Python's
# built-in sum(), which walks the Series one element at a time.
print('Total missing: ', id.isnull().sum())

Total missing:  0


### Clean gunlaw

In [309]:
# Work on the gun-law opinion column by itself while inspecting it.
gunlaw = df['gunlaw']
# Tally the answered categories only; missing responses are counted in the next cell.
gunlaw.value_counts(dropna=True)

gunlaw
favor     36367
oppose    11940
Name: count, dtype: int64

From the codebook, the missing values represent "Don't Know", "No Answer", "Not Applicable", or "Skipped on Web"

In [310]:
# Count missing gunlaw responses with the vectorized Series reduction instead
# of Python's built-in sum(), which walks the Series one element at a time.
print('Total missing: ', gunlaw.isnull().sum())

Total missing:  24083


Make a subset of the dataframe for observations with valid responses to the gunlaw question

In [311]:
# The standalone Series was only needed for inspection; release the name.
del gunlaw
# Keep only the rows with an answer to the gun-law question.
# NOTE(review): dropna returns a new frame, so the owngun/hunt recoding applied
# to `df` in later cells is NOT reflected in df_gunlaw; it keeps the raw
# categories for those two columns.
df_gunlaw = df.dropna(axis=0, subset=['gunlaw'])
# Sanity check: no missing gunlaw values should remain (vectorized count
# rather than the built-in sum()).
print(df_gunlaw['gunlaw'].isnull().sum())
df_gunlaw.head(10)

0


Unnamed: 0,year,id,gunlaw,owngun,hunt
0,1972,1,favor,,
1,1972,2,favor,,
2,1972,3,favor,,
3,1972,4,favor,,
4,1972,5,favor,,
5,1972,6,oppose,,
6,1972,7,favor,,
7,1972,8,favor,,
8,1972,9,favor,,
9,1972,10,favor,,


### Clean owngun

In [312]:
# Work on the gun-ownership column by itself while inspecting and recoding it.
owngun = df['owngun']
# Tally the answered categories only; missing responses are counted in the next cell.
owngun.value_counts(dropna=True)

owngun
no             27551
yes            17800
refused_(3)      512
Name: count, dtype: int64

In [313]:
# Count missing owngun responses (before recoding) with the vectorized Series
# reduction instead of Python's built-in sum().
print('Total missing: ', owngun.isnull().sum())

Total missing:  26527


Replace "refused" responses with NaN, since those respondents chose not to answer

In [314]:
# Blank out explicit refusals so they are treated like any other missing answer.
is_refusal = owngun == 'refused_(3)'
owngun = owngun.mask(is_refusal, np.nan)
# Only the 'no' / 'yes' categories should remain in the tally.
owngun.value_counts()

owngun
no     27551
yes    17800
Name: count, dtype: int64

In [315]:
# Re-count missing owngun responses after the refusals were recoded, using the
# vectorized Series reduction instead of Python's built-in sum().
print('Total missing: ', owngun.isnull().sum())

Total missing:  27039


In [316]:
# Write the recoded Series back into the main frame, then release the temporary name.
df['owngun'] = owngun
del owngun
# Confirm the frame's column matches the recoded Series (vectorized count
# rather than the built-in sum()).
print('Total missing: ', df['owngun'].isnull().sum())
df['owngun'].value_counts()

Total missing:  27039


owngun
no     27551
yes    17800
Name: count, dtype: int64

Make a subset of the dataframe for observations with valid responses to the owngun question

In [317]:
# Keep only the rows with an answer to the gun-ownership question.
# NOTE(review): dropna returns a new frame, so the hunt recoding applied to
# `df` in later cells is NOT reflected in df_owngun; it keeps the raw hunt
# categories.
df_owngun = df.dropna(axis=0, subset=['owngun'])
# Sanity check: no missing owngun values should remain (vectorized count
# rather than the built-in sum()).
print(df_owngun['owngun'].isnull().sum())
df_owngun.head(10)

0


Unnamed: 0,year,id,gunlaw,owngun,hunt
1613,1973,1,oppose,yes,
1614,1973,2,oppose,yes,
1615,1973,3,oppose,yes,
1616,1973,4,oppose,yes,
1617,1973,5,favor,yes,
1618,1973,6,favor,no,
1619,1973,7,favor,no,
1620,1973,8,favor,no,
1621,1973,9,favor,no,
1622,1973,10,favor,no,


### Clean hunt

In [318]:
# Work on the hunting column by itself while inspecting and recoding it.
hunt = df['hunt']
# Tally the answered categories only; missing responses are counted in the next cell.
hunt.value_counts(dropna=True)

hunt
no, neither r nor spouse hunts    33607
yes, respondent does               4975
yes, spouse does                   2072
yes, both do                        920
Name: count, dtype: int64

In [319]:
# Count missing hunt responses with the vectorized Series reduction instead of
# Python's built-in sum(), which walks the Series one element at a time.
print('Total missing: ', hunt.isnull().sum())

Total missing:  30816


Combine the responses for hunt so the variable simply records whether the respondent or their spouse hunts ("yes") or neither does ("no")

In [320]:
# Collapse the four raw answers into a binary yes/no in a single pass.
hunt_recode = {
    'yes, respondent does': 'yes',
    'yes, spouse does': 'yes',
    'yes, both do': 'yes',
    'no, neither r nor spouse hunts': 'no',
}
hunt = hunt.replace(hunt_recode)
# Only the 'no' / 'yes' categories should remain in the tally.
hunt.value_counts()

hunt
no     33607
yes     7967
Name: count, dtype: int64

In [321]:
# Write the recoded Series back into the main frame, then release the temporary name.
df['hunt'] = hunt
del hunt
# Confirm the frame's column matches the recoded Series (vectorized count
# rather than the built-in sum()).
print('Total missing: ', df['hunt'].isnull().sum())
df['hunt'].value_counts()

Total missing:  30816


hunt
no     33607
yes     7967
Name: count, dtype: int64

Make a subset of the dataframe for observations with valid responses to the hunt question

In [322]:
# Keep only the rows with an answer to the hunting question.
df_hunt = df.dropna(axis=0, subset=['hunt'])
# Sanity check: no missing hunt values should remain (vectorized count rather
# than the built-in sum()).
print(df_hunt['hunt'].isnull().sum())
df_hunt.head(10)

0


Unnamed: 0,year,id,gunlaw,owngun,hunt
7590,1977,1,favor,no,no
7591,1977,2,oppose,no,no
7592,1977,3,favor,no,no
7593,1977,4,favor,no,no
7594,1977,5,favor,no,no
7595,1977,6,favor,no,no
7596,1977,7,favor,no,no
7597,1977,8,favor,no,no
7598,1977,9,oppose,no,no
7599,1977,10,favor,no,no


### Final cleaning

Drop rows in overall df where all variables of interest (gunlaw, owngun, and hunt) are nan

In [323]:
# Variables of interest: a row is dropped only when every one of them is missing.
analysis_cols = ['gunlaw', 'owngun', 'hunt']

# Report the frame's shape before and after the filter.
print(df.shape)
df = df.dropna(axis='index', subset=analysis_cols, how='all')
print(df.shape)

(72390, 5)
(49129, 5)


Export to csv

In [324]:
# Select the output columns by name with `columns=` instead of relabelling
# them positionally with `header=`: a header list would silently attach the
# wrong labels if the frame's column order ever changed. The index is not
# written.
df.to_csv('cleaned_gss.csv', columns=['year', 'id', 'gunlaw', 'owngun', 'hunt'], index=False)

# Results

In [325]:
# Share of gun owners within each gun-law stance (rows sum to 1), pooled over all years.
# TODO: repeat by roughly 10-year ranges to see whether there is a trend over time.
pd.crosstab(index=df['gunlaw'], columns=df['owngun'], normalize='index')

owngun,no,yes
gunlaw,Unnamed: 1_level_1,Unnamed: 2_level_1
favor,0.668511,0.331489
oppose,0.411541,0.588459


In [326]:
# Share of hunting households within each gun-law stance (rows sum to 1), pooled over all years.
pd.crosstab(index=df['gunlaw'], columns=df['hunt'], normalize='index')

hunt,no,yes
gunlaw,Unnamed: 1_level_1,Unnamed: 2_level_1
favor,0.85851,0.14149
oppose,0.650559,0.349441
