### US Baby Names 데이터셋 분석하기

In [1]:
%matplotlib nbagg
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### 1. 각 (연도, 성별) 그룹의 총 출생횟수 산출하기


In [2]:
names = pd.read_csv("data/NationalNames.csv", sep=",", header=0, names=["id","name","year","sex","births"])

In [3]:
names.head()

Unnamed: 0,id,name,year,sex,births
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746


In [4]:
names.shape

(1825433, 5)

In [5]:
total_births = names.pivot_table("births", index="year", columns="sex", aggfunc=sum)

In [6]:
total_births.head()

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,90993,110491
1881,91954,100745
1882,107850,113688
1883,112321,104629
1884,129022,114445


In [7]:
total_births.tail()

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,1772738,1913851
2011,1753500,1893230
2012,1753922,1889414
2013,1745339,1881463
2014,1768775,1901376


In [8]:
ax = total_births.plot()

<IPython.core.display.Javascript object>

In [9]:
ax.set_title("total births by sex and year")

<matplotlib.text.Text at 0x110217190>

 #### 2. 각 (연도, 성별) 그룹 내에서 각 이름의 출생횟수가 전체에서 차지하는 비중(%)을 나타내는 열 추가하기

In [10]:
grouped_names_dict = dict(list(names.groupby(["year","sex"])))

In [13]:
grouped_names_dict.keys()[:10]

[(1886, 'M'),
 (1958, 'F'),
 (1970, 'M'),
 (1928, 'F'),
 (1991, 'M'),
 (1915, 'M'),
 (1991, 'F'),
 (1967, 'F'),
 (1906, 'F'),
 (1915, 'F')]

In [14]:
grouped_sample = grouped_names_dict[(2011,'M')]

In [15]:
grouped_sample.head(10)

Unnamed: 0,id,name,year,sex,births
1711173,1711174,Jacob,2011,M,20331
1711174,1711175,Mason,2011,M,19488
1711175,1711176,William,2011,M,17314
1711176,1711177,Jayden,2011,M,16954
1711177,1711178,Noah,2011,M,16838
1711178,1711179,Michael,2011,M,16744
1711179,1711180,Ethan,2011,M,16665
1711180,1711181,Alexander,2011,M,15681
1711181,1711182,Aiden,2011,M,15469
1711182,1711183,Daniel,2011,M,15249


In [16]:
def add_prop(agg_df):
    agg_births = agg_df["births"]
    agg_df["prop"] = agg_births / agg_births.sum()
    return agg_df

In [17]:
names_with_prop = names.groupby(["year","sex"]).apply(add_prop)

In [18]:
names_with_prop.head()

Unnamed: 0,id,name,year,sex,births,prop
0,1,Mary,1880,F,7065,0.077643
1,2,Anna,1880,F,2604,0.028618
2,3,Emma,1880,F,2003,0.022013
3,4,Elizabeth,1880,F,1939,0.021309
4,5,Minnie,1880,F,1746,0.019188


#### 3. 각 (연도, 성별) 그룹 내 출생횟수 기준 TOP 1000 이름 추출하기


In [19]:
grouped_names_with_prop_dict = dict(list(names_with_prop.groupby(["year","sex"])))

In [20]:
grouped_sample = grouped_names_with_prop_dict[(2011,'M')]

In [21]:
grouped_sample.head(10)

Unnamed: 0,id,name,year,sex,births,prop
1711173,1711174,Jacob,2011,M,20331,0.010739
1711174,1711175,Mason,2011,M,19488,0.010294
1711175,1711176,William,2011,M,17314,0.009145
1711176,1711177,Jayden,2011,M,16954,0.008955
1711177,1711178,Noah,2011,M,16838,0.008894
1711178,1711179,Michael,2011,M,16744,0.008844
1711179,1711180,Ethan,2011,M,16665,0.008802
1711180,1711181,Alexander,2011,M,15681,0.008283
1711181,1711182,Aiden,2011,M,15469,0.008171
1711182,1711183,Daniel,2011,M,15249,0.008054


In [22]:
grouped_sample.sort_values(by="births", ascending=False).iloc[:10]

Unnamed: 0,id,name,year,sex,births,prop
1711173,1711174,Jacob,2011,M,20331,0.010739
1711174,1711175,Mason,2011,M,19488,0.010294
1711175,1711176,William,2011,M,17314,0.009145
1711176,1711177,Jayden,2011,M,16954,0.008955
1711177,1711178,Noah,2011,M,16838,0.008894
1711178,1711179,Michael,2011,M,16744,0.008844
1711179,1711180,Ethan,2011,M,16665,0.008802
1711180,1711181,Alexander,2011,M,15681,0.008283
1711181,1711182,Aiden,2011,M,15469,0.008171
1711182,1711183,Daniel,2011,M,15249,0.008054


In [23]:
def get_top1000(agg_df):
    top1000_df = agg_df.sort_values(by="births",ascending=False).iloc[:1000]
    return top1000_df

In [24]:
top1000_names = names_with_prop.groupby(["year","sex"]).apply(get_top1000)

In [25]:
top1000_names.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,name,year,sex,births,prop
year,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1880,F,0,1,Mary,1880,F,7065,0.077643
1880,F,1,2,Anna,1880,F,2604,0.028618
1880,F,2,3,Emma,1880,F,2003,0.022013
1880,F,3,4,Elizabeth,1880,F,1939,0.021309
1880,F,4,5,Minnie,1880,F,1746,0.019188


In [26]:
names_with_prop.loc[(names_with_prop["year"] == 2011) & \
                    (names_with_prop["sex"] == "M")].shape

(14329, 6)

In [27]:
top1000_names.loc[(2011,"M")].shape

(1000, 6)

In [28]:
top1000_names.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,name,year,sex,births,prop
year,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1880,F,0,1,Mary,1880,F,7065,0.077643
1880,F,1,2,Anna,1880,F,2604,0.028618
1880,F,2,3,Emma,1880,F,2003,0.022013
1880,F,3,4,Elizabeth,1880,F,1939,0.021309
1880,F,4,5,Minnie,1880,F,1746,0.019188


#### 4. 각 연도에 따른 전체 출생횟수 대비 TOP 1000 이름들의 출생횟수 비중 산출하기


In [29]:
top1000_props = top1000_names.pivot_table("prop", index="year", columns="sex", aggfunc=sum)

Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)


In [30]:
top1000_props.head(10)

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,1.0,0.997375
1881,1.0,1.0
1882,0.998702,0.995646
1883,0.997596,0.998566
1884,0.993156,0.994539
1885,0.992251,0.995501
1886,0.989504,0.995035
1887,0.988279,0.996697
1888,0.984241,0.992429
1889,0.984061,0.994981


In [31]:
ax = top1000_props.plot()

<IPython.core.display.Javascript object>

In [33]:
ax.set_xticks(range(1880,2020,10))
ax.set_yticks(np.arange(0,1.3,0.1))

[<matplotlib.axis.YTick at 0x123f44810>,
 <matplotlib.axis.YTick at 0x112c4bc90>,
 <matplotlib.axis.YTick at 0x124165650>,
 <matplotlib.axis.YTick at 0x124165d50>,
 <matplotlib.axis.YTick at 0x12417b390>,
 <matplotlib.axis.YTick at 0x12417ba90>,
 <matplotlib.axis.YTick at 0x1241641d0>,
 <matplotlib.axis.YTick at 0x124165850>,
 <matplotlib.axis.YTick at 0x1240cd9d0>,
 <matplotlib.axis.YTick at 0x110279390>,
 <matplotlib.axis.YTick at 0x1240a6450>,
 <matplotlib.axis.YTick at 0x1240cde90>,
 <matplotlib.axis.YTick at 0x12310f3d0>]

#### 5. 특정 이름들의 연도에 따른 출생횟수 변화 추이 분석하기

In [34]:
top1000_names.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,name,year,sex,births,prop
year,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1880,F,0,1,Mary,1880,F,7065,0.077643
1880,F,1,2,Anna,1880,F,2604,0.028618
1880,F,2,3,Emma,1880,F,2003,0.022013
1880,F,3,4,Elizabeth,1880,F,1939,0.021309
1880,F,4,5,Minnie,1880,F,1746,0.019188


In [35]:
top_names_births = top1000_names.pivot_table("births", index="year", columns="name", aggfunc=sum)

In [36]:
top_names_births.head()

name,Aaden,Aaliyah,Aanya,Aarav,Aaron,Aarush,Ab,Abagail,Abb,Abbey,...,Zoa,Zoe,Zoey,Zoie,Zola,Zollie,Zona,Zora,Zula,Zuri
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1880,,,,,102.0,,,,,,...,8.0,23.0,,,7.0,,8.0,28.0,27.0,
1881,,,,,94.0,,,,,,...,,22.0,,,10.0,,9.0,21.0,27.0,
1882,,,,,85.0,,,,,,...,8.0,25.0,,,9.0,,17.0,32.0,21.0,
1883,,,,,105.0,,,,,,...,,23.0,,,10.0,,11.0,35.0,25.0,
1884,,,,,97.0,,,,,,...,13.0,31.0,,,14.0,6.0,8.0,58.0,27.0,


In [37]:
top_names_births_subset = top_names_births[["John","Harry","Mary","Marilyn"]]

In [38]:
top_names_births_subset.head()

name,John,Harry,Mary,Marilyn
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1880,9701.0,2158.0,7092.0,
1881,8795.0,2002.0,6948.0,
1882,9597.0,2246.0,8178.0,
1883,8934.0,2116.0,8044.0,
1884,9428.0,2338.0,9253.0,


In [39]:
ax = top_names_births_subset.plot(subplots=True, fontsize=8)

<IPython.core.display.Javascript object>

#### 6. 남아 이름의 마지막 글자가 연도에 따라 어떻게 변화하였는지 분석하기

In [40]:
# 문자열로 구성된 시리즈에 대하여 맨 마지막 글자만 추출하는 함수
get_last_letter = lambda x : x[-1]

In [41]:
names.head()

Unnamed: 0,id,name,year,sex,births
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746


In [42]:
names["last_letters"] = names["name"].apply(get_last_letter)

In [43]:
names.head()

Unnamed: 0,id,name,year,sex,births,last_letters
0,1,Mary,1880,F,7065,y
1,2,Anna,1880,F,2604,a
2,3,Emma,1880,F,2003,a
3,4,Elizabeth,1880,F,1939,h
4,5,Minnie,1880,F,1746,e


In [44]:
last_letters_table = names.pivot_table("births", index="last_letters", columns=["sex","year"], aggfunc=sum)

In [45]:
last_letters_table.head()

sex,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
year,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
last_letters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,31446.0,31581.0,36536.0,38330.0,43680.0,45408.0,49100.0,48942.0,59441.0,58632.0,...,36877.0,36210.0,34723.0,32988.0,31573.0,28814.0,27384.0,27136.0,27299.0,27931.0
b,,,,,,,,,,,...,43178.0,42645.0,42190.0,40047.0,39038.0,39208.0,36605.0,34626.0,33089.0,31085.0
c,,,5.0,5.0,,,,,,,...,26102.0,26661.0,26893.0,25365.0,24127.0,23307.0,23085.0,24209.0,23970.0,23617.0
d,609.0,607.0,734.0,810.0,916.0,862.0,1007.0,1027.0,1298.0,1374.0,...,50730.0,51474.0,50686.0,48018.0,46310.0,44758.0,43158.0,42376.0,42533.0,43641.0
e,33380.0,34080.0,40399.0,41913.0,48089.0,49616.0,53884.0,54353.0,66750.0,66661.0,...,141237.0,143126.0,143915.0,141234.0,135857.0,130073.0,128572.0,128174.0,131725.0,131036.0


In [46]:
three_years_subtable = last_letters_table.reindex(columns=[1910,1960,2010], level="year")

In [47]:
three_years_subtable.head()

sex,F,F,F,M,M,M
year,1910,1960,2010,1910,1960,2010
last_letters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,108397.0,691245.0,675901.0,977.0,5214.0,28814.0
b,,694.0,454.0,411.0,3912.0,39208.0
c,5.0,49.0,953.0,482.0,15466.0,23307.0
d,6751.0,3728.0,2635.0,22113.0,262143.0,44758.0
e,133601.0,435048.0,316288.0,28665.0,178810.0,130073.0


In [48]:
three_years_subtable.sum()

sex  year
F    1910     396501.0
     1960    2022093.0
     2010    1772738.0
M    1910     194218.0
     1960    2132717.0
     2010    1913851.0
dtype: float64

In [49]:
three_years_letters_prop = three_years_subtable / three_years_subtable.sum()

In [50]:
three_years_letters_prop.head()

sex,F,F,F,M,M,M
year,1910,1960,2010,1910,1960,2010
last_letters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,0.273384,0.341846,0.381275,0.00503,0.002445,0.015056
b,,0.000343,0.000256,0.002116,0.001834,0.020486
c,1.3e-05,2.4e-05,0.000538,0.002482,0.007252,0.012178
d,0.017026,0.001844,0.001486,0.113857,0.122915,0.023386
e,0.33695,0.215147,0.178418,0.147592,0.083841,0.067964


In [51]:
fig, axes = plt.subplots(2,1)

<IPython.core.display.Javascript object>

In [52]:
three_years_letters_prop["M"].plot(kind="bar",ax=axes[0], title="Male")

<matplotlib.axes._subplots.AxesSubplot at 0x122c2ba10>

In [53]:
three_years_letters_prop["F"].plot(kind="bar", ax=axes[1], title="Female")

<matplotlib.axes._subplots.AxesSubplot at 0x122e68850>

In [54]:
# 그래프 정렬 (글자겹침문제해소)
plt.tight_layout()

In [55]:
last_letters_table.head()

sex,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
year,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
last_letters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,31446.0,31581.0,36536.0,38330.0,43680.0,45408.0,49100.0,48942.0,59441.0,58632.0,...,36877.0,36210.0,34723.0,32988.0,31573.0,28814.0,27384.0,27136.0,27299.0,27931.0
b,,,,,,,,,,,...,43178.0,42645.0,42190.0,40047.0,39038.0,39208.0,36605.0,34626.0,33089.0,31085.0
c,,,5.0,5.0,,,,,,,...,26102.0,26661.0,26893.0,25365.0,24127.0,23307.0,23085.0,24209.0,23970.0,23617.0
d,609.0,607.0,734.0,810.0,916.0,862.0,1007.0,1027.0,1298.0,1374.0,...,50730.0,51474.0,50686.0,48018.0,46310.0,44758.0,43158.0,42376.0,42533.0,43641.0
e,33380.0,34080.0,40399.0,41913.0,48089.0,49616.0,53884.0,54353.0,66750.0,66661.0,...,141237.0,143126.0,143915.0,141234.0,135857.0,130073.0,128572.0,128174.0,131725.0,131036.0


In [56]:
letters_prop = last_letters_table / last_letters_table.sum()

In [57]:
letters_prop.head()

sex,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
year,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
last_letters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,0.345587,0.343443,0.338767,0.341254,0.338547,0.341272,0.33971,0.335261,0.332766,0.328717,...,0.018486,0.017643,0.016757,0.0162,0.015952,0.015056,0.014464,0.014362,0.014509,0.01469
b,,,,,,,,,,,...,0.021645,0.020778,0.020361,0.019667,0.019723,0.020486,0.019335,0.018326,0.017587,0.016349
c,,,4.6e-05,4.5e-05,,,,,,,...,0.013085,0.01299,0.012978,0.012456,0.01219,0.012178,0.012193,0.012813,0.01274,0.012421
d,0.006693,0.006601,0.006806,0.007211,0.0071,0.006479,0.006967,0.007035,0.007267,0.007703,...,0.025431,0.02508,0.024461,0.023581,0.023397,0.023386,0.022796,0.022428,0.022606,0.022952
e,0.366841,0.37062,0.374585,0.373154,0.372719,0.372898,0.372809,0.372327,0.373684,0.373732,...,0.070801,0.069737,0.069452,0.069359,0.068639,0.067964,0.067911,0.067838,0.070012,0.068916


In [58]:
dny_prop = letters_prop.loc[["d","n","y"],"M"]

In [59]:
dny_prop.head()

year,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
last_letters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d,0.083057,0.08324,0.085339,0.084059,0.08612,0.085473,0.087648,0.089071,0.08771,0.091921,...,0.025431,0.02508,0.024461,0.023581,0.023397,0.023386,0.022796,0.022428,0.022606,0.022952
n,0.153216,0.153209,0.149558,0.15165,0.149924,0.146354,0.149661,0.148845,0.151291,0.151984,...,0.344296,0.351653,0.358228,0.361046,0.362462,0.362458,0.364154,0.360269,0.354485,0.351338
y,0.075762,0.077453,0.077537,0.079146,0.080405,0.081883,0.081681,0.082868,0.084921,0.086333,...,0.062807,0.060342,0.059643,0.060346,0.057242,0.058183,0.058074,0.057568,0.056305,0.05501


In [61]:
# 행과 열의 위치를 바꾼다.
dny_prop.transpose().iloc[:10]

last_letters,d,n,y
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1880,0.083057,0.153216,0.075762
1881,0.08324,0.153209,0.077453
1882,0.085339,0.149558,0.077537
1883,0.084059,0.15165,0.079146
1884,0.08612,0.149924,0.080405
1885,0.085473,0.146354,0.081883
1886,0.087648,0.149661,0.081681
1887,0.089071,0.148845,0.082868
1888,0.08771,0.151291,0.084921
1889,0.091921,0.151984,0.086333


In [62]:
dny_prop.transpose().plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x12615b310>