In [1]:
import numpy as np
import pandas as pd

# Load the baptism records from the CSV file and preview the first rows.
arbuthnot = pd.read_csv(filepath_or_buffer="arbuthnot.csv")
arbuthnot.head(n=5)

Unnamed: 0.1,Unnamed: 0,year,boys,girls
0,1,1629,5218,4683
1,2,1630,4858,4457
2,3,1631,4422,4102
3,4,1632,4994,4590
4,5,1633,5158,4839


In [2]:
# Show the column labels; pandas returns them as an Index object rather than a plain list.
arbuthnot.columns

Index(['Unnamed: 0', 'year', 'boys', 'girls'], dtype='object')

In [3]:
# Drop the "Unnamed: 0" column: it is the index column exported from the R
# DataFrame, and pandas already supplies its own index, so it is not needed.
# errors="ignore" keeps this cell safe to re-run; a second
# `del arbuthnot["Unnamed: 0"]` would raise KeyError once the column is gone.
arbuthnot = arbuthnot.drop(columns=["Unnamed: 0"], errors="ignore")
arbuthnot.head()

Unnamed: 0,year,boys,girls
0,1629,5218,4683
1,1630,4858,4457
2,1631,4422,4102
3,1632,4994,4590
4,1633,5158,4839


In [4]:
# Dimensions of the DataFrame as (rows, columns): 82 observations of 3 variables.
arbuthnot.shape

(82, 3)

In [5]:
# Data type of each column; year, boys and girls are all 64-bit integers (int64).
arbuthnot.dtypes

year     int64
boys     int64
girls    int64
dtype: object

In [26]:
# Select a single column by its label; the result is a pandas Series.
# Long Series are abbreviated with "..." when displayed.
arbuthnot["boys"]

0     5218
1     4858
2     4422
3     4994
4     5158
5     5035
6     5106
7     4917
8     4703
9     5359
10    5366
11    5518
12    5470
13    5460
14    4793
15    4107
16    4047
17    3768
18    3796
19    3363
20    3079
21    2890
22    3231
23    3220
24    3196
25    3441
26    3655
27    3668
28    3396
29    3157
      ... 
52    6822
53    6909
54    7577
55    7575
56    7484
57    7575
58    7737
59    7487
60    7604
61    7909
62    7662
63    7602
64    7676
65    6985
66    7263
67    7632
68    8062
69    8426
70    7911
71    7578
72    8102
73    8031
74    7765
75    6113
76    8366
77    7952
78    8379
79    8239
80    7840
81    7640
Name: boys, dtype: int64

In [16]:
%matplotlib notebook
import matplotlib.pyplot as plt

# Scatter plot of the number of girls baptized per year
plt.scatter(arbuthnot["year"], arbuthnot["girls"])
plt.xlabel("Year")
plt.ylabel("Girls")
plt.show()

<IPython.core.display.Javascript object>

In [9]:
# Derive the yearly total (boys + girls) and store it in a new "Total" column.
arbuthnot["Total"] = arbuthnot["boys"].add(arbuthnot["girls"])
arbuthnot.head(n=5)

Unnamed: 0,year,boys,girls,Total
0,1629,5218,4683,9901
1,1630,4858,4457,9315
2,1631,4422,4102,8524
3,1632,4994,4590,9584
4,1633,5158,4839,9997


In [17]:
%matplotlib notebook

# Line plot of the total number of baptisms per year
plt.plot(arbuthnot["year"], arbuthnot["Total"])
plt.xlabel("Year")
plt.ylabel("Total")
plt.show()

<IPython.core.display.Javascript object>

In [10]:
# Share of boys among all baptisms each year, kept in a "prop_boys" column.
arbuthnot["prop_boys"] = arbuthnot["boys"].div(arbuthnot["Total"])
arbuthnot.head(n=5)

Unnamed: 0,year,boys,girls,Total,prop_boys
0,1629,5218,4683,9901,0.527017
1,1630,4858,4457,9315,0.521524
2,1631,4422,4102,8524,0.518771
3,1632,4994,4590,9584,0.521077
4,1633,5158,4839,9997,0.515955


In [20]:
%matplotlib notebook

# Scatter plot and line plot of the proportion of boys baptized per year
plt.scatter(arbuthnot["year"], arbuthnot["prop_boys"])
plt.plot(arbuthnot["year"], arbuthnot["prop_boys"])
plt.xlabel("Year")
plt.ylabel("prop_boys")
plt.show()

<IPython.core.display.Javascript object>

In [26]:
# Flag each year with True when more boys than girls were baptized.
arbuthnot["more_boys"] = arbuthnot["boys"].gt(arbuthnot["girls"])
arbuthnot.head(n=5)

Unnamed: 0,year,boys,girls,Total,prop_boys,more_boys
0,1629,5218,4683,9901,0.527017,True
1,1630,4858,4457,9315,0.521524,True
2,1631,4422,4102,8524,0.518771,True
3,1632,4994,4590,9584,0.521077,True
4,1633,5158,4839,9997,0.515955,True


In [29]:
# Dataset 2: Present birth records

# Load the file and discard the leftover "Unnamed: 0" column in one step.
present = pd.read_csv("present.csv").drop(columns="Unnamed: 0")
present.head(n=5)

Unnamed: 0,year,boys,girls
0,1940,1211684,1148715
1,1941,1289734,1223693
2,1942,1444365,1364631
3,1943,1508959,1427901
4,1944,1435301,1359499


In [30]:
# Summary statistics for every numeric column; the "min" and "max" rows of
# "year" give the range of years covered by this dataset.
present.describe()

Unnamed: 0,year,boys,girls
count,74.0,74.0,74.0
mean,1976.5,1917502.0,1825037.0
std,21.505813,223897.8,216290.4
min,1940.0,1211684.0,1148715.0
25%,1958.25,1823071.0,1731210.0
50%,1976.5,1988038.0,1897810.0
75%,1994.75,2076156.0,1979778.0
max,2013.0,2208071.0,2108162.0


In [31]:
# Yearly total births and the proportion of them that were boys.
present["total"] = present["boys"].add(present["girls"])
present["prop_boys"] = present["boys"].div(present["total"])
present.head(n=5)

Unnamed: 0,year,boys,girls,total,prop_boys
0,1940,1211684,1148715,2360399,0.513339
1,1941,1289734,1223693,2513427,0.513138
2,1942,1444365,1364631,2808996,0.514193
3,1943,1508959,1427901,2936860,0.5138
4,1944,1435301,1359499,2794800,0.513561


In [33]:
%matplotlib notebook

plt.scatter(present["year"], present["prop_boys"])
plt.plot(present["year"], present["prop_boys"])
plt.xlabel("year")
plt.ylabel("prop_boys")
plt.show()

<IPython.core.display.Javascript object>

In [42]:
%matplotlib notebook

present["prop_boy_girl"] = present["girls"] / present["boys"]

plt.style.use("fivethirtyeight")
plt.scatter(present["year"], present["prop_boy_girl"])
plt.xlabel("year")
plt.ylabel("prop_boy_girl")
plt.show()

<IPython.core.display.Javascript object>

In [39]:
# Reorder `present` in place so the years with the largest "total" come first.
present.sort_values(by="total", ascending=False, inplace=True)
present.head(n=5)

Unnamed: 0,year,boys,girls,total,prop_boys,prop_boy_girl
67,2007,2208071,2108162,4316233,0.511574,0.954753
21,1961,2186274,2082052,4268326,0.512209,0.952329
66,2006,2184237,2081318,4265555,0.512064,0.952881
20,1960,2179708,2078142,4257850,0.511927,0.953404
17,1957,2179960,2074824,4254784,0.512355,0.951772
