In [1]:
#loads the package pandas into this notebook environment
#uses 'pd' as an abbreviation for the package

import pandas as pd

In [2]:
#reads the csv file from disk and stores it as a dataframe in this notebook environment
bellevue_df = pd.read_csv("../datasets/bellevue_almshouse_modified.csv")

#caps the number of rows pandas will display as output at 20
pd.set_option('display.max_rows', 20)

In [5]:
#information about the dataframe: the names of the variables (columns),
#how many non-null (non-missing) items each one holds, and the data-type of the variables (columns)
#object = strings, float64 = float, int64 = integer, datetime64[ns] = date-time

bellevue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9584 entries, 0 to 9583
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date_in     9584 non-null   object 
 1   first_name  9580 non-null   object 
 2   last_name   9584 non-null   object 
 3   age         9534 non-null   float64
 4   disease     6497 non-null   object 
 5   profession  8565 non-null   object 
 6   gender      9584 non-null   object 
 7   children    37 non-null     object 
dtypes: float64(1), object(7)
memory usage: 599.1+ KB


In [3]:
#displays the first 10 rows of the dataframe (the rows labelled 0 to 9 here)
#a quick way to see what the values in each column look like

bellevue_df.head(10)

Unnamed: 0,date_in,first_name,last_name,age,disease,profession,gender,children
0,1847-04-17,Mary,Gallagher,28.0,recent emigrant,married,w,Child Alana 10 days
1,1847-04-08,John,Sanin (?),19.0,recent emigrant,laborer,m,Catherine 2 mo
2,1847-04-17,Anthony,Clark,60.0,recent emigrant,laborer,m,Charles Riley afed 10 days
3,1847-04-08,Lawrence,Feeney,32.0,recent emigrant,laborer,m,Child
4,1847-04-13,Henry,Joyce,21.0,recent emigrant,,m,Child 1 mo
5,1847-04-14,Bridget,Hart,20.0,recent emigrant,spinster,w,Child
6,1847-04-14,Mary,Green,40.0,recent emigrant,spinster,w,And child 2 months
7,1847-04-19,Daniel,Loftus,27.0,destitution,laborer,m,
8,1847-04-10,James,Day,35.0,recent emigrant,laborer,m,
9,1847-04-10,Margaret,Farrell,30.0,recent emigrant,widow,w,


In [4]:
#looks at a random 10 row sample from the dataframe
#random_state fixes the seed so the same 10 rows are drawn on every run,
#which keeps the notebook reproducible after a kernel restart

bellevue_df.sample(10, random_state=42)

Unnamed: 0,date_in,first_name,last_name,age,disease,profession,gender,children
7629,1847-12-09,Dennis,Sullivan,49.0,ophthalmia,laborer,m,
5967,1847-07-16,Archibald,Johnston,25.0,sickness,tailor,m,
4162,1846-08-21,Sarah,Brady,56.0,,widow,w,
6049,1847-07-22,Mary,Diffen,32.0,pregnant,widow,w,
2670,1846-03-03,Margaret,Devine,25.0,,spinster,w,
8527,1847-06-17,Henry,Hays,37.0,erysipelas,engraver,m,
7233,1847-10-30,Bridget,Odee,33.0,destitution,married,w,
7286,1847-11-03,Ann,McCluskey,35.0,sickness,widow,w,
2754,1846-03-23,Sarah,Louza,34.0,,married,w,
1934,1847-04-24,Edward,Gormley,30.0,typhus,laborer,m,


In [6]:
#transforms the date_in column from strings (dtype object) to a date-time datatype
#note: despite its name, bellevue_int holds the original date_in strings, not integers

bellevue_int = bellevue_df['date_in']
#format='%Y-%m-%d' tells pandas the dates are written as year-month-day, e.g. 1847-04-17
bellevue_df['date_in'] = pd.to_datetime(bellevue_int, format='%Y-%m-%d')
#re-checks the datatypes to confirm date_in is now datetime64[ns]
bellevue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9584 entries, 0 to 9583
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date_in     9584 non-null   datetime64[ns]
 1   first_name  9580 non-null   object        
 2   last_name   9584 non-null   object        
 3   age         9534 non-null   float64       
 4   disease     6497 non-null   object        
 5   profession  8565 non-null   object        
 6   gender      9584 non-null   object        
 7   children    37 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 599.1+ KB


In [7]:
#calculate summary statistics of all numerical variables (columns)
#date-time columns are summarised as well, which is why date_in appears next to age
#summary statistics: measures of central tendency (e.g. average) and measures of dispersion (how wide the spread of the data is)

bellevue_df.describe()

Unnamed: 0,date_in,age
count,9584,9534.0
mean,1847-03-02 22:38:42.871452160,30.332604
min,1846-01-01 00:00:00,0.08
25%,1846-10-10 00:00:00,21.0
50%,1847-04-21 00:00:00,28.0
75%,1847-07-09 00:00:00,39.0
max,1847-12-31 00:00:00,97.0
std,,14.179608


In [8]:
#calculate summary statistics for all variables, including the text (object) columns
#text columns get count, unique, top (most frequent value) and freq (how often top occurs);
#statistics that do not apply to a column are left empty

bellevue_df.describe(include='all')

Unnamed: 0,date_in,first_name,last_name,age,disease,profession,gender,children
count,9584,9580,9584,9534.0,6497,8565,9584,37
unique,,523,3142,,75,172,5,36
top,,Mary,Kelly,,sickness,laborer,m,Child
freq,,979,137,,2706,3108,4958,2
mean,1847-03-02 22:38:42.871452160,,,30.332604,,,,
min,1846-01-01 00:00:00,,,0.08,,,,
25%,1846-10-10 00:00:00,,,21.0,,,,
50%,1847-04-21 00:00:00,,,28.0,,,,
75%,1847-07-09 00:00:00,,,39.0,,,,
max,1847-12-31 00:00:00,,,97.0,,,,


In [9]:
#builds a True/False (boolean) Series with one entry per row:
#True for every row that has an exact duplicate elsewhere in the dataframe
#keep=False flags all copies of a duplicated row, not only the later repeats
duplicate_mask = bellevue_df.duplicated(keep=False)

#uses the True/False Series to filter the original dataframe and
#presents the duplicated rows as the output
bellevue_df[duplicate_mask]

Unnamed: 0,date_in,first_name,last_name,age,disease,profession,gender,children
156,1847-05-25,Ann,Maher,23.0,recent emigrant,married,w,
157,1847-05-25,Ann,Maher,23.0,recent emigrant,married,w,
520,1847-05-20,Nancy,Moran,37.0,recent emigrant,spinster,w,
2132,1847-03-20,Simon,Donally,17.0,recent emigrant,laborer,m,
2133,1847-03-20,Simon,Donally,17.0,recent emigrant,laborer,m,
...,...,...,...,...,...,...,...,...
5298,1847-03-19,Michael,McGowen,38.0,destitution,laborer,m,
5299,1847-03-19,Michael,McGowen,38.0,destitution,laborer,m,
5300,1847-03-19,Michael,McGowen,38.0,destitution,laborer,m,
5301,1847-03-19,Michael,McGowen,38.0,destitution,laborer,m,


In [10]:
#removes the repeated rows, retaining only the first occurrence of each
#(keeping the first occurrence is the default behaviour of drop_duplicates)
bellevue_df = bellevue_df.drop_duplicates()

#sanity check: filtering on the duplicate flags should now return no rows
bellevue_df.loc[bellevue_df.duplicated(keep=False)]

Unnamed: 0,date_in,first_name,last_name,age,disease,profession,gender,children


In [18]:
#checks the columns in the dataframe
#note: when the notebook is run from top to bottom this cell comes before the
#rename of 'date_in' to 'admission', so the first column is still called 'date_in' here

bellevue_df.columns

Index(['admission', 'first_name', 'last_name', 'age', 'disease', 'profession',
       'gender', 'children'],
      dtype='object')

In [12]:
#selects a single column from the dataset with 2 []
#the inner [] is a list of column names, so the result is a (one-column) dataframe;
#a single [] such as bellevue_df['disease'] would give a Series instead

bellevue_df[['disease']]

Unnamed: 0,disease
0,recent emigrant
1,recent emigrant
2,recent emigrant
3,recent emigrant
4,recent emigrant
...,...
9579,
9580,lame
9581,
9582,


In [13]:
#can also select multiple columns from the dataset with 2 []:
#list every wanted column name inside the inner []; the columns are returned in the order listed

bellevue_df[['disease','profession','gender']]

Unnamed: 0,disease,profession,gender
0,recent emigrant,married,w
1,recent emigrant,laborer,m
2,recent emigrant,laborer,m
3,recent emigrant,laborer,m
4,recent emigrant,,m
...,...,...,...
9579,,,w
9580,lame,superintendent,m
9581,,,m
9582,,,w


In [14]:
#swaps the old column name (date_in) for the new one (admission);
#rename returns a new dataframe, so the result is assigned back to bellevue_df
bellevue_df = bellevue_df.rename({'date_in': 'admission'}, axis='columns')

#lists the column names to confirm the change
bellevue_df.columns

Index(['admission', 'first_name', 'last_name', 'age', 'disease', 'profession',
       'gender', 'children'],
      dtype='object')

In [15]:
#counts how many times each distinct value occurs in a column,
#listed from most to least frequent (missing values are not counted by default)

bellevue_df['disease'].value_counts()

disease
sickness           2703
recent emigrant    1965
destitution         834
fever               192
insane              138
                   ... 
orchitis              1
del femur             1
throat cut            1
ague                  1
asthma                1
Name: count, Length: 75, dtype: int64

In [20]:
#counts each distinct value in a column and keeps only the 10 most frequent
#by slicing the first 10 positions of the (already sorted) result
bellevue_df['disease'].value_counts().iloc[:10]

disease
sickness           2703
recent emigrant    1965
destitution         834
fever               192
insane              138
pregnant            134
sore                 79
intemperance         71
illegible            47
typhus               46
Name: count, dtype: int64