Created by Lisa J Cohen (ljcohen) and David Yakobovitch (davidyakobovitch)

Edits and annotations by Sabah Ul-Hasan (sul-hasan)

In [12]:
# Import pandas, the data-analysis library used throughout this notebook.
# Note: this line only imports pandas; it does not install it, so pandas must
# already be available in the environment.
# If the import succeeds, nothing is printed when the cell is executed.
import pandas as pd

# Step 1: Loading and viewing the data

In [13]:
# Location of the survey data on disk.
# NOTE: this is an absolute path on the original author's computer. Change
# SURVEYS_CSV_PATH so that it points to wherever surveys.csv lives on yours.
SURVEYS_CSV_PATH = "/Users/sul-hasan/Desktop/ToOrganize/DatASci/DC_Oct26_SUH/DC_Oct27_GitHubFolder/surveys.csv"

# Read the CSV file into a pandas DataFrame.
# A wrong path makes this line fail, so double-check the constant above first.
surveys_df = pd.read_csv(SURVEYS_CSV_PATH)

In [14]:
# What are the columns in the data frame?
# .columns is an attribute (no parentheses); it holds the column labels.
surveys_df.columns

Index([u'record_id', u'month', u'day', u'year', u'plot_id', u'species_id',
       u'sex', u'hindfoot_length', u'weight'],
      dtype='object')

In [30]:
# Preview the top of the table.
# head() returns the first n rows; n=5 is the default, written out here for clarity.
surveys_df.head(n=5)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,


In [31]:
# None of the first 5 rows has a weight listed.
# We can specify how many rows head() should return, so that we can look
# further down the table for rows that do have weight values.
surveys_df.head(n=100)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,
5,6,7,16,1977,1,PF,M,14.0,
6,7,7,16,1977,2,PE,F,,
7,8,7,16,1977,1,DM,M,37.0,
8,9,7,16,1977,1,DM,F,34.0,
9,10,7,16,1977,6,PF,F,20.0,


In [15]:
# Which distinct years occur in the data frame?
# Select the 'year' column, then ask it for its unique values.
surveys_df['year'].unique()

array([1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987,
       1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
       1999, 2000, 2001, 2002])

In [16]:
# How many distinct years are there?
# unique() gives the distinct values; len() counts them.
len(surveys_df['year'].unique())

26

In [17]:
# Summary statistics for the weight column: count, mean, std, min, quartiles, max.
# The same approach works for any numeric column.
# The percentiles listed are the defaults (25%, 50%, 75%), written out explicitly.
surveys_df['weight'].describe(percentiles=[0.25, 0.5, 0.75])

count    32283.000000
mean        42.672428
std         36.631259
min          4.000000
25%         20.000000
50%         37.000000
75%         48.000000
max        280.000000
Name: weight, dtype: float64

In [21]:
# Output only the mean of the weight column.
# Missing weights are ignored; skipna=True is the default, shown here explicitly.
surveys_df['weight'].mean(skipna=True)

42.672428212991356

In [22]:
# Output only the standard deviation of the weight column.
# pandas computes the sample standard deviation; ddof=1 is the default, shown explicitly.
surveys_df['weight'].std(ddof=1)

36.63125947458399

# Step 2: Assessing general patterns by group 

In [18]:
# Split the data frame into groups based on the values of one column.
# Using the sex column as the example: all females form one group, all males another.
grouped_surveys = surveys_df.groupby(by='sex')

In [23]:
# Now that the data is grouped, what is the mean of each column (females vs. males)?
# numeric_only=True restricts the calculation to numeric columns. Recent pandas
# versions no longer drop text columns such as species_id automatically and
# raise a TypeError instead, so we ask for numeric columns explicitly.
grouped_surveys.mean(numeric_only=True)

Unnamed: 0_level_0,record_id,month,day,year,plot_id,hindfoot_length,weight
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,18036.412046,6.583047,16.007138,1990.644997,11.440854,28.83678,42.170555
M,17754.835601,6.392668,16.184286,1990.480401,11.098282,29.709578,42.995379


In [24]:
# What happens when we group the data by two categories?
# Passing a list of column names creates one group per (plot_id, sex) combination.
grouped_surveys2 = surveys_df.groupby(by=['plot_id', 'sex'])

In [26]:
# Look at all the summary metrics per group (not only the mean).
# describe() produces a wide table, so show just its first 5 rows.
grouped_surveys2.describe().head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,day,day,day,day,day,day,day,hindfoot_length,hindfoot_length,...,weight,weight,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
plot_id,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,F,848.0,15.338443,8.325993,1.0,9.0,15.0,22.0,31.0,808.0,31.733911,...,50.0,196.0,848.0,1990.933962,7.678171,1977.0,1983.0,1991.0,1998.0,2002.0
1,M,1095.0,15.905936,8.053257,1.0,9.0,16.0,23.0,31.0,1047.0,34.30277,...,54.0,231.0,1095.0,1990.091324,7.265208,1977.0,1983.0,1990.0,1997.0,2002.0
2,F,970.0,16.28866,8.046509,1.0,10.0,16.0,23.0,31.0,918.0,30.16122,...,51.0,274.0,970.0,1990.449485,7.51991,1977.0,1984.0,1990.0,1997.0,2002.0
2,M,1144.0,15.440559,8.414667,1.0,9.0,15.0,23.0,31.0,1077.0,30.35376,...,50.0,278.0,1144.0,1990.756119,7.714444,1977.0,1983.0,1991.0,1998.0,2002.0
3,F,893.0,16.161254,7.961706,1.0,9.0,16.0,23.0,31.0,863.0,23.774044,...,34.0,199.0,893.0,1992.013438,6.811511,1977.0,1987.0,1992.0,1998.0,2002.0


# Step 3: Data visualization 

In [33]:
# We will use Matplotlib for plotting.
# The %matplotlib inline magic tells Jupyter to render figures inside the notebook.
# Let's first verify it is working (if all is well, nothing will show when executed).
%matplotlib inline