# Chapter 3: Beginning Data Analysis

## Recipes
* [Developing a data analysis routine](#Developing-a-data-analysis-routine)
* [Reducing memory by changing data types](#Reducing-memory-by-changing-data-types)
* [Selecting the smallest of the largest](#Selecting-the-smallest-of-the-largest)
* [Selecting the largest of each group by sorting](#Selecting-the-largest-of-each-group-by-sorting)
* [Duplicating nlargest with sort_values](#Duplicating-nlargest-with-sort_values)
* [Calculating a trailing stop order price](#Calculating-a-trailing-stop-order-price)

In [360]:
import pandas as pd
import numpy as np
# Allow up to 50 columns to be rendered before pandas truncates wide frames.
pd.set_option('display.max_columns', 50)

# Developing a data analysis routine

In [2]:
# Read the college dataset from the local data/ directory into a DataFrame.
college = pd.read_csv('data/college.csv')

In [3]:
# Preview the first rows to get a feel for the columns and their values.
college.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [4]:
# (number of rows, number of columns) of the frame.
college.shape

(7535, 27)

In [5]:
# Summary statistics of the numeric columns, one row per column, with the
# rendered table temporarily capped at 8 rows.
with pd.option_context('display.max_rows', 8):
    display(
        college
        .describe(include=[np.number])
        .transpose()
    )

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0000,0.00000,0.000000,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0000,0.00000,0.000000,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0000,0.00000,0.000000,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0000,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...,...
CURROPER,7535.0,0.923291,0.266146,0.0,1.0000,1.00000,1.000000,1.0
PCTPELL,6849.0,0.530643,0.225544,0.0,0.3578,0.52150,0.712900,1.0
PCTFLOAN,6849.0,0.522211,0.283616,0.0,0.3329,0.58330,0.745000,1.0
UG25ABV,6718.0,0.410021,0.228939,0.0,0.2415,0.40075,0.572275,1.0


In [6]:
# Summarise the non-numeric columns (count, unique, top, freq).
# The builtin `object` is used instead of the `np.object` alias, which newer
# NumPy releases no longer provide, and the 'category' dtype string is used
# instead of `pd.Categorical`, which is an array class rather than a dtype.
college.describe(include=[object, 'category']).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Vatterott College-Dividend,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [7]:
# DataFrame.info() writes its report to a stream rather than returning it.
# Capture the report in an in-memory text buffer so it can be trimmed before
# it is shown.
import io
buf = io.StringIO()
college.info(buf=buf)
s = buf.getvalue()

In [8]:
# The captured info() text, displayed as an escaped string repr.
s

"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 7535 entries, 0 to 7534\nData columns (total 27 columns):\nINSTNM                7535 non-null object\nCITY                  7535 non-null object\nSTABBR                7535 non-null object\nHBCU                  7164 non-null float64\nMENONLY               7164 non-null float64\nWOMENONLY             7164 non-null float64\nRELAFFIL              7535 non-null int64\nSATVRMID              1185 non-null float64\nSATMTMID              1196 non-null float64\nDISTANCEONLY          7164 non-null float64\nUGDS                  6874 non-null float64\nUGDS_WHITE            6874 non-null float64\nUGDS_BLACK            6874 non-null float64\nUGDS_HISP             6874 non-null float64\nUGDS_ASIAN            6874 non-null float64\nUGDS_AIAN             6874 non-null float64\nUGDS_NHPI             6874 non-null float64\nUGDS_2MOR             6874 non-null float64\nUGDS_NRA              6874 non-null float64\nUGDS_UNKN             6874 non-null flo

In [9]:
# Head of the info() report followed by an ellipsis. This literal was copied by
# hand from the captured text, so it does not update if the data changes.
s1 = "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 7535 entries, 0 to 7534\nData columns (total 27 columns):\nINSTNM                7535 non-null object\nCITY                  7535 non-null object\nSTABBR                7535 non-null object\nHBCU                  7164 non-null float64\n..."

In [10]:
# Tail of the info() report (last four columns plus the dtype and memory
# summary lines). Also a hand-copied literal.
s2 = "\nPCTFLOAN              6849 non-null float64\nUG25ABV               6718 non-null float64\nMD_EARN_WNE_P10       6413 non-null object\nGRAD_DEBT_MDN_SUPP    7503 non-null object\ndtypes: float64(20), int64(2), object(5)\nmemory usage: 1.6+ MB\n"

In [11]:
# Emit the shortened report: head and tail back to back, nothing in between.
print(s1, s2, sep='')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
INSTNM                7535 non-null object
CITY                  7535 non-null object
STABBR                7535 non-null object
HBCU                  7164 non-null float64
...
PCTFLOAN              6849 non-null float64
UG25ABV               6718 non-null float64
MD_EARN_WNE_P10       6413 non-null object
GRAD_DEBT_MDN_SUPP    7503 non-null object
dtypes: float64(20), int64(2), object(5)
memory usage: 1.6+ MB



In [12]:
# Numeric summary statistics with one row per column.
college.describe(include=[np.number]).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [13]:
# Summarise the non-numeric columns (count, unique, top, freq).
# The builtin `object` is used instead of the `np.object` alias, which newer
# NumPy releases no longer provide, and the 'category' dtype string is used
# instead of `pd.Categorical`, which is an array class rather than a dtype.
college.describe(include=[object, 'category']).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Vatterott College-Dividend,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


## There's more...

In [14]:
# Numeric summary with extra tail percentiles (1%-99%), shown with the
# rendered table temporarily capped at 5 rows.
with pd.option_context('display.max_rows', 5):
    display(
        college
        .describe(percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99],
                  include=[np.number])
        .transpose()
    )

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.000000,0.00000,0.00000,1.000000,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0000,0.0000,0.0000,0.0000,0.00000,0.000000,0.00000,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PCTFLOAN,6849.0,0.522211,0.283616,0.0,0.0000,0.0000,0.0000,0.3329,0.58330,0.745000,0.84752,0.89792,0.986368,1.0
UG25ABV,6718.0,0.410021,0.228939,0.0,0.0025,0.0374,0.0899,0.2415,0.40075,0.572275,0.72666,0.80000,0.917383,1.0


In [15]:
# Load the data dictionary that maps each column name to a description.
college_dd = pd.read_csv('data/college_data_dictionary.csv')

In [16]:
# Show the data dictionary, truncated to 8 rows for display only.
with pd.option_context('display.max_rows', 8):
    display(college_dd)

Unnamed: 0,column_name,description
0,INSTNM,Institution Name
1,CITY,City Location
2,STABBR,State Abbreviation
3,HBCU,Historically Black College or University
...,...,...
23,PCTFLOAN,Percent Students with federal loan
24,UG25ABV,Percent Students Older than 25
25,MD_EARN_WNE_P10,Median Earnings 10 years after enrollment
26,GRAD_DEBT_MDN_SUPP,Median debt of completers


# Reducing memory by changing data types

In [17]:
# RELAFFIL only holds 0/1 and has no missing values, so one byte per row is enough.
college['RELAFFIL'] = college['RELAFFIL'].astype('int8')

In [18]:
# Four columns chosen because each is stored with a different dtype.
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM']
college.dtypes.loc[different_cols]

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
dtype: object

In [19]:
# Bytes used per column. deep=True asks pandas to measure the contents of
# object columns as well, so the string column INSTNM is reported in full.
college[different_cols].memory_usage(deep=True)

Index           80
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660240
dtype: int64

## There's more...

In [41]:
# Re-read the file so the frame starts again from the dtypes pandas infers.
college = pd.read_csv('data/college.csv')

In [42]:
# Baseline memory for one integer column and one string column.
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

Index           80
CURROPER     60280
INSTNM      660240
dtype: int64

In [43]:
# Store a much larger integer in one row and append one character to one
# institution name, then re-measure memory.
college.loc[0, 'CURROPER'] = 10000000
college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'
# In the output below CURROPER is unchanged while INSTNM has grown.
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

Index           80
CURROPER     60280
INSTNM      660345
dtype: int64

In [44]:
# Check how MENONLY is stored before trying to shrink it.
college['MENONLY'].dtype

dtype('float64')

In [45]:
# MENONLY contains missing values, so an integer cast is expected to fail.
# Catch and print the error so the demonstration stays visible without
# stopping a Restart & Run All of the notebook.
try:
    college['MENONLY'].astype('int8')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [46]:
# Select the columns to summarise by dtype name strings.
college.describe(include=['int64', 'float64']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [47]:
# Same selection expressed with NumPy type objects instead of strings.
college.describe(include=[np.int64, np.float64]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [48]:
# Shrink RELAFFIL to one byte per row again after the reload.
college['RELAFFIL'] = college['RELAFFIL'].astype('int8')

In [49]:
# 'int' and 'float' resolve to the default (here 64-bit) types, so the int8
# RELAFFIL column is absent from the output below.
college.describe(include=['int', 'float']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0
UGDS_HISP,6874.0,0.161635,0.221854,0.0,0.0276,0.0714,0.198875,1.0


In [50]:
# 'number' matches every numeric dtype regardless of width, which is why the
# int8 RELAFFIL column is back in the output below.
college.describe(include=['number']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [51]:
# MENONLY has missing values (the int8 cast attempted earlier fails), so it is
# shrunk to a 16-bit float instead; RELAFFIL has none and can be int8.
college['MENONLY'] = college['MENONLY'].astype('float16')
college['RELAFFIL'] = college['RELAFFIL'].astype('int8')

In [54]:
# Replace the default index with an explicit 64-bit integer index and measure
# its memory. `pd.Int64Index` no longer exists in pandas 2.x, so the index is
# built with the generic `pd.Index` constructor. It is built from the
# underlying `.values` array so that pandas stores a real array of labels
# instead of handing back the lazy range-based index.
college.index = pd.Index(college.index.values, dtype='int64')
college.index.memory_usage()

60280

# Selecting the smallest of the largest

In [740]:
# Load the IMDB dataset and keep only the title, score and budget columns.
movie = pd.read_csv('data/imdb5000.csv')
movie2 = movie.loc[:, ['movie_title', 'imdb_score', 'budget']]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens,7.1,


In [741]:
# The 100 highest-scoring movies; show the first five of them.
movie2.nlargest(100, 'imdb_score').head()

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [742]:
# Among the 100 highest-scoring movies, the five with the smallest budgets.
(
    movie2
    .nlargest(n=100, columns='imdb_score')
    .nsmallest(n=5, columns='budget')
)

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


# Selecting the largest of each group by sorting

In [437]:
# Reload the IMDB dataset and keep only the title, release year and score.
movie = pd.read_csv('data/imdb5000.csv')
movie2 = movie.loc[:, ['movie_title', 'title_year', 'imdb_score']]

In [454]:
# Newest movies first; order within a year is not controlled here.
movie2.sort_values('title_year', ascending=False).head()

Unnamed: 0,movie_title,title_year,imdb_score
2366,Fight Valley,2016.0,5.0
3817,Yoga Hosers,2016.0,4.8
1367,The 5th Wave,2016.0,5.2
1742,The Boss,2016.0,5.3
519,The Secret Life of Pets,2016.0,6.8


In [458]:
# Newest year first and, within each year, highest score first.
movie3 = movie2.sort_values(by=['title_year', 'imdb_score'],
                            ascending=[False, False])
movie3.head()

Unnamed: 0,movie_title,title_year,imdb_score
4409,Kickboxer: Vengeance,2016.0,9.1
4372,A Beginner's Guide to Snuff,2016.0,8.7
3870,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2


In [460]:
# movie3 is ordered best-first within each year, so keeping the first row per
# year leaves the top-scoring movie of that year.
movie_top_year = movie3.drop_duplicates(subset=['title_year'], keep='first')
movie_top_year.head()

Unnamed: 0,movie_title,title_year,imdb_score
4409,Kickboxer: Vengeance,2016.0,9.1
3816,Running Forever,2015.0,8.6
4468,Queen of the Mountains,2014.0,8.7
4017,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5


In [471]:
# Find one movie per (year, content rating) pair.
# Year and rating are sorted descending and budget ascending, so the first row
# kept for each pair by drop_duplicates is the one with the smallest budget.
movie4 = movie[['movie_title', 'title_year', 'content_rating', 'budget']]
movie4_sorted = movie4.sort_values(['title_year', 'content_rating', 'budget'], 
                                   ascending=[False, False, True])
movie4_sorted.drop_duplicates(subset=['title_year', 'content_rating']).head(10)

Unnamed: 0,movie_title,title_year,content_rating,budget
4108,Compadres,2016.0,R,3000000.0
4772,Fight to the Finish,2016.0,PG-13,150000.0
4775,Rodeo Girl,2016.0,PG,500000.0
3309,The Wailing,2016.0,Not Rated,
4773,Alleluia! The Devil's Carnival,2016.0,,500000.0
4848,Bizarre,2015.0,Unrated,500000.0
821,The Ridiculous 6,2015.0,TV-14,
4956,The Gallows,2015.0,R,100000.0
4948,Romantic Schemer,2015.0,PG-13,125000.0
3868,R.L. Stine's Monsterville: The Cabinet of Souls,2015.0,PG,4400000.0


# Duplicating nlargest with sort_values

In [752]:
# Rebuild the "five cheapest of the 100 best-rated" result as a reference to
# compare against the sort_values approach.
movie = pd.read_csv('data/imdb5000.csv')
movie2 = movie.loc[:, ['movie_title', 'imdb_score', 'budget']]
movie_smallest_largest = (
    movie2
    .nlargest(n=100, columns='imdb_score')
    .nsmallest(n=5, columns='budget')
)
movie_smallest_largest

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [756]:
# sort_values counterpart of nlargest(100, 'imdb_score'): sort descending and
# keep the first 100 rows, then preview the first five.
movie2.sort_values('imdb_score', ascending=False).head(100).head()

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [759]:
# sort_values counterpart of the nlargest/nsmallest chain. The output below is
# not the same as the reference result: it includes 'A Charlie Brown Christmas'.
movie2.sort_values('imdb_score', ascending=False).head(100).sort_values('budget').head()

Unnamed: 0,movie_title,imdb_score,budget
4815,A Charlie Brown Christmas,8.4,150000.0
4801,Children of Heaven,8.5,180000.0
4804,Butterfly Girl,8.7,180000.0
4706,12 Angry Men,8.9,350000.0
4636,The Other Dream Team,8.4,500000.0


In [763]:
# Last five rows of the nlargest top 100; all of them score 8.4.
movie2.nlargest(100, 'imdb_score').tail()

Unnamed: 0,movie_title,imdb_score,budget
4023,Oldboy,8.4,3000000.0
4163,To Kill a Mockingbird,8.4,2000000.0
4395,Reservoir Dogs,8.4,1200000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [765]:
# Last five rows of the sort_values top 100. They also score 8.4 but are mostly
# different movies from the nlargest tail, i.e. the two approaches keep
# different rows among the movies tied at the cut-off score.
movie2.sort_values('imdb_score', ascending=False).head(100).tail()

Unnamed: 0,movie_title,imdb_score,budget
3799,Anne of Green Gables,8.4,
3777,Requiem for a Dream,8.4,4500000.0
3935,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4636,The Other Dream Team,8.4,500000.0
2455,Aliens,8.4,18500000.0


# Calculating a trailing stop order price

In [639]:
import pandas_datareader as pdr

In [671]:
# Download daily TSLA prices from the start of 2017 and preview eight rows.
# NOTE(review): the 'google' data source is believed to have been retired in
# later pandas-datareader releases - confirm it still works or pick another
# supported source before re-running.
tsla = pdr.DataReader('tsla', data_source='google',start='2017-1-1')
tsla.head(8)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-03,214.86,220.33,210.96,216.99,5923254
2017-01-04,214.75,228.0,214.31,226.99,11213471
2017-01-05,226.42,227.48,221.95,226.75,5911695
2017-01-06,226.93,230.31,225.45,229.01,5527893
2017-01-09,228.97,231.92,228.0,231.28,3979484
2017-01-10,232.0,232.0,226.89,229.87,3659955
2017-01-11,229.07,229.98,226.68,229.73,3650825
2017-01-12,229.06,230.7,225.58,229.59,3790229


In [668]:
# Only the closing price is needed for the stop calculation.
tsla_close = tsla['Close']

In [672]:
# Running maximum of the close: the highest closing price seen up to each date.
tsla_cummax = tsla_close.cummax()
tsla_cummax.head(8)

Date
2017-01-03    216.99
2017-01-04    226.99
2017-01-05    226.99
2017-01-06    229.01
2017-01-09    231.28
2017-01-10    231.28
2017-01-11    231.28
2017-01-12    231.28
Name: Close, dtype: float64

In [674]:
# The stop price trails the running maximum close by 10%.
tsla_trailing_stop = tsla_cummax.mul(.9)
tsla_trailing_stop.head(n=8)

Date
2017-01-03    195.291
2017-01-04    204.291
2017-01-05    204.291
2017-01-06    206.109
2017-01-09    208.152
2017-01-10    208.152
2017-01-11    208.152
2017-01-12    208.152
Name: Close, dtype: float64

## There's more...

In [678]:
def set_trailing_loss(symbol, purchase_date, perc, data_source='google'):
    """Return the trailing stop price for every trading day since purchase.

    The stop price on a given day is ``perc`` times the highest closing price
    observed from ``purchase_date`` up to and including that day.

    Parameters
    ----------
    symbol : str
        Ticker symbol forwarded to ``pdr.DataReader``.
    purchase_date : str or datetime-like
        First date to request; forwarded as the ``start`` argument.
    perc : float
        Fraction of the running maximum close to use as the stop, e.g. ``.9``
        places the stop 10% below the highest close seen so far.
    data_source : str, default 'google'
        Name of the pandas-datareader source to query. The default keeps the
        original behaviour.
        NOTE(review): 'google' is believed to be unavailable in later
        pandas-datareader releases - pass another supported source if so.

    Returns
    -------
    pandas.Series
        Trailing stop prices indexed like the downloaded 'Close' column, in
        ascending index order.
    """
    close = pdr.DataReader(symbol, data_source, start=purchase_date)['Close']
    # cummax depends on row order, so make sure prices run oldest to newest
    # even if a source returns them newest-first.
    return close.sort_index().cummax() * perc

In [679]:
# Trailing stop for MSFT bought on 2017-06-01, set at 85% of the running
# maximum close.
msft_trailing_stop = set_trailing_loss(symbol='msft',
                                       purchase_date='2017-6-1',
                                       perc=.85)
msft_trailing_stop.head()

Date
2017-06-01    59.585
2017-06-02    60.996
2017-06-05    61.438
2017-06-06    61.642
2017-06-07    61.642
Name: Close, dtype: float64