In [1]:
# Load the comma-separated data from https://query.data.world/s/
# wsjbxdqhw6z6izgdxijv5p2lfqh7gx into a DataFrame ‘.read csv()‘

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

DATA = "https://query.data.world/s/wsjbxdqhw6z6izgdxijv5p2lfqh7gx"
df = pd.read_csv(DATA)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Inspect the DataFrame using .info() and with .info(memory\_usage="deep").
# What is the difference between the two calls? How much space does the 
# DataFrame require in memory?

df.info()
df.info(memory_usage="deep")

# second info shows much larger memory usage 
# default: memory estimation based in column dtype and number of 
    # rows assuming values consume the same memory amount for 
    # corresponding dtypes. 
# deep: real memory usage calculation is performed, 
    # cost of computational resources
# Memory space required: 861.6 MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 211.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 861.6 MB


In [3]:
# Create a copy of the object with only columns of type object by using
# .select\_dtypes (include=['object']).

df_obj = df.select_dtypes(include=['object'])
df_obj.head()

Unnamed: 0,day_of_week,v_name,v_league,h_name,h_league,day_night,completion,forefeit,protest,park_id,...,h_player_6_id,h_player_6_name,h_player_7_id,h_player_7_name,h_player_8_id,h_player_8_name,h_player_9_id,h_player_9_name,additional_info,acquisition_info
0,Thu,CL1,na,FW1,na,D,,,,FOR01,...,caret101,Tom Carey,mince101,Ed Mincher,mcdej101,James McDermott,kellb105,Bill Kelly,,Y
1,Fri,BS1,na,WS3,na,D,,,,WAS01,...,leona101,Andy Leonard,braia102,Asa Brainard,burrh101,Henry Burroughs,berth101,Henry Berthrong,HTBF,Y
2,Sat,CL1,na,RC1,na,D,,,,RCK01,...,ansoc101,Cap Anson,sagep101,Pony Sager,birdg101,George Bird,stirg101,Gat Stires,,Y
3,Mon,CL1,na,CH1,na,D,,,,CHI01,...,folet101,Tom Foley,duffe101,Ed Duffy,pinke101,Ed Pinkham,zettg101,George Zettlein,,Y
4,Tue,BS1,na,TRO,na,D,,,,TRO01,...,beave101,Edward Beavens,bells101,Steve Bellan,pikel101,Lip Pike,cravb101,Bill Craver,HTBF,Y


In [5]:
# Look at the summary of this object new (using .describe()). Which 
# columns have very few unique values compared to the number of 
# observations?

df_obj.describe()

Unnamed: 0,day_of_week,v_name,v_league,h_name,h_league,day_night,completion,forefeit,protest,park_id,...,h_player_6_id,h_player_6_name,h_player_7_id,h_player_7_name,h_player_8_id,h_player_8_name,h_player_9_id,h_player_9_name,additional_info,acquisition_info
count,171907,171907,171907,171907,171907,140150,116,145,180,171907,...,140838,140838,140838,140838,140838,140838,140838,140838,1456,140841
unique,7,148,7,148,7,2,116,3,5,245,...,4774,4720,5253,5197,4760,4710,5193,5142,332,1
top,Sat,CHN,NL,CHN,NL,D,"20090709,HOU03,10,10,64",H,V,STL07,...,grimc101,Charlie Grimm,grimc101,Charlie Grimm,lopea102,Al Lopez,spahw101,Warren Spahn,HTBF,Y
freq,28891,8870,88866,9024,88867,82724,1,69,90,7022,...,427,427,491,491,676,676,339,339,1112,140841


In [6]:
# Does it make sense to convert a column of type object to type category
# if more than 50% of the observations contain unique values? Why/Why not?

columns_few = []

for col in df_obj.columns:
    if df_obj[col].nunique()/df_obj[col].count() <= 0.05:
        columns_few.append(col)

columns_few

type(columns_few)

# Converting a column to type category only makes sense when the feature 
# takes on very few unique values, in that case it saves memory 
# significantly if >50% of values are unique, category is not unseful.

list

In [9]:
# Convert all columns of type object to type category where you deem this 
# appropriate.

# own criterion: if <10 unique values
for i in range(len(columns_few)):
    df_obj[columns_few[i]] = df_obj[columns_few[i]].astype('category')





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
# What is the final size in memory?

# memory usage: 97.3 MB -> significant reduction
df_obj.info(memory_usage="deep") 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Data columns (total 78 columns):
day_of_week                   171907 non-null category
v_name                        171907 non-null category
v_league                      171907 non-null category
h_name                        171907 non-null category
h_league                      171907 non-null category
day_night                     140150 non-null category
completion                    116 non-null object
forefeit                      145 non-null category
protest                       180 non-null category
park_id                       171907 non-null category
v_line_score                  147271 non-null object
h_line_score                  147271 non-null object
hp_umpire_id                  171888 non-null category
hp_umpire_name                171891 non-null category
1b_umpire_id                  147040 non-null category
1b_umpire_name                171891 non-null category
2b_umpire_id            

In [None]:
# Could above routine have speeded up somewhere? Hint: Look at the 
# documentation for .read csv().

# Speed up routine
# use dtype when reading the csv, change type to category

