In [3]:
import pandas as pd

# When reading from a file, how to read only a subset of the columns?

In [4]:
# read the UFO reports file into a DataFrame (no column filter, so every column is loaded)
ufo = pd.read_csv("data/ufo.csv")

In [5]:
# list the column labels of the DataFrame
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [6]:
# restrict the read to the named columns only
ufo = pd.read_csv(
    'data/ufo.csv',
    usecols=['City', 'State'],
)
ufo.columns

Index(['City', 'State'], dtype='object')

In [7]:
# preview the first rows; only the columns selected at read time are present
ufo.head()

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY


In [None]:
# or equivalently, specify columns by (zero-based) position:
# in the file's column order, 'City' is position 0 and 'State' is position 3
# (position 4 would select 'Time' instead)
ufo = pd.read_csv('data/ufo.csv', usecols=[0, 3])
ufo.columns

# When reading from a file, how to read only a subset of the rows?

In [2]:
# read just the first three data rows of the file
ufo = pd.read_csv(
    'data/ufo.csv',
    nrows=3,
)
ufo

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


# How to iterate through a Series/Column?

In [3]:
# a Series can be looped over directly, just like a list
for city in ufo['City']:
    print(city)

Ithaca
Willingboro
Holyoke


# How to iterate through a DataFrame?

In [None]:
# iterrows() is one of several ways to loop over a DataFrame;
# it yields an (index label, row as Series) pair for every row
for label, record in ufo.iterrows():
    print(label, record['City'], record['State'])

# How to know whether I should pass an argument as a string or a list?

In [8]:
# read the drinks-by-country dataset into a DataFrame
drinks = pd.read_csv("data/drinks.csv")

In [9]:
# preview the first five rows
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [10]:
# with no arguments, describe() summarizes only the numeric columns
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [11]:
# pass the string 'all' to describe every column, numeric and non-numeric alike
drinks.describe(include='all')

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
count,193,193.0,193.0,193.0,193.0,193
unique,193,,,,,6
top,Mozambique,,,,,Africa
freq,1,,,,,53
mean,,106.160622,80.994819,49.450777,4.717098,
std,,101.143103,88.284312,79.697598,3.773298,
min,,0.0,0.0,0.0,0.0,
25%,,20.0,4.0,1.0,1.3,
50%,,76.0,56.0,8.0,4.2,
75%,,188.0,128.0,59.0,7.2,


In [12]:
# pass a list of data types to describe only the columns of those types
drinks.describe(include=['object', 'float64'])

Unnamed: 0,country,total_litres_of_pure_alcohol,continent
count,193,193.0,193
unique,193,,6
top,Mozambique,,Africa
freq,1,,53
mean,,4.717098,
std,,3.773298,
min,,0.0,
25%,,1.3,
50%,,4.2,
75%,,7.2,


In [13]:
# a single data type is still passed inside a list
drinks.describe(include=['object'])

Unnamed: 0,country,continent
count,193,193
unique,193,6
top,Mozambique,Africa
freq,1,53


# How to use the "axis" parameter in pandas?

In [14]:
# look at the DataFrame again before experimenting with the axis parameter
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [15]:
# drop a column (axis=1); drop() returns a new DataFrame, so 'drinks' itself is not modified
drinks.drop('continent', axis=1).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,Afghanistan,0,0,0,0.0
1,Albania,89,132,54,4.9
2,Algeria,25,0,14,0.7
3,Andorra,245,138,312,12.4
4,Angola,217,57,45,5.9


In [16]:
# drop the row labeled 2 (axis=0); drop() returns a new DataFrame, so 'drinks' itself is not modified
drinks.drop(2, axis=0).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa
5,Antigua & Barbuda,102,128,45,4.9,North America


When **referring to rows or columns** with the axis parameter:

- **axis 0** refers to rows
- **axis 1** refers to columns

In [17]:
# calculate the mean of each numeric column
# (numeric_only=True skips the text columns 'country' and 'continent';
# pandas 2.0+ raises a TypeError if they are included)
drinks.mean(numeric_only=True)

beer_servings                   106.160622
spirit_servings                  80.994819
wine_servings                    49.450777
total_litres_of_pure_alcohol      4.717098
dtype: float64

In [18]:
# or equivalently, specify the axis explicitly
# (numeric_only=True skips the text columns, which pandas 2.0+ would otherwise reject)
drinks.mean(axis=0, numeric_only=True)

beer_servings                   106.160622
spirit_servings                  80.994819
wine_servings                    49.450777
total_litres_of_pure_alcohol      4.717098
dtype: float64

In [19]:
# calculate the mean of each row, averaging across its numeric columns only
# (numeric_only=True keeps the text columns out of the row-wise calculation)
drinks.mean(axis=1, numeric_only=True).head()

0      0.000
1     69.975
2      9.925
3    176.850
4     81.225
dtype: float64

When performing a **mathematical operation** with the axis parameter:

- **axis 0** means the operation should "move down" the row axis
- **axis 1** means the operation should "move across" the column axis

In [20]:
# 'index' is an alias for axis 0
# (numeric_only=True skips the text columns, which pandas 2.0+ would otherwise reject)
drinks.mean(axis='index', numeric_only=True)

beer_servings                   106.160622
spirit_servings                  80.994819
wine_servings                    49.450777
total_litres_of_pure_alcohol      4.717098
dtype: float64

In [21]:
# 'columns' is an alias for axis 1
# (numeric_only=True keeps the text columns out of the row-wise calculation)
drinks.mean(axis='columns', numeric_only=True).head()

0      0.000
1     69.975
2      9.925
3    176.850
4     81.225
dtype: float64

# How to use string methods in pandas?

In [None]:
# read a dataset of Chipotle orders into a DataFrame
# (read_table splits on tabs by default, which matches the .tsv file)
orders = pd.read_table('data/chipotle.tsv')
orders.head()

In [None]:
# in plain Python, a string method is called directly on the string object
'hello'.upper()

In [None]:
# on a pandas Series, string methods live under the 'str' accessor
orders['item_name'].str.upper().head()

In [None]:
# 'contains' tests every value for a substring and returns a boolean Series
orders['item_name'].str.contains('Chicken').head()

In [None]:
# keep only the rows where the boolean Series is True
orders.loc[orders['item_name'].str.contains('Chicken')].head()

# How to change the data type of a Series?

In [None]:
# preview the DataFrame whose column types are examined next
drinks.head()

In [None]:
# examine the data type of each column (Series) in the DataFrame
drinks.dtypes

In [None]:
# astype() returns a converted Series; here it is only displayed, not stored back
drinks.beer_servings.astype(float)

In [None]:
# overwrite the column with its float-converted version, then confirm the new dtype
drinks['beer_servings'] = drinks['beer_servings'].astype(float)
drinks.dtypes

In [5]:
# alternatively, set a column's data type at read time via a {column: dtype} mapping
drinks = pd.read_csv(
    'data/drinks.csv',
    dtype={'beer_servings': float},
)
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [None]:
# preview the orders DataFrame
orders.head()

In [None]:
# examine the data type of each column (Series) in the orders DataFrame
orders.dtypes

In [None]:
# string method 'contains' checks each value for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()

In [None]:
# cast the boolean Series to integers (False becomes 0, True becomes 1)
orders['item_name'].str.contains('Chicken').astype(int).head()

# How to use groupby in pandas?

In [6]:
# preview the DataFrame that the groupby examples operate on
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,0,0,0.0,Asia
1,Albania,89.0,132,54,4.9,Europe
2,Algeria,25.0,0,14,0.7,Africa
3,Andorra,245.0,138,312,12.4,Europe
4,Angola,217.0,57,45,5.9,Africa


In [7]:
# average beer servings over every country in the dataset
drinks['beer_servings'].mean()

106.16062176165804

In [8]:
# boolean Series: True for the rows whose continent is Africa
drinks['continent'] == 'Africa'

0      False
1      False
2       True
3      False
4       True
       ...  
188    False
189    False
190    False
191     True
192     True
Name: continent, Length: 193, dtype: bool

In [None]:
# beer servings for the African countries only
drinks.loc[drinks['continent'] == 'Africa', 'beer_servings']

In [11]:
# average beer servings restricted to the African countries
drinks.loc[drinks['continent'] == 'Africa', 'beer_servings'].mean()

61.471698113207545

In [12]:
# average beer servings within each continent group
drinks.groupby('continent')['beer_servings'].mean()

continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64

In [None]:
# groupby works with other aggregations too, e.g. the per-continent maximum
drinks.groupby('continent')['beer_servings'].max()

In [None]:
# agg() takes a list of aggregation names and applies all of them in one pass
drinks.groupby('continent')['beer_servings'].agg(['count', 'mean', 'min', 'max'])

In [None]:
# specifying a column to which the aggregation function should be applied is not required;
# numeric_only=True restricts the mean to the numeric columns, because the text
# column 'country' cannot be averaged (pandas 2.0+ raises a TypeError otherwise)
drinks.groupby('continent').mean(numeric_only=True)

In [None]:
# IPython magic: render matplotlib plots inline, directly below the cell that creates them
%matplotlib inline

In [None]:
# side-by-side bar plot of the per-continent means of the numeric columns
# (numeric_only=True leaves out the text column 'country', which cannot be averaged)
drinks.groupby('continent').mean(numeric_only=True).plot(kind='bar')