In [1]:
import pandas as pd

In [2]:
import numpy as np

### Question: When reading from a file, how to read in only a subset of the columns?

In [3]:
# Load the full UFO reports CSV into a DataFrame, then inspect `ufo.columns`
# to see which column names are available for selection in the next cells.

ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [4]:
# Pass a list of column names to `usecols` so that only those columns are
# read; `ufo` is rebound to a frame containing just 'City' and 'State'.

ufo = pd.read_csv('http://bit.ly/uforeports', usecols=['City', 'State'])

In [5]:
# `usecols` also accepts integer positions instead of names.
# NOTE: positions 0 and 4 select 'City' and 'Time' (see the output below), so
# this is NOT the same subset as the previous cell; 'State' is at position 3.

ufo = pd.read_csv('http://bit.ly/uforeports', usecols=[0, 4])
ufo.columns

Index(['City', 'Time'], dtype='object')

### Question: When reading from a file, how to read in only a subset of the rows?

In [6]:
# `nrows` limits how many data rows are read from the top of the file.
# `ufo` is rebound to this 3-row frame, which the iteration examples below use.

ufo = pd.read_csv('http://bit.ly/uforeports', nrows=3)
ufo

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


### Question: How to iterate through a Series?

In [7]:
# Looping over a Series yields its values one at a time, just like a list.
# The column is selected with bracket notation, which works for any column name.

for c in ufo['City']:
    print(c)

Ithaca
Willingboro
Holyoke


### Question: How to iterate through a DataFrame?

In [8]:
# iterrows() is one of several ways to walk a DataFrame row by row
# (itertuples() is another, and is generally the faster choice).
# Each step yields a pair: the row's index label and the row as a Series.

for index, row in ufo.iterrows():
    print(index, row['City'], row['State'])

0 Ithaca NY
1 Willingboro NJ
2 Holyoke CO


### Question: How to drop all non-numeric columns from a DataFrame?

In [9]:
# Load the per-country alcohol consumption CSV into `drinks` and show each
# column's dtype, to see which columns are numeric and which are not.

drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [10]:
# Keep only the numeric columns.
# select_dtypes() filters columns by dtype; np.number (NumPy is imported as
# `np` at the top of the notebook) matches both the int64 and float64 columns,
# so the two object columns are dropped from the result.

drinks.select_dtypes(include=[np.number]).dtypes

beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
dtype: object

### Question: How to know whether to pass an argument as a string or a list?

In [11]:
# Summarize the numeric columns of `drinks`.
# With no arguments, describe() reports on the numeric columns only, which is
# why 'country' and 'continent' are absent from the output.

drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [12]:
# Passing the string 'all' to `include` makes describe() summarize every
# column regardless of dtype; statistics that do not apply to a column
# (e.g. the mean of 'country') are shown as missing.

drinks.describe(include='all')

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
count,193,193.0,193.0,193.0,193.0,193
unique,193,,,,,6
top,Afghanistan,,,,,Africa
freq,1,,,,,53
mean,,106.160622,80.994819,49.450777,4.717098,
std,,101.143103,88.284312,79.697598,3.773298,
min,,0.0,0.0,0.0,0.0,
25%,,20.0,4.0,1.0,1.3,
50%,,76.0,56.0,8.0,4.2,
75%,,188.0,128.0,59.0,7.2,


In [13]:
# `include` also accepts a list of dtypes: only columns whose dtype is in the
# list are described (here the two object columns and the one float64 column).

drinks.describe(include=['object', 'float64'])

Unnamed: 0,country,total_litres_of_pure_alcohol,continent
count,193,193.0,193
unique,193,,6
top,Afghanistan,,Africa
freq,1,,53
mean,,4.717098,
std,,3.773298,
min,,0.0,
25%,,1.3,
50%,,4.2,
75%,,7.2,


In [14]:
# A single dtype is still passed inside a list here, keeping the call
# consistent with the multi-dtype form used in the previous cell.

drinks.describe(include=['object'])

Unnamed: 0,country,continent
count,193,193
unique,193,6
top,Afghanistan,Africa
freq,1,53


### Question: What is the difference between `ufo.isnull()` and `pd.isnull(ufo)`?

In [15]:
# Reload the complete UFO dataset (earlier cells rebound `ufo` to column and
# row subsets) and preview its first five rows.

ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [16]:
# Call isnull() as a top-level pandas function, passing the DataFrame in;
# the result is a same-shaped frame of booleans, True where a value is missing.

pd.isnull(ufo).head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False


In [17]:
# The same check written as a DataFrame method; as the matching output
# shows, it gives the same result as pd.isnull(ufo).

ufo.isnull().head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False


### Question: Why are DataFrame slices inclusive when using `.loc`, but exclusive when using `.iloc`?

In [18]:
# .loc slices by index label and includes BOTH endpoints, so labels 0
# through 4 give five rows.

ufo.loc[0:4, :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [19]:
# .iloc slices by integer position: the start is included but the stop is
# not, so positions 0 through 3 give four rows.

ufo.iloc[0:4, :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00


In [20]:
# .iloc mirrors NumPy: slicing the underlying array with the same bounds
# also stops before row 4...

ufo.values[0:4, :]

array([['Ithaca', nan, 'TRIANGLE', 'NY', '6/1/1930 22:00'],
       ['Willingboro', nan, 'OTHER', 'NJ', '6/30/1930 20:00'],
       ['Holyoke', nan, 'OVAL', 'CO', '2/15/1931 14:00'],
       ['Abilene', nan, 'DISK', 'KS', '6/1/1931 13:00']], dtype=object)

In [21]:
# ...and NumPy in turn mirrors plain Python slicing, where the stop index is
# excluded: only characters 0 through 3 are returned.

'python'[0:4]

'pyth'

In [22]:
# .loc includes the stop label because, unlike with positions, you don't
# necessarily know which label comes after the last one you want.
# The same rule applies to the column slice, so 'State' itself is included.

ufo.loc[0:4, 'City':'State']

Unnamed: 0,City,Colors Reported,Shape Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO
3,Abilene,,DISK,KS
4,New York Worlds Fair,,LIGHT,NY


### Question: How to randomly sample rows from a DataFrame?

In [23]:
# Draw 3 rows at random, without replacement (sample() is new in pandas 0.16.1).
# No seed is given, so a different set of rows may be returned on every run.

ufo.sample(n=3)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
15094,Bolivar,RED,LIGHT,TN,11/14/1999 2:06
13305,Charlotte,,OTHER,NC,4/10/1999 19:30
7002,Indianapolis,,CIRCLE,IN,4/13/1993 22:30


In [24]:
# Fixing `random_state` seeds the sampler, so the same 3 rows are drawn each
# time this cell runs.

ufo.sample(n=3, random_state=42)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
217,Norridgewock,,DISK,ME,9/15/1952 14:00
12282,Ipava,,TRIANGLE,IL,10/1/1998 21:15
17933,Ellinwood,,FIREBALL,KS,11/13/2000 22:00


In [25]:
# Draw 75% of the rows at random, without replacement, to use as a training
# set; `random_state` keeps the split reproducible across runs.

train = ufo.sample(frac=0.75, random_state=99)

In [27]:
# Display the sampled training rows (note the shuffled index labels)
train

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
6250,Sunnyvale,,OTHER,CA,12/16/1989 0:00
8656,Corpus Christi,,,TX,9/13/1995 0:10
2729,Mentor,,DISK,OH,8/8/1974 10:00
7348,Wilson,,LIGHT,WI,6/1/1994 1:00
12637,Lowell,,CIRCLE,MA,11/26/1998 10:00
...,...,...,...,...,...
8965,Lynnwood,,,WA,12/6/1995 22:45
4991,Kent,,,WA,12/5/1983 5:00
2740,Niagara Falls,,TRIANGLE,NY,8/15/1974 20:00
11887,Vancouver,,TRIANGLE,WA,7/25/1998 21:00


In [26]:
# The held-out 25% is whatever is left of `ufo` once every row whose index
# label was drawn into `train` has been dropped.

test = ufo.drop(train.index)

In [28]:
# Display the held-out rows; they keep their original index order
test

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
11,Waterloo,,FIREBALL,AL,6/1/1939 20:00
13,Keokuk,,OVAL,IA,7/7/1939 2:00
...,...,...,...,...,...
18227,San Francisco,,TRIANGLE,CA,12/30/2000 22:00
18233,Anchorage,RED,VARIOUS,AK,12/31/2000 21:00
18234,Capitola,,TRIANGLE,CA,12/31/2000 22:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
