### This is a short introduction to pandas, geared mainly for new users. You can see more complex recipes in the [Cookbook](https://pandas.pydata.org/pandas-docs/stable/cookbook.html#cookbook)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Object Creation
Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a [DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#pandas.DataFrame) by passing a numpy array, with a datetime index and labeled columns:

In [3]:
# generate a list of dates
dates = pd.date_range('2014/11/01', periods=6)
dates

DatetimeIndex(['2014-11-01', '2014-11-02', '2014-11-03', '2014-11-04',
               '2014-11-05', '2014-11-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
data = np.random.randn(6,4)
data

array([[-0.76602201, -0.31026075,  1.45882835,  0.71352444],
       [-1.63051985,  0.56236194,  1.68532347, -0.56927092],
       [-1.56979452, -1.56838081, -0.41097553,  0.36441315],
       [ 0.22512437,  1.32564327,  0.92009701, -0.54127501],
       [ 1.47159635,  0.59828774,  0.20404901, -0.33223705],
       [ 0.60753259,  0.6961236 , -0.16875811,  1.10665864]])

In [5]:
col_names = ['one','two','three','four']
col_names

['one', 'two', 'three', 'four']

In [77]:
df = pd.DataFrame(data)#, index=dates, columns=col_names)
df

Unnamed: 0,0,1,2,3
0,-0.766022,-0.310261,1.458828,0.713524
1,-1.63052,0.562362,1.685323,-0.569271
2,-1.569795,-1.568381,-0.410976,0.364413
3,0.225124,1.325643,0.920097,-0.541275
4,1.471596,0.598288,0.204049,-0.332237
5,0.607533,0.696124,-0.168758,1.106659


Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [7]:
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([1, 2, 3, 4],dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,1,test,foo
1,1.0,2013-01-02,1.0,2,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,4,train,foo


In [8]:
pd.Timestamp('20200123')

Timestamp('2020-01-23 00:00:00')

In [9]:
pd.Series(1,index=list(range(4)),dtype='float32')

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32

In [10]:
np.array([1, 2, 3, 4],dtype='int32')

array([1, 2, 3, 4])

In [11]:
pd.Categorical(["test","train","test","train"])

[test, train, test, train]
Categories (2, object): [test, train]

Having specific [dtypes](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-dtypes):

In [12]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


# Viewing Data
See the [Basics section](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics)
See the top & bottom rows of the frame

In [13]:
df = pd.read_csv('data/fifa18.csv', index_col=0)
df.head()

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B


In [14]:
df.head()

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B


In [15]:
df.tail(3)

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
JPN,2,Japan,3,1,1,1,4,4,0,4[a],H
SEN,3,Senegal,3,1,1,1,4,4,0,4[a],H
POL,4,Poland,3,1,0,2,2,5,−3,3,H


Display the index, columns, and the underlying numpy data

In [16]:
df.index

Index(['URG', 'RUS', 'KSA', 'EGY', 'ESP', 'POR', 'IRN', 'MOR', 'FRA', 'DEN',
       'PER', 'AUS', 'CRO', 'ARG', 'NIG', 'ICL', 'BRA', 'SWI', 'SRB', 'COR',
       'SWE', 'MEX', 'KOR', 'GER', 'BEL', 'ENG', 'TUN', 'PAN', 'COL', 'JPN',
       'SEN', 'POL'],
      dtype='object', name='Index')

In [17]:
df.columns

Index(['Position', 'Team', 'Played', 'Won', 'Drawn', 'Lost', 'Goals for',
       'Goals against', 'Goal difference', 'Points', 'Group'],
      dtype='object')

In [18]:
df.values #inorder to show numpy representation.....

array([[1, 'Uruguay', 3, 3, 0, 0, 5, 0, '5', '9', 'A'],
       [2, 'Russia(H)', 3, 2, 0, 1, 8, 4, '4', '6', 'A'],
       [3, 'Saudi Arabia', 3, 1, 0, 2, 2, 7, '−5', '3', 'A'],
       [4, 'Egypt', 3, 0, 0, 3, 2, 6, '−4', '0', 'A'],
       [1, 'Spain', 3, 1, 2, 0, 6, 5, '1', '5', 'B'],
       [2, 'Portugal', 3, 1, 2, 0, 5, 4, '1', '5', 'B'],
       [3, 'Iran', 3, 1, 1, 1, 2, 2, '0', '4', 'B'],
       [4, 'Morocco', 3, 0, 1, 2, 2, 4, '−2', '1', 'B'],
       [1, 'France', 3, 2, 1, 0, 3, 1, '2', '7', 'C'],
       [2, 'Denmark', 3, 1, 2, 0, 2, 1, '1', '5', 'C'],
       [3, 'Peru', 3, 1, 0, 2, 2, 2, '0', '3', 'C'],
       [4, 'Australia', 3, 0, 1, 2, 2, 5, '−3', '1', 'C'],
       [1, 'Croatia', 3, 3, 0, 0, 7, 1, '6', '9', 'D'],
       [2, 'Argentina', 3, 1, 1, 1, 3, 5, '−2', '4', 'D'],
       [3, 'Nigeria', 3, 1, 0, 2, 3, 4, '−1', '3', 'D'],
       [4, 'Iceland', 3, 0, 1, 2, 2, 5, '−3', '1', 'D'],
       [1, 'Brazil', 3, 2, 1, 0, 5, 1, '4', '7', 'E'],
       [2, 'Switzerland', 3, 1, 2, 0, 5, 

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, URG to POL
Data columns (total 11 columns):
Position           32 non-null int64
Team               32 non-null object
Played             32 non-null int64
Won                32 non-null int64
Drawn              32 non-null int64
Lost               32 non-null int64
Goals for          32 non-null int64
Goals against      32 non-null int64
Goal difference    32 non-null object
Points             32 non-null object
Group              32 non-null object
dtypes: int64(7), object(4)
memory usage: 4.2+ KB


In [76]:
df_new = df.copy()

df_new['Position'] = pd.Categorical(df.Won)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, URG to POL
Data columns (total 11 columns):
Position           32 non-null category
Team               32 non-null object
Played             32 non-null int64
Won                32 non-null int64
Drawn              32 non-null int64
Lost               32 non-null int64
Goals for          32 non-null int64
Goals against      32 non-null int64
Goal difference    32 non-null object
Points             32 non-null object
Group              32 non-null object
dtypes: category(1), int64(6), object(4)
memory usage: 3.0+ KB


Describe shows a quick statistic summary of your data

In [19]:
df.describe() #25,50,75 denotes quartile ranges

Unnamed: 0,Position,Played,Won,Drawn,Lost,Goals for,Goals against
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,2.5,3.0,1.21875,0.5625,1.21875,3.8125,3.8125
std,1.135924,0.0,0.87009,0.715609,0.941323,2.070336,2.249552
min,1.0,3.0,0.0,0.0,0.0,2.0,0.0
25%,1.75,3.0,1.0,0.0,0.0,2.0,2.0
50%,2.5,3.0,1.0,0.0,1.0,3.0,4.0
75%,3.25,3.0,2.0,1.0,2.0,5.0,5.0
max,4.0,3.0,3.0,2.0,3.0,9.0,11.0


Transposing your data

In [20]:
df.T

Index,URG,RUS,KSA,EGY,ESP,POR,IRN,MOR,FRA,DEN,...,KOR,GER,BEL,ENG,TUN,PAN,COL,JPN,SEN,POL
Position,1,2,3,4,1,2,3,4,1,2,...,3,4,1,2,3,4,1,2,3,4
Team,Uruguay,Russia(H),Saudi Arabia,Egypt,Spain,Portugal,Iran,Morocco,France,Denmark,...,South Korea,Germany,Belgium,England,Tunisia,Panama,Colombia,Japan,Senegal,Poland
Played,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
Won,3,2,1,0,1,1,1,0,2,1,...,1,1,3,2,1,0,2,1,1,1
Drawn,0,0,0,0,2,2,1,1,1,2,...,0,0,0,0,0,0,0,1,1,0
Lost,0,1,2,3,0,0,1,2,0,0,...,2,2,0,1,2,3,1,1,1,2
Goals for,5,8,2,2,6,5,2,2,3,2,...,3,2,9,8,5,2,5,4,4,2
Goals against,0,4,7,6,5,4,2,4,1,1,...,3,4,2,3,8,11,2,4,4,5
Goal difference,5,4,−5,−4,1,1,0,−2,2,1,...,0,−2,7,5,−3,−9,3,0,0,−3
Points,9,6,3,0,5,5,4,1,7,5,...,3,3,9,6,3,0,6,4[a],4[a],3


In [21]:
df.head()

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B


Sorting by an axis (In this case, sorting the columns in reverse alphabetical order):

In [22]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0_level_0,Won,Team,Position,Points,Played,Lost,Group,Goals for,Goals against,Goal difference,Drawn
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,3,Uruguay,1,9,3,0,A,5,0,5,0
RUS,2,Russia(H),2,6,3,1,A,8,4,4,0
KSA,1,Saudi Arabia,3,3,3,2,A,2,7,−5,0
EGY,0,Egypt,4,0,3,3,A,2,6,−4,0
ESP,1,Spain,1,5,3,0,B,6,5,1,2
POR,1,Portugal,2,5,3,0,B,5,4,1,2
IRN,1,Iran,3,4,3,1,B,2,2,0,1
MOR,0,Morocco,4,1,3,2,B,2,4,−2,1
FRA,2,France,1,7,3,0,C,3,1,2,1
DEN,1,Denmark,2,5,3,0,C,2,1,1,2


Sorting by values

In [23]:
df.sort_values(by='Position')

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
SWE,1,Sweden,3,2,0,1,5,2,3,6,F
BRA,1,Brazil,3,2,1,0,5,1,4,7,E
FRA,1,France,3,2,1,0,3,1,2,7,C
COL,1,Colombia,3,2,0,1,5,2,3,6,H
CRO,1,Croatia,3,3,0,0,7,1,6,9,D
BEL,1,Belgium,3,3,0,0,9,2,7,9,G
ESP,1,Spain,3,1,2,0,6,5,1,5,B
JPN,2,Japan,3,1,1,1,4,4,0,4[a],H
DEN,2,Denmark,3,1,2,0,2,1,1,5,C


# Selection

Note While standard Python / Numpy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods, .at, .iat, .loc, .iloc and .ix.
See the indexing documentation [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing) and [MultiIndex / Advanced Indexing](https://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced)

## Getting

Selecting a single column, which yields a Series, equivalent to df.one

In [24]:
df['Team']

Index
URG         Uruguay
RUS       Russia(H)
KSA    Saudi Arabia
EGY           Egypt
ESP           Spain
POR        Portugal
IRN            Iran
MOR         Morocco
FRA          France
DEN         Denmark
PER            Peru
AUS       Australia
CRO         Croatia
ARG       Argentina
NIG         Nigeria
ICL         Iceland
BRA          Brazil
SWI     Switzerland
SRB          Serbia
COR      Costa Rica
SWE          Sweden
MEX          Mexico
KOR     South Korea
GER         Germany
BEL         Belgium
ENG         England
TUN         Tunisia
PAN          Panama
COL        Colombia
JPN           Japan
SEN         Senegal
POL          Poland
Name: Team, dtype: object

In [25]:
df.Team

Index
URG         Uruguay
RUS       Russia(H)
KSA    Saudi Arabia
EGY           Egypt
ESP           Spain
POR        Portugal
IRN            Iran
MOR         Morocco
FRA          France
DEN         Denmark
PER            Peru
AUS       Australia
CRO         Croatia
ARG       Argentina
NIG         Nigeria
ICL         Iceland
BRA          Brazil
SWI     Switzerland
SRB          Serbia
COR      Costa Rica
SWE          Sweden
MEX          Mexico
KOR     South Korea
GER         Germany
BEL         Belgium
ENG         England
TUN         Tunisia
PAN          Panama
COL        Colombia
JPN           Japan
SEN         Senegal
POL          Poland
Name: Team, dtype: object

Selecting via [], which slices the rows.

In [26]:
df[0:5]

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B


In [27]:
df['URG':'EGY']

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A


## Selecting by Label

See more in[ Selection by Label](https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label)

For getting a cross section using a label

In [28]:
df.head()

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B


In [29]:
df.loc['ENG']

Position                 2
Team               England
Played                   3
Won                      2
Drawn                    0
Lost                     1
Goals for                8
Goals against            3
Goal difference          5
Points                   6
Group                    G
Name: ENG, dtype: object

Selecting on a multi-axis by label

In [30]:
df.loc[:,['Team','Won']]

Unnamed: 0_level_0,Team,Won
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
URG,Uruguay,3
RUS,Russia(H),2
KSA,Saudi Arabia,1
EGY,Egypt,0
ESP,Spain,1
POR,Portugal,1
IRN,Iran,1
MOR,Morocco,0
FRA,France,2
DEN,Denmark,1


Showing label slicing, both endpoints are included

In [31]:
df.loc['URG':'EGY',['Team','Won']]

Unnamed: 0_level_0,Team,Won
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
URG,Uruguay,3
RUS,Russia(H),2
KSA,Saudi Arabia,1
EGY,Egypt,0


In [32]:
type(df.loc['URG':'EGY',['Team','Won']])

pandas.core.frame.DataFrame

Reduction in the dimensions of the returned object

In [33]:
df.loc['ENG',['Team','Won']]

Team    England
Won           2
Name: ENG, dtype: object

In [34]:
type(df.loc['ENG',['Team','Won']])

pandas.core.series.Series

For getting a scalar value

In [35]:
df.loc['ENG','Won']

2

For getting fast access to a scalar (equiv to the prior method)

In [36]:
df.at['ENG','Won']

2

In [37]:
# .at vs .loc

df_copy = df.copy()

df_copy.head(3)

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A


In [38]:
df_copy.Won.dtype

dtype('int64')

In [39]:
df_copy.at['URG', 'Won'] = 3.5
df_copy.at['URG', 'Won']

3

In [40]:
df_copy.Won.dtype

dtype('int64')

In [41]:
df_copy.loc[0, 'Won'] = 4.5
df_copy.loc[0, 'Won']

4.5

In [42]:
df_copy.Won.dtype

dtype('float64')

In [43]:
df_copy.loc['URG':'EGY', 'Team']

Index
URG         Uruguay
RUS       Russia(H)
KSA    Saudi Arabia
EGY           Egypt
Name: Team, dtype: object

In [44]:
df_copy.at['URG':'EGY', 'Team']

TypeError: 'slice('URG', 'EGY', None)' is an invalid key

df.at() doesn't have to worry about type conversion and hence is faster. Similar to why C++ is faster than Python.

## Selection by Position

See more in [Selection by Position](Selection by Position)

Select via the position of the passed integers

In [45]:
df.head()

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B


In [46]:
df.iloc[3]

Position               4
Team               Egypt
Played                 3
Won                    0
Drawn                  0
Lost                   3
Goals for              2
Goals against          6
Goal difference       −4
Points                 0
Group                  A
Name: EGY, dtype: object

By integer slices, acting similar to numpy/python

By lists of integer position locations, similar to the numpy/python style

In [47]:
df.iloc[[0,2,4],[0,1, 5]]

Unnamed: 0_level_0,Position,Team,Lost
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
URG,1,Uruguay,0
KSA,3,Saudi Arabia,2
ESP,1,Spain,0


For slicing rows explicitly

In [48]:
df.iloc[1:3,:]

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A


For slicing columns explicitly

In [49]:
df.iloc[:,1:3]

Unnamed: 0_level_0,Team,Played
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
URG,Uruguay,3
RUS,Russia(H),3
KSA,Saudi Arabia,3
EGY,Egypt,3
ESP,Spain,3
POR,Portugal,3
IRN,Iran,3
MOR,Morocco,3
FRA,France,3
DEN,Denmark,3


For getting a value explicitly

In [50]:
df.iloc[1,1]

'Russia(H)'

For getting fast access to a scalar (equiv to the prior method)

In [51]:
df.iat[0,0]

1

## Boolean Indexing
Using a single column’s values to select data.

In [52]:
df[df.Won >= 2]

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
FRA,1,France,3,2,1,0,3,1,2,7,C
CRO,1,Croatia,3,3,0,0,7,1,6,9,D
BRA,1,Brazil,3,2,1,0,5,1,4,7,E
SWE,1,Sweden,3,2,0,1,5,2,3,6,F
MEX,2,Mexico,3,2,0,1,3,4,−1,6,F
BEL,1,Belgium,3,3,0,0,9,2,7,9,G
ENG,2,England,3,2,0,1,8,3,5,6,G
COL,1,Colombia,3,2,0,1,5,2,3,6,H


Selecting values from a DataFrame where a boolean condition is met.

Using the isin method for filtering:

In [53]:
df[df['Team'].isin(['England','Germany'])]

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GER,4,Germany,3,1,0,2,2,4,−2,3,F
ENG,2,England,3,2,0,1,8,3,5,6,G


> **Warning**

# Getting Data In/Out
### CSV
[Writing to a csv file](https://pandas.pydata.org/pandas-docs/stable/io.html#io-store-in-csv).

In [62]:
df.to_csv('data/fifa.csv')

In [78]:
import pandas as pd

In [79]:
df = pd.read_csv('data/fifa.csv')

In [80]:
df.head()

Unnamed: 0,Index,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
0,URG,1,Uruguay,3,3,0,0,5,0,5,9,A
1,RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
2,KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
3,EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
4,ESP,1,Spain,3,1,2,0,6,5,1,5,B


In [81]:
df['Team']

0          Uruguay
1        Russia(H)
2     Saudi Arabia
3            Egypt
4            Spain
5         Portugal
6             Iran
7          Morocco
8           France
9          Denmark
10            Peru
11       Australia
12         Croatia
13       Argentina
14         Nigeria
15         Iceland
16          Brazil
17     Switzerland
18          Serbia
19      Costa Rica
20          Sweden
21          Mexico
22     South Korea
23         Germany
24         Belgium
25         England
26         Tunisia
27          Panama
28        Colombia
29           Japan
30         Senegal
31          Poland
Name: Team, dtype: object

In [64]:
pd.read_csv('data/fifa.csv', index_col=0)

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B
POR,2,Portugal,3,1,2,0,5,4,1,5,B
IRN,3,Iran,3,1,1,1,2,2,0,4,B
MOR,4,Morocco,3,0,1,2,2,4,−2,1,B
FRA,1,France,3,2,1,0,3,1,2,7,C
DEN,2,Denmark,3,1,2,0,2,1,1,5,C


Using other df.read_csv() arguments

### HDF5
Reading and writing to [HDFStores](https://pandas.pydata.org/pandas-docs/stable/io.html#io-hdf5).

Writing to a HDF5 Store:

In [57]:
# !conda install -c conda-forge pytables

In [65]:
df.to_hdf('data/fifa.h5','df')

Reading from a HDF5 Store:

In [66]:
pd.read_hdf('data/fifa.h5','df')

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B
POR,2,Portugal,3,1,2,0,5,4,1,5,B
IRN,3,Iran,3,1,1,1,2,2,0,4,B
MOR,4,Morocco,3,0,1,2,2,4,−2,1,B
FRA,1,France,3,2,1,0,3,1,2,7,C
DEN,2,Denmark,3,1,2,0,2,1,1,5,C


### Excel
Reading and writing to [MS Excel](https://pandas.pydata.org/pandas-docs/stable/io.html#io-excel).

Writing to an excel file:

In [68]:
df.to_excel('data/fifa.xlsx', sheet_name='Group Stage')

Reading from an excel file:

In [70]:
# !conda install xlrd

In [72]:
pd.read_excel('data/fifa.xlsx', 'Group Stage', index_col=0, na_values=['NA'])

Unnamed: 0_level_0,Position,Team,Played,Won,Drawn,Lost,Goals for,Goals against,Goal difference,Points,Group
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
URG,1,Uruguay,3,3,0,0,5,0,5,9,A
RUS,2,Russia(H),3,2,0,1,8,4,4,6,A
KSA,3,Saudi Arabia,3,1,0,2,2,7,−5,3,A
EGY,4,Egypt,3,0,0,3,2,6,−4,0,A
ESP,1,Spain,3,1,2,0,6,5,1,5,B
POR,2,Portugal,3,1,2,0,5,4,1,5,B
IRN,3,Iran,3,1,1,1,2,2,0,4,B
MOR,4,Morocco,3,0,1,2,2,4,−2,1,B
FRA,1,France,3,2,1,0,3,1,2,7,C
DEN,2,Denmark,3,1,2,0,2,1,1,5,C
