# Pandas General Review

In [1]:
import pandas as pd
import numpy as np

In [2]:
# How to initialize a data frame locally (99.99% of the time we load from a csv, excel, db or api)
df = pd.DataFrame({
    'Population': [35.467, 63.951, 80.94 , 60.665, 127.061, 64.511, 318.523],
    'GDP': [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075
    ],
    'Surface Area': [
        9984670,
        640679,
        357114,
        301336,
        377930,
        242495,
        9525067
    ],
    'HDI': [
        0.913,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915
    ],
    'Continent': [
        'America',
        'Europe',
        'Europe',
        'Europe',
        'Asia',
        'Europe',
        'America'
    ]
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])

In [4]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [5]:
# DataFrames also have indexes. As you can see in the "table" above, 
# pandas has assigned a numeric, autoincremental index automatically to each "row" in our DataFrame. 
# In our case, we know that each row represents a country, so we'll just reassign the index:
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

In [6]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [7]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [12]:
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 636.0+ bytes


In [13]:
df.shape

(7, 5)

In [14]:
df.value_counts()

Population  GDP       Surface Area  HDI    Continent
35.467      1785387   9984670       0.913  America      1
60.665      2167744   301336        0.873  Europe       1
63.951      2833687   640679        0.888  Europe       1
64.511      2950039   242495        0.907  Europe       1
80.940      3874437   357114        0.916  Europe       1
127.061     4602367   377930        0.891  Asia         1
318.523     17348075  9525067       0.915  America      1
dtype: int64

## Indexing, Selection and Slicing

In [15]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [22]:
# Return a row by index
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [23]:
# Return a row by numeric index
df.iloc[0]

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [28]:
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [29]:
# Maintain the DF look whilst selecting a column of data
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [30]:
# Non inclusive range, unlike numpy
df[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [31]:
# More indexing scenarios
df.loc['France':'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [32]:
df.loc['France':'Italy', 'Population':'Surface Area']

Unnamed: 0,Population,GDP,Surface Area
France,63.951,2833687,640679
Germany,80.94,3874437,357114
Italy,60.665,2167744,301336


In [33]:
# Selecting with int indexes
df.iloc[1].to_frame()

Unnamed: 0,France
Population,63.951
GDP,2833687
Surface Area,640679
HDI,0.888
Continent,Europe


In [34]:
# Selecting multiple indexes
df.iloc[[0, 1, -1]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United States,318.523,17348075,9525067,0.915,America


In [38]:
# Reverse the DataFrame's rows and columns
df.iloc[::-1, ::-1]

Unnamed: 0,Continent,HDI,Surface Area,GDP,Population
United States,America,0.915,9525067,17348075,318.523
United Kingdom,Europe,0.907,242495,2950039,64.511
Japan,Asia,0.891,377930,4602367,127.061
Italy,Europe,0.873,301336,2167744,60.665
Germany,Europe,0.916,357114,3874437,80.94
France,Europe,0.888,640679,2833687,63.951
Canada,America,0.913,9984670,1785387,35.467


## Conditional Selection (boolean arrays)

In [39]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [40]:
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [41]:
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [43]:
# Conditional Statements in which we only return 1 row
df.loc[df['Population'] > 70, 'Population'].to_frame()

Unnamed: 0,Population
Germany,80.94
Japan,127.061
United States,318.523


In [44]:
# More stuff with conditional statements
df.loc[df['Population'] > 70, ['Population', 'GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


## Data Drop / Removal

In [45]:
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [46]:
# Just showing that it is not a in place drop, you would have to reassingn the DataFrame in order to keep the changes.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [47]:
df.drop(['Canada', 'Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [48]:
# Dropping columns
df.drop(columns=['Population', 'HDI'])

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [49]:
# Specify the axis in case some col name = row name
df.drop(['Italy', 'Canada'], axis=0)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [50]:
df.drop(['Population', 'HDI'], axis=1)

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [51]:
df.drop(['Population', 'HDI'], axis=1)

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [52]:
# Specifying Columns also works
df.drop(['Population', 'HDI'], axis='columns')

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [53]:
df.drop(['Canada', 'Germany'], axis='rows')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


## DataFrame Operations

In [59]:
df.iloc[[0, 3, 4], 0:2]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
Italy,60.665,2167744
Japan,127.061,4602367


In [60]:
df.iloc[[0, 3, 4], 0:2] / 100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
Italy,0.60665,21677.44
Japan,1.27061,46023.67


In [61]:
df.iloc[[0, 3, 4], 0:2] * 10

Unnamed: 0,Population,GDP
Canada,354.67,17853870
Italy,606.65,21677440
Japan,1270.61,46023670


In [62]:
df.iloc[[0, 3, 4], 0:2] ** 2

Unnamed: 0,Population,GDP
Canada,1257.908089,3187606739769
Italy,3680.242225,4699114049536
Japan,16144.497721,21181782002689


In [63]:
df.iloc[[0, 3, 4], 0:2] ** (1/2)

Unnamed: 0,Population,GDP
Canada,5.955418,1336.183745
Italy,7.788774,1472.326051
Japan,11.272134,2145.312798


## Modifying DataFrames

In [79]:
# Adding new columns
langs = pd.Series(
    ['French', 'German', 'Italian'],
    index=['France', 'Germany', 'Italy'],
    name='Language'
)

In [74]:
langs.to_frame()

Unnamed: 0,Language
France,French
Germany,German
Italy,Italian


In [81]:
df['Language'] = langs
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,"(Canada, Language)"
Canada,35.467,1785387,9984670,0.913,America,,English
France,63.951,2833687,640679,0.888,Europe,French,English
Germany,80.94,3874437,357114,0.916,Europe,German,English
Italy,60.665,2167744,301336,0.873,Europe,Italian,English
Japan,127.061,4602367,377930,0.891,Asia,,English
United Kingdom,64.511,2950039,242495,0.907,Europe,,English
United States,318.523,17348075,9525067,0.915,America,,English


In [82]:
df.drop('Canada, Language')

KeyError: "['Canada, Language'] not found in axis"

### Replacing Values per Columns

In [77]:
df['Language'] = 'English'

In [78]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,"(Canada, Language)"
Canada,35.467,1785387,9984670,0.913,America,English,English
France,63.951,2833687,640679,0.888,Europe,English,English
Germany,80.94,3874437,357114,0.916,Europe,English,English
Italy,60.665,2167744,301336,0.873,Europe,English,English
Japan,127.061,4602367,377930,0.891,Asia,English,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English,English
United States,318.523,17348075,9525067,0.915,America,English,English


In [110]:
df['Language'] = 'English'
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


### Renaming Columns

In [111]:
df.rename(
    columns={
        'HDI': 'Human Development Index',
        'Anual Popcorn Consumption': 'APC'
    }, index={
        'United States': 'USA',
        'United Kingdom': 'UK',
        'Argentina': 'AR'
    })

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [113]:
df = df.rename(index=str.upper)
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
CANADA,35.467,1785387,9984670,0.913,America,English
FRANCE,63.951,2833687,640679,0.888,Europe,English
GERMANY,80.94,3874437,357114,0.916,Europe,English
ITALY,60.665,2167744,301336,0.873,Europe,English
JAPAN,127.061,4602367,377930,0.891,Asia,English
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,English
UNITED STATES,318.523,17348075,9525067,0.915,America,English


In [115]:
df = df.rename(index=lambda x: x.lower())
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
canada,35.467,1785387,9984670,0.913,America,English
france,63.951,2833687,640679,0.888,Europe,English
germany,80.94,3874437,357114,0.916,Europe,English
italy,60.665,2167744,301336,0.873,Europe,English
japan,127.061,4602367,377930,0.891,Asia,English
united kingdom,64.511,2950039,242495,0.907,Europe,English
united states,318.523,17348075,9525067,0.915,America,English


In [117]:
for i in df.index:
    i = i[0].upper() + i[1:]
    print(i)
df

Canada
France
Germany
Italy
Japan
United kingdom
United states


Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
canada,35.467,1785387,9984670,0.913,America,English
france,63.951,2833687,640679,0.888,Europe,English
germany,80.94,3874437,357114,0.916,Europe,English
italy,60.665,2167744,301336,0.873,Europe,English
japan,127.061,4602367,377930,0.891,Asia,English
united kingdom,64.511,2950039,242495,0.907,Europe,English
united states,318.523,17348075,9525067,0.915,America,English


In [121]:
# Dropping a column in place
df.drop('Language', inplace=True, axis=1)
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
canada,35.467,1785387,9984670,0.913,America
france,63.951,2833687,640679,0.888,Europe
germany,80.94,3874437,357114,0.916,Europe
italy,60.665,2167744,301336,0.873,Europe
japan,127.061,4602367,377930,0.891,Asia
united kingdom,64.511,2950039,242495,0.907,Europe
united states,318.523,17348075,9525067,0.915,America


### Adding Values

In [122]:
df.append(pd.Series({
    'Population': 3,
    'GDP': 5
}, name='China'))

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
canada,35.467,1785387.0,9984670.0,0.913,America
france,63.951,2833687.0,640679.0,0.888,Europe
germany,80.94,3874437.0,357114.0,0.916,Europe
italy,60.665,2167744.0,301336.0,0.873,Europe
japan,127.061,4602367.0,377930.0,0.891,Asia
united kingdom,64.511,2950039.0,242495.0,0.907,Europe
united states,318.523,17348075.0,9525067.0,0.915,America
China,3.0,5.0,,,


In [123]:
df.loc['China'] = pd.Series({'Population': 1_400_000_000, 'Continent': 'Asia'})

In [124]:
df.reset_index()

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,canada,35.467,1785387.0,9984670.0,0.913,America
1,france,63.951,2833687.0,640679.0,0.888,Europe
2,germany,80.94,3874437.0,357114.0,0.916,Europe
3,italy,60.665,2167744.0,301336.0,0.873,Europe
4,japan,127.061,4602367.0,377930.0,0.891,Asia
5,united kingdom,64.511,2950039.0,242495.0,0.907,Europe
6,united states,318.523,17348075.0,9525067.0,0.915,America
7,China,1400000000.0,,,,Asia


In [125]:
df.set_index('Population')

Unnamed: 0_level_0,GDP,Surface Area,HDI,Continent
Population,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35.467,1785387.0,9984670.0,0.913,America
63.951,2833687.0,640679.0,0.888,Europe
80.94,3874437.0,357114.0,0.916,Europe
60.665,2167744.0,301336.0,0.873,Europe
127.061,4602367.0,377930.0,0.891,Asia
64.511,2950039.0,242495.0,0.907,Europe
318.523,17348075.0,9525067.0,0.915,America
1400000000.0,,,,Asia


## Creating columns from other columns

In [126]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
canada,35.467,1785387.0
france,63.951,2833687.0
germany,80.94,3874437.0
italy,60.665,2167744.0
japan,127.061,4602367.0
united kingdom,64.511,2950039.0
united states,318.523,17348075.0
China,1400000000.0,


In [127]:
df['GDP'] / df['Population']

canada            50339.385908
france            44310.284437
germany           47868.013343
italy             35733.025633
japan             36221.712406
united kingdom    45729.239975
united states     54464.120330
China                      NaN
dtype: float64

In [128]:
df['GDP Per Capita'] = df['GDP'] / df['Population']

In [129]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP Per Capita
canada,35.467,1785387.0,9984670.0,0.913,America,50339.385908
france,63.951,2833687.0,640679.0,0.888,Europe,44310.284437
germany,80.94,3874437.0,357114.0,0.916,Europe,47868.013343
italy,60.665,2167744.0,301336.0,0.873,Europe,35733.025633
japan,127.061,4602367.0,377930.0,0.891,Asia,36221.712406
united kingdom,64.511,2950039.0,242495.0,0.907,Europe,45729.239975
united states,318.523,17348075.0,9525067.0,0.915,America,54464.12033
China,1400000000.0,,,,Asia,


## Stats

In [130]:
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP Per Capita
canada,35.467,1785387.0,9984670.0,0.913,America,50339.385908
france,63.951,2833687.0,640679.0,0.888,Europe,44310.284437
germany,80.94,3874437.0,357114.0,0.916,Europe,47868.013343
italy,60.665,2167744.0,301336.0,0.873,Europe,35733.025633
japan,127.061,4602367.0,377930.0,0.891,Asia,36221.712406


In [131]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI,GDP Per Capita
count,8.0,7.0,7.0,7.0,7.0
mean,175000100.0,5080248.0,3061327.0,0.900429,44952.254576
std,494974700.0,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,63.1295,2500716.0,329225.0,0.8895,40265.998421
50%,72.7255,2950039.0,377930.0,0.907,45729.239975
75%,174.9265,4238402.0,5082873.0,0.914,49103.699626
max,1400000000.0,17348080.0,9984670.0,0.916,54464.12033


In [132]:
population = df['Population']

In [135]:
population.min(), population.max()

(35.467, 1400000000.0)

In [136]:
population.sum()

1400000751.118

In [137]:
population.sum() / len(population)

175000093.88975

In [138]:
population.mean()

175000093.88975

In [139]:
population.std()

494974708.8934035

In [140]:
population.median()

72.7255

In [141]:
population.describe()

count    8.000000e+00
mean     1.750001e+08
std      4.949747e+08
min      3.546700e+01
25%      6.312950e+01
50%      7.272550e+01
75%      1.749265e+02
max      1.400000e+09
Name: Population, dtype: float64

In [142]:
population.quantile(.25)

63.1295

In [143]:
population.quantile([.2, .4, .6, .8, 1])

0.2    6.197940e+01
0.4    6.439900e+01
0.6    9.016420e+01
0.8    2.419382e+02
1.0    1.400000e+09
Name: Population, dtype: float64