In [3]:
import pandas as pd
import numpy as np

In [5]:
geneva_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
geneva_pop.name = 'G7 Population in millions'
print(geneva_pop)

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64


In [6]:
geneva_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

In [7]:
geneva_pop.values

array([ 35.467,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

Indexing

In [8]:
geneva_pop.index

RangeIndex(start=0, stop=7, step=1)

In [9]:
geneva_pop.index = [
    'Canada',
    'France',
    'Germany', 
    'Italy',
    'Japan',
    'United Kingdom', 
    'United States',
]

In [10]:
geneva_pop[-1]

318.523

In [11]:
geneva_pop.iloc[3]

60.665

In [12]:
geneva_pop[['France', 'Italy', 'Canada']]

France    63.951
Italy     60.665
Canada    35.467
Name: G7 Population in millions, dtype: float64

In [13]:
geneva_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [14]:
certificates_earned = pd.DataFrame({
    'Certificates': [8, 2, 5, 6],
    'Time (in months)': [16, 5, 9, 12]
})

certificates_earned.index = ['Tom', 'Kris', 'Ahmad', 'Beau']

print(certificates_earned.iloc[2])

Certificates        5
Time (in months)    9
Name: Ahmad, dtype: int64


In [15]:
certificates_earned = pd.Series(
    [8, 2, 5, 6],
    index=['Tom', 'Kris', 'Ahmad', 'Beau']
)

print(certificates_earned[certificates_earned > 5])

Tom     8
Beau    6
dtype: int64


In [16]:
certificates_earned = pd.Series(
    [8, 2, 5, 6],
    index=['Tom', 'Kris', 'Ahmad', 'Beau']
)

print(certificates_earned)

Tom      8
Kris     2
Ahmad    5
Beau     6
dtype: int64


Conditional Selection(Boolean Arrays)

In [17]:
geneva_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [18]:
geneva_pop > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: G7 Population in millions, dtype: bool

In [19]:
geneva_pop.mean()

107.30257142857144

In [20]:
geneva_pop.max()

318.523

In [21]:
geneva_pop.sum()

751.118

In [22]:
geneva_pop[geneva_pop > geneva_pop.mean()]

Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

DataFrame

In [23]:
data_f = pd.DataFrame({
    'Population': [35.467, 63.951, 80.94 , 60.665, 127.061, 64.511, 318.523],
    'GDP': [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075
    ],
    'Surface Area': [
        9984670,
        640679,
        357114,
        301336,
        377930,
        242495,
        9525067
    ],
    'HDI': [
        0.913,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915
    ],
    'Continent': [
        'America',
        'Europe',
        'Europe',
        'Europe',
        'Asia',
        'Europe',
        'America'
    ]
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])

In [24]:
data_f

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [25]:
data_f.index = [
    'Canada', 
    'France', 
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

In [26]:
data_f

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [27]:
data_f.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [28]:
data_f.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [29]:
data_f['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [30]:
data_f.iloc[1:3, 3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [31]:
data_f.iloc[1:3, 0:3]

Unnamed: 0,Population,GDP,Surface Area
France,63.951,2833687,640679
Germany,80.94,3874437,357114


Modifying a Data Frame

Adding a new column

In [32]:
lang = pd.Series(
    ['English', 'French', 'German', 'Italian', 'Japanese', 'English', 'English'],
    index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States'], 
    name= 'Language'
)

In [33]:
lang

Canada             English
France              French
Germany             German
Italy              Italian
Japan             Japanese
United Kingdom     English
United States      English
Name: Language, dtype: object

In [34]:
data_f['Language'] = lang

In [35]:
data_f

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


Renaming Columns

In [36]:
data_f.rename(
    columns= {
        'HDI' : 'Human Development Index'
    },
    index={
        'United Kingdom' : 'UK',
        'United States' :'USA'
    }
        
)

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,Japanese
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [37]:
data_f[['GDP', 'Population']]

Unnamed: 0,GDP,Population
Canada,1785387,35.467
France,2833687,63.951
Germany,3874437,80.94
Italy,2167744,60.665
Japan,4602367,127.061
United Kingdom,2950039,64.511
United States,17348075,318.523


In [38]:
data_f['GDP']/ data_f['Population']

Canada            50339.385908
France            44310.284437
Germany           47868.013343
Italy             35733.025633
Japan             36221.712406
United Kingdom    45729.239975
United States     54464.120330
dtype: float64

In [39]:
data_f['GDP per Capital'] = data_f['GDP']/ data_f['Population']

In [40]:
data_f

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per Capital
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,French,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,German,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,Italian,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,Japanese,36221.712406
United Kingdom,64.511,2950039,242495,0.907,Europe,English,45729.239975
United States,318.523,17348075,9525067,0.915,America,English,54464.12033


Test: What will the following code print out?

In [68]:

certificates_earned = pd.DataFrame({
    'Certificates': [8, 2, 5, 6],
    'Time (in months)': [16, 5, 9, 12]
})
names = ['Tom', 'Kris', 'Ahmad', 'Beau']

certificates_earned.index = names
longest_streak = pd.Series([13, 11, 9, 7], index=names)
certificates_earned['Longest streak'] = longest_streak

print(certificates_earned) 

       Certificates  Time (in months)  Longest streak
Tom               8                16              13
Kris              2                 5              11
Ahmad             5                 9               9
Beau              6                12               7


In [42]:
x = pd.Series([1,2.3, np.nan,np.nan,40])

In [43]:
pd.notnull(x)

0     True
1     True
2    False
3    False
4     True
dtype: bool

In [46]:
pd.notnull(x).sum()

3

In [44]:
pd.isnull(x)

0    False
1    False
2     True
3     True
4    False
dtype: bool

In [47]:
pd.isnull(x).sum()

2

Dropping Null Values

In [48]:
x

0     1.0
1     2.3
2     NaN
3     NaN
4    40.0
dtype: float64

In [49]:
x.dropna()

0     1.0
1     2.3
4    40.0
dtype: float64

In [59]:
new_data = pd.Series({
    'Column A': [1, np.nan, 30, np.nan],
    'Column B': [2, 8, 31, np.nan],
    'Column C': [np.nan, 9, 32, 100],
    'Column D': [5, 8, 34, 110],
})

In [65]:
new_data

Column A    [1, nan, 30, nan]
Column B      [2, 8, 31, nan]
Column C    [nan, 9, 32, 100]
Column D      [5, 8, 34, 110]
dtype: object

In [66]:
new_data.shape

(4,)

In [67]:
new_data.info()

<class 'pandas.core.series.Series'>
Index: 4 entries, Column A to Column D
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
4 non-null      object
dtypes: object(1)
memory usage: 236.0+ bytes
