In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns

In [3]:
# Download the gapminder data set and keep a local CSV copy.
url = "https://www.alessandrobramucci.com/gapminder.csv"

df = pd.read_csv(url)

# Create the output folder first so the export also works on a fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)

# index=False: the default RangeIndex carries no information and would
# otherwise be written as an extra, unnamed first column.
df.to_csv("data/gapminder.csv", sep=",", decimal=".", index=False)

In [6]:
# Show the dtype of every column.
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [7]:
# Number of missing values in each column.
df.isna().sum()

country      0
continent    0
year         0
lifeExp      0
pop          0
gdpPercap    0
dtype: int64

In [8]:
# Concise summary: index range, non-null count and dtype per column, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [9]:
# Dimensions of the frame as (rows, columns).
df.shape

(1704, 6)

In [13]:
# All distinct country names, in order of first appearance
country_list = pd.unique(df['country'])
len(country_list)

142

In [12]:
# How many distinct countries are in the data?
country_count = df['country'].nunique()
country_count

142

In [14]:
# How many (non-null) year entries does each country have?
years_count = df['year'].groupby(df['country']).count()
years_count

country
Afghanistan           12
Albania               12
Algeria               12
Angola                12
Argentina             12
                      ..
Vietnam               12
West Bank and Gaza    12
Yemen, Rep.           12
Zambia                12
Zimbabwe              12
Name: year, Length: 142, dtype: int64

In [15]:
# Number of countries that have exactly 12 year entries.
# Series.sum() is vectorized; the builtin sum() would loop over the Series
# element by element in Python.
(years_count == 12).sum()

142

In [16]:
# Expected row count of a complete panel: distinct countries x distinct years.
# Derived from the data instead of hard-coding numbers copied from earlier outputs.
country_count * df['year'].nunique()

1704

In [19]:
# How many countries per continent (2007 rows only)
countries_bycont_2007 = (
    df.loc[df['year'].eq(2007)]
    .groupby('continent')['country']
    .count()
)
countries_bycont_2007

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

In [24]:
# Rows for Germany only, restricted to the numeric columns and re-indexed from 0
df_germany = (
    df.loc[df['country'].eq('Germany')]
    .select_dtypes(include='number')
    .reset_index(drop=True)
)

In [25]:
# Export the Germany subset. index=False: the index was just reset to a plain
# 0..n-1 range and would otherwise end up as an unnamed first column in the file.
df_germany.to_csv('data/germany_gapminder.csv', sep=',', decimal='.', index=False)

In [28]:
# Mean values per country across all years.
# Kept under its own name: the following cell assigns `df_mean` to the
# per-continent means, which previously overwrote this result.
df_mean_country = df.groupby('country')[['lifeExp', 'pop', 'gdpPercap']].mean()

In [31]:
# Per-continent means, tagged with the name of the statistic.
# .assign() builds the tagged frame in one expression instead of mutating it afterwards.
df_mean = (
    df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']]
    .mean()
    .assign(Variable='Mean')
)
# Display the result, like the median / min / max cells do.
df_mean

Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,48.86533,9916003.0,2193.754578,Mean
Americas,64.658737,24504790.0,7136.110356,Mean
Asia,60.064903,77038720.0,7902.150428,Mean
Europe,71.903686,17169760.0,14469.475533,Mean
Oceania,74.326208,8874672.0,18621.609223,Mean


In [33]:
# Per-continent medians, tagged with the name of the statistic
df_median = (
    df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']]
    .median()
    .assign(Variable='Median')
)
df_median

Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,47.792,4579311.0,1192.138217,Median
Americas,67.048,6227510.0,5465.509853,Median
Asia,61.7915,14530830.5,2646.786844,Median
Europe,72.241,8551125.0,12081.749115,Median
Oceania,73.665,6403491.5,17983.303955,Median


In [34]:
# Per-continent minima, tagged with the name of the statistic
df_min = (
    df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']]
    .min()
    .assign(Variable='Minimum')
)
df_min

Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,23.599,60011,241.165876,Minimum
Americas,37.579,662850,1201.637154,Minimum
Asia,28.801,120447,331.0,Minimum
Europe,43.585,147962,973.533195,Minimum
Oceania,69.12,1994794,10039.59564,Minimum


In [35]:
# Per-continent maxima, tagged with the name of the statistic
df_max = (
    df.groupby('continent')[['lifeExp', 'pop', 'gdpPercap']]
    .max()
    .assign(Variable='Maximum')
)
df_max

Unnamed: 0_level_0,lifeExp,pop,gdpPercap,Variable
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,76.442,135031164,21951.21176,Maximum
Americas,80.653,301139947,42951.65309,Maximum
Asia,82.603,1318683096,113523.1329,Maximum
Europe,81.757,82400996,49357.19017,Maximum
Oceania,81.235,20434176,34435.36744,Maximum


In [37]:
# Stack the four per-continent statistic tables row-wise; reset_index() turns
# the `continent` index into a regular column.
df_summary = pd.concat(
    [df_mean, df_median, df_max, df_min],
).reset_index()
df_summary

Unnamed: 0,continent,lifeExp,pop,gdpPercap,Variable
0,Africa,48.86533,9916003.0,2193.754578,Mean
1,Americas,64.658737,24504790.0,7136.110356,Mean
2,Asia,60.064903,77038720.0,7902.150428,Mean
3,Europe,71.903686,17169760.0,14469.475533,Mean
4,Oceania,74.326208,8874672.0,18621.609223,Mean
5,Africa,47.792,4579311.0,1192.138217,Median
6,Americas,67.048,6227510.0,5465.509853,Median
7,Asia,61.7915,14530830.0,2646.786844,Median
8,Europe,72.241,8551125.0,12081.749115,Median
9,Oceania,73.665,6403492.0,17983.303955,Median
