# Software Carpentry: Python Novice Gapminder

## Chapter 1

In [None]:
print(7 * 3)
print(2 + 1)

$\sum_{i=1}^{N} 2^{-i} \approx 1$

## Chapter 2

In [None]:
age = 11
first_name = 'Harry'

In [None]:
print(first_name, 'is', age, 'years old')

In [None]:
atom_name = 'hydrogen'
print(atom_name[1])
print(atom_name[0:3])
print(len(atom_name))

In [None]:
a = 123
str(a)[1]

In [None]:
'abcdefg'[3:-1]

In [None]:
'abcdefaedgaer'[3:100]

## Chapter 3: Data Types and Type conversion

In [None]:
print(type(age))
print(type('average'))

In [None]:
separator = '=' * 10
print(separator)

In [None]:
print(len('abc'))
## print(len(123))
print(str(1) + '2')
print(1 + int('2'))

In [None]:
print(8 // 3, 8 / 3, 8 % 3)

In [None]:
int(float('3.5'))

In [None]:
## int('3.4') ## --> raises ValueError. For consistency, Python will not chain two typecasts (str->float->int) implicitly; you must write both conversions explicitly in the code, e.g. int(float('3.4'))

In [None]:
type(int(1.0)+int(float('1.1')))

## Built-in functions and help

In [None]:
result = print('example')
print('The result of print is', result)

In [None]:
max('0', 'a', 'b')

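Functions that are attached to objects are called methods. A method is called with parentheses after the object it belongs to (e.g. `my_string.swapcase()`), and some methods, such as `__len__`, are special internal functions that back built-ins like `len`.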

In [None]:
my_string = 'Hello world!'
print(len(my_string))
print(my_string.swapcase())
print(my_string.__len__())

In [None]:
print(my_string.upper())
print(my_string)

In [None]:
help(min)

In Jupyter Notebook (Jupyter Lab), we can place the cursor near the function, hold down Shift and press Tab to get a hover window with help.

## Libraries

A library is a collection of modules, but both terms are often used interchangeably.

In [None]:
import math

print('pi is', math.pi)
print('cos(pi) is', math.cos(math.pi))

In [None]:
from math import cos, sin, pi

print('cos(pi) is', cos(pi))

In [None]:
import matplotlib as mpl

In [None]:
## Print each result explicitly: a notebook cell only echoes its last
## expression, so the value of math.pow would otherwise be discarded.
print(math.pow(3, .5))
print(math.log1p(1E-6))

In [None]:
# help(random)

In [None]:
bases = 'GATATTACGA'
import random
random_index = random.randrange(len(bases))
print(bases[random_index])

print(random.sample(bases, 1)[0])

## Chapter 7: reading tabular data into DataFrames

In [None]:
import pandas as pd
data = pd.read_csv('data/gapminder_gdp_oceania.csv')
print(data)

In pandas, we use columns to store observed variables, and rows are observations.

In [None]:
## index_col specifies which column supplies the row names (row headings)
data = pd.read_csv('data/gapminder_gdp_oceania.csv',
                   index_col='country')
print(data)

In [None]:
data.info()

In [None]:
data.columns ## an attribute (member variable), not a method: it is accessed without parentheses
## the R equivalent is colnames(data)

In [None]:
print(data.T) ## t in R

In [None]:
print(data.describe()) ## summary in R

In [None]:
americas = pd.read_csv('data/gapminder_gdp_americas.csv', index_col='country')
## Print both views: a notebook cell only echoes its last expression, so the
## head() result would otherwise never be shown.
print(americas.head(n=3))
print(americas.T.tail(n=2))

In [None]:
americas.T.to_csv('data/americas_T.csv')

## Chapter 8: data frames

* A DataFrame is a collection of Series, much as a data.frame in R is a list of equal-length column vectors.
* Pandas is built upon Numpy, therefore most methods defined for Numpy Arrays also apply to Pandas Series or DataFrames.

In [None]:
## .iloc[...,...]
data = pd.read_csv('data/gapminder_gdp_europe.csv', index_col='country')
data.head()

In [None]:
print(data.iloc[0,0])

In [None]:
print(data.loc['Albania','gdpPercap_1952'])

In [None]:
print(data.loc['Albania', :])

In [None]:
print(data.loc['Albania'])

In [None]:
print(data.loc[:, 'gdpPercap_1952'])

In [None]:
print(data['gdpPercap_1952']) ## not .loc! print(data.loc['gdpPercap_1952'])

In [None]:
data.loc['Albania']  ## df.loc[row_label] selects a row (square brackets, not a call)
data['gdpPercap_1952'] ## df[column_label] selects a column

In [None]:
print(data.loc['Italy':'Poland', 'gdpPercap_1962':'gdpPercap_1972']) ## note that slicing using loc is inclusive at both ends!

In [None]:
print(data.loc['Italy':'Poland', 'gdpPercap_1962':'gdpPercap_1972'].max()) ## .max runs per Series/column

In [None]:
subset = data.loc['Italy':'Poland', 'gdpPercap_1962':'gdpPercap_1972']
print('\nWhere are values large?\n', subset > 10000)

In [None]:
mask = subset > 10000 ## a mask = a frame of booleans
print(type(mask))
print(subset[mask]) ## gives NaNs, which are ignored by operations like max, min, or average (like NA in R)

In [None]:
print(subset[mask].describe())

In [None]:
mask_higher = data > data.mean()
wealth_score = mask_higher.aggregate('sum', axis=1)/len(data.columns)
wealth_score

In [None]:
data.groupby(wealth_score).sum()

In [None]:
print(data.idxmax())
print(data.idxmin())

In [None]:
import numpy as np
print(data['gdpPercap_1982'])
print(data.loc['Denmark'])
print(data.loc[:,'gdpPercap_1982':])
print(data['gdpPercap_1982']/data['gdpPercap_1952'])


In [None]:
data.median()

## Chapter 9

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
## to make the plots of higher resolution: method 1
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
## method 2
%config InlineBackend.figure_format = 'svg'

In [None]:
time = [0, 1, 2, 3]
position = [0, 100, 200, 300]

plt.plot(time, position)
plt.xlabel('Time [h]')
plt.ylabel('Position (m)')
plt.show()

In [None]:
data = pd.read_csv('data/gapminder_gdp_oceania.csv',
                   index_col='country')
## Remove the literal 'gdpPercap_' prefix to leave just the year.
## str.strip is the wrong tool for this: it treats its argument as a *set of
## characters* to trim from both ends, and would only give the right answer
## here because the remaining text happens to be all digits.
years = data.columns.str.replace('gdpPercap_', '', regex=False)
print(years)
print(data)
## replace column names with years as integers
data.columns = years.astype(int)
data.loc['Australia'].plot()

In [None]:
years = data.columns
gdp_australia = data.loc['Australia']

plt.plot(years, gdp_australia, 'r.-')
plt.show()

In [None]:
print(data)

In [None]:
data.T.plot() ## years in rows, countries in columns
plt.ylabel('GDP per capita')
plt.show()

In [None]:
## ggplot style
plt.style.use('ggplot')
data.T.plot(kind='bar')
plt.ylabel('GDP per capita')

In [None]:
## plotting many sets of data
gdp_australia = data.loc['Australia']
gdp_nz = data.loc['New Zealand']

plt.plot(years, gdp_australia, 'b-', label='AUS')
plt.plot(years, gdp_nz, 'g-', label='NZ')

plt.legend(loc='lower right')
plt.xlabel('Year')
plt.ylabel('GDP per capita [$]')

In [None]:
## scatter: plt.scatter or DataFrame.plot.scatter
plt.scatter(gdp_australia, gdp_nz)
plt.xlabel('Australia')
plt.ylabel('New Zealand')

In [None]:
data.T.plot.scatter(x='Australia', y='New Zealand')

In [None]:
data_europe = pd.read_csv('data/gapminder_gdp_europe.csv', index_col='country')
print(data_europe)
data_europe.min().plot(label='min') ## .min() returns a Series
data_europe.max().plot(label='max')
plt.legend(loc='best')
plt.xticks(rotation=90)

In [None]:
data_asia = pd.read_csv('data/gapminder_gdp_asia.csv', index_col='country')
data_asia.max().plot() ## why max().plot()? Because .min()/.max() return a Series, and the Series is what gets plotted
print(data_asia.idxmax())
print(data_asia.idxmin())

In [None]:
plt.style.available

In [None]:
data_all = pd.read_csv('data/gapminder_all.csv', index_col='country')
plt.style.use('tableau-colorblind10')
## bubble chart: marker size scales with the pop_2007 column (divided by 1e6)
data_all.plot(kind='scatter', x='gdpPercap_2007', y='lifeExp_2007',
              s=data_all['pop_2007']/1e6)
plt.xlabel('GDP per capita 2007 [$]', fontsize=16)
plt.ylabel('Life expectancy [years]', fontsize=16)
plt.tick_params(labelsize=12, length=9)
fig = plt.gcf() ## get current figure
plt.savefig('lifeExp_gdp_2007.png', dpi=300)

In [None]:
## Save the bar chart itself. DataFrame.plot returns the Axes it drew on, so
## take the figure from there rather than reusing a figure handle that was
## captured before this plot existed (that would write the older figure).
ax = data.plot(kind='bar')
fig = ax.get_figure()
fig.savefig('myfig.png')

## Chapter 11: lists

In [None]:
pressures = [.273, .275, .277, .275, .276]
print('pressures:', pressures)
print('length of pressures:', len(pressures))

In [None]:
## list is mutable
pressures[0] = 0.265
print('pressures now:', pressures)

In [None]:
## .extend versus .append: .extend extends a list with another list by adding the elements, .append adds the list as an element
teen_primes = [11, 13, 17, 19]
midage_primes = [37, 41, 43, 47]
primes = [2, 3, 5]
primes.extend(teen_primes)
print('primes has now become:', primes)
primes.append(midage_primes)
print('primes has finally become:', primes)

In other words, `extend` keeps the list flat, whereas `append`ing a list to a list nests it, making the result two-dimensional.

In [None]:
## the statement del
primes = [2, 3, 5, 7, 9]
del primes[4]
print(primes)

In [None]:
## list can be of mixed type
this_and_that = [1, 'asf', 3.4]
print(this_and_that)

In [None]:
## character strings can be indexed like lists, but they are immutable
element="Chemistry"
## element[0] = 'C'  ## --> TypeError: strings do not support item assignment
## an IndexError is raised when the index is out of range
## element[99]

In [None]:
list(element)[0]

In [None]:
element = 'potassium'
print(element[::2])
print(element[::-1])

In [None]:
element = 'lithium'
print(element[0:20])
print(element[-1:3]) ## no output! because per default the stride is 1
print(element[-1:3:-1])

In [None]:
## list(str) converts a string to a list of characters
list('love')

### sorted and sort

`sorted` returns a sorted *copy* of the letters, whereas `.sort` sorts the list *in place* and returns `None`.

In [None]:
letters = list('gold')
result = sorted(letters)
print('letters is', letters, 'and result is', result)

In [None]:
# Program B
letters = list('gold')
result = letters.sort()
print('letters is', letters, 'and result is', result)

 ## Copying or not

In [None]:
old = list('gold')
new = old
new[0] = 'D'
print('new is', new, 'and old is', old)

In [None]:
## [:] generates a copy
old = list('gold')
new = old[:]
new[0] = 'D'
print('new is', new, 'and old is', old)

## Loop

In [None]:
for number in [2, 3, 5]:
    print(number)

The built-in function `range` produces a sequence of numbers. However, a range is not a list: the numbers are produced *on demand* to make looping over large ranges more efficient. Like slicing, it covers the half-open interval `[start, end)`: `start` is included and `end` is not.

In [None]:
for number in range(0,5):
    print(number)

## Chapter 13: conditions

In [None]:
def is_higher_than_30k(exp):
    """Return True if *exp* is strictly greater than 30,000, otherwise False.

    Intended for element-wise use (e.g. with ``Series.apply``): *exp* is a
    single number and the result is always a built-in ``bool``.
    """
    ## The comparison already is the answer, so no if/else is needed; bool()
    ## keeps the return type a plain Python bool even for NumPy scalars.
    return bool(exp > 30E3)
data[2007].apply(is_higher_than_30k)

## Chapter 14: looping over datasets

In [None]:
for filename in ['data/gapminder_gdp_africa.csv',
                'data/gapminder_gdp_asia.csv']:
    data = pd.read_csv(filename, index_col='country')
    print(filename, data.min().min())

In [None]:
data.shape[0]

Globbing means 'matching a set of files with a pattern'.

In [None]:
import glob
glob.glob('data/*.csv')

In [None]:
fig, ax = plt.subplots(1,1)

## sorted() makes the plotting (and therefore legend) order deterministic;
## glob.glob itself returns matches in arbitrary order
for file in sorted(glob.glob('data/gapminder_gdp*.csv')):
    dataframe = pd.read_csv(file)
    ## 'data/gapminder_gdp_<region>.csv' -> '<region>': take the text after
    ## the last underscore and drop the 4-character '.csv' extension
    region = file.split('_')[-1][:-4]
    ## the table is read without index_col, so it still contains text columns
    ## (e.g. 'country'); numeric_only=True averages only the numeric ones
    ## instead of raising TypeError on pandas >= 2.0
    dataframe.mean(numeric_only=True).plot(ax=ax, label=region)
plt.legend()
plt.xticks(rotation=45, ha='right')
plt.show()