# 03 - data processing and visualisation using pandas

This lecture will focus on table-like data manipulation.

In [1]:
import numpy as np
import pandas as pd

### Create dictionary representing simple table

In [29]:
data = {
    "students":["Adam", "Monica", "John"], # this is first column
    "born":[1994, 1989, 2011],
    "academic degree":[None,"Bc.","MSc."],
    "active":[True,False,False]
}
data

{'students': ['Adam', 'Monica', 'John'],
 'born': [1994, 1989, 2011],
 'academic degree': [None, 'Bc.', 'MSc.'],
 'active': [True, False, False]}

### Transform this data to `pandas.DataFrame`

In [30]:
df = pd.DataFrame(data)
df.info()# prints information about dataframe
df # nice table representation in IPykernel

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
students           3 non-null object
born               3 non-null int64
academic degree    2 non-null object
active             3 non-null bool
dtypes: bool(1), int64(1), object(2)
memory usage: 155.0+ bytes


Unnamed: 0,students,born,academic degree,active
0,Adam,1994,,True
1,Monica,1989,Bc.,False
2,John,2011,MSc.,False


### Adding a new column

- by `list`, `numpy.array`, ...
- we can of course rewrite it again
- length has to match

In [31]:
df["children"] = [2,1,0] # use list
df

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,2
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,0


In [32]:
df["children"] = np.array([1,1,3]) # use array
df

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,1
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,3


In [33]:
df["children"] = [1,2] # lentgth has to match

ValueError: Length of values does not match length of index

### Accesing rows, columns and cells

- by names
- by indexes
- by masks (boolean expresions)

In [34]:
col_students = df["students"] # one column labeled 'students'
col_students

0      Adam
1    Monica
2      John
Name: students, dtype: object

In [35]:
df.students # also possibility

0      Adam
1    Monica
2      John
Name: students, dtype: object

In [36]:
df.loc[0] # row with index == `0`

students           Adam
born               1994
academic degree    None
active             True
children              1
Name: 0, dtype: object

In [37]:
df.iloc[0] # first row of dataframe

students           Adam
born               1994
academic degree    None
active             True
children              1
Name: 0, dtype: object

In [38]:
df.iloc[0,0] # integer location

'Adam'

In [39]:
filter_by = ["students", "born"]
df[filter_by] # I only want df with certain columns

Unnamed: 0,students,born
0,Adam,1994
1,Monica,1989
2,John,2011


In [40]:
df[df["born"] > 1990] # I want only students born after 1990

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,1
2,John,2011,MSc.,False,3


### Appending new data to our dataframe

In [41]:
# Create new data, which we want to add to DataFrame
new_data =  {
    "students":["Clara", "Johny", "Michael"],
    "born":[1984, 1989, 1920],
    "academic degree":["PhD.","Bc.","MSc."],
    "active":[True,False,False],
    "children":[2,0,4]
}

In [42]:
# Convert new data to dataframe and append it to the end
# of original dataframe, sort=False
df = df.append(pd.DataFrame(new_data), sort = False)
df

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,1
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,3
0,Clara,1984,PhD.,True,2
1,Johny,1989,Bc.,False,0
2,Michael,1920,MSc.,False,4


In [43]:
#Reset index values. Inplace rewrites df in place...
#...without creating a copy as a new object
#drop = false would insert a column "index"
df.reset_index(inplace = True, drop = True)
df

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,1
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,3
3,Clara,1984,PhD.,True,2
4,Johny,1989,Bc.,False,0
5,Michael,1920,MSc.,False,4


### Statistical measures - pandas is your friend

In [44]:
# Returns basic statistics of numerical data in DataFrame
df.describe()

Unnamed: 0,born,children
count,6.0,6.0
mean,1981.166667,1.833333
std,31.390551,1.47196
min,1920.0,0.0
25%,1985.25,1.0
50%,1989.0,1.5
75%,1992.75,2.75
max,2011.0,4.0


In [45]:
# Mean value for numerical data
df.mean()

born        1981.166667
active         0.333333
children       1.833333
dtype: float64

In [46]:
# Standard deviation of numerical data
df.std()

born        31.390551
active       0.516398
children     1.471960
dtype: float64

In [48]:
# Maximum, minimum, median
df.max(), df.min(), df.median()

(students    Monica
 born          2011
 active        True
 children         4
 dtype: object,
 students     Adam
 born         1920
 active      False
 children        0
 dtype: object,
 born        1989.0
 active         0.0
 children       1.5
 dtype: float64)

In [49]:
# I can apply it to `pd.Series` as well
df["born"].mean()

1981.1666666666667

In [50]:
# And again, I can use it for filtering my table
# for example I want all students born later than
# average year of birth
mask = df["born"] > df["born"].mean()
mask

0     True
1     True
2     True
3     True
4     True
5    False
Name: born, dtype: bool

In [52]:
df[mask] # use the mask to filter

Unnamed: 0,students,born,academic degree,active,children
0,Adam,1994,,True,1
1,Monica,1989,Bc.,False,1
2,John,2011,MSc.,False,3
3,Clara,1984,PhD.,True,2
4,Johny,1989,Bc.,False,0


### Sorting data

- based on some column
- ascending/descending order
- sorting by two or more columns

In [53]:
df.sort_values(["children"]) #sorting data by number of children

Unnamed: 0,students,born,academic degree,active,children
4,Johny,1989,Bc.,False,0
0,Adam,1994,,True,1
1,Monica,1989,Bc.,False,1
3,Clara,1984,PhD.,True,2
2,John,2011,MSc.,False,3
5,Michael,1920,MSc.,False,4


In [55]:
df.sort_values(["children"], ascending = False) #descending

Unnamed: 0,students,born,academic degree,active,children
5,Michael,1920,MSc.,False,4
2,John,2011,MSc.,False,3
3,Clara,1984,PhD.,True,2
0,Adam,1994,,True,1
1,Monica,1989,Bc.,False,1
4,Johny,1989,Bc.,False,0


In [56]:
#sorting by 2 categories
df.sort_values(["children","born"], ascending = [True, False])

Unnamed: 0,students,born,academic degree,active,children
4,Johny,1989,Bc.,False,0
0,Adam,1994,,True,1
1,Monica,1989,Bc.,False,1
3,Clara,1984,PhD.,True,2
2,John,2011,MSc.,False,3
5,Michael,1920,MSc.,False,4
