# Pandas intro

* https://pandas.pydata.org/

In [3]:
# import pandas under its usual alias and show which version produced the outputs below
import pandas as pd
pd.__version__

'0.24.2'

In [5]:
# import numpy under its usual alias and show the installed version
import numpy as np
np.__version__

'1.16.4'

## Series

In [10]:
# a Series is built from a list of 1-d data
s1 = pd.Series(data=[10, 20, 30])

In [11]:
# a Series is its own pandas type, not a plain Python list
type(s1)

pandas.core.series.Series

In [13]:
# each entry has an index; if none is specified, entries are numbered consecutively starting at 0
s1

0    10
1    20
2    30
dtype: int64

In [14]:
# you can also specify the index, e.g. as pairs of name/value in a dict
s2 = pd.Series(dict(a=10, b=20, c=30))
s2

a    10
b    20
c    30
dtype: int64

In [15]:
# or pass the index labels separately through the `index` parameter
s3 = pd.Series(data=[10, 20, 30], index=list('abc'))
s3

a    10
b    20
c    30
dtype: int64

In [16]:
# []-operator either gets by position
# NOTE(review): with a non-integer index, an integer key falls back to position here;
# newer pandas releases deprecate this fallback, so prefer s3.iloc[0] -- confirm for your version
s3[0]

10

In [17]:
# or by index label
s3['a']

10

In [18]:
# you can also make that explicit: .loc selects by index label
s3.loc['a']

10

In [20]:
# .iloc selects by integer position, regardless of the index labels
s3.iloc[0]

10

In [21]:
# more than one index label is possible: pass them as a list
s3[['a', 'c']]

a    10
c    30
dtype: int64

In [27]:
# ranges are separated with a colon; for positional ranges the start is inclusive, the end exclusive
# (label ranges such as s3['a':'b'] include both ends)
s3[0:2]

a    10
b    20
dtype: int64

In [28]:
# a start of 0 is implicit and can be left out
s3[:2]

a    10
b    20
dtype: int64

In [29]:
# the end is optional as well; leaving it out gets you all values from start onwards
s3[1:]

b    20
c    30
dtype: int64

### Advanced Series

In [32]:
# you can pass a function to make the selection as complex as you want:
# it is called with the whole Series and returns a boolean mask
s3[lambda series: series >= 20]

b    20
c    30
dtype: int64

In [34]:
# if you want to understand how this is possible:
# Python allows for operator overloading, which has been done for Series and DataFrames;
# in this case the []-operator is overloaded
# http://stackoverflow.com/questions/1957780/how-to-override-operator

# here is a very simple example of how to do this
class MyClass:
    """Minimal example of overloading the []-operator."""

    def __getitem__(self, key):
        """Called for ``obj[key]``; returns the key multiplied by two."""
        doubled = key * 2
        return doubled


myobj = MyClass()
myobj[3]

6

## DataFrames

In [40]:
# a DataFrame consists of Series; typically, but not necessarily, they share the same index

df1 = pd.DataFrame({
    'one': pd.Series([10, 20, 30], index=list('abc')),
    'two': pd.Series([100, 200, 300], index=list('abc')),
})

In [41]:
# the combined object is a pandas DataFrame
type(df1)

pandas.core.frame.DataFrame

In [42]:
# head() previews the first rows of the frame
df1.head()

Unnamed: 0,one,two
a,10,100
b,20,200
c,30,300


In [43]:
# describe() gives summary statistics (count, mean, std, min, quartiles, max) per column
df1.describe()

Unnamed: 0,one,two
count,3.0,3.0
mean,20.0,200.0
std,10.0,100.0
min,10.0,100.0
25%,15.0,150.0
50%,20.0,200.0
75%,25.0,250.0
max,30.0,300.0


In [44]:
# the columns of a DataFrame are Series and can be accessed by their column labels

s4 = df1['one']

In [45]:
# a single column taken out of a DataFrame is a Series again
type(s4)

pandas.core.series.Series

In [46]:
# the column keeps the row labels of the frame as its index
s4['a']

10

In [47]:
# both lookups chained: first the column, then the row label
# (fine for reading; for assignments use df1.loc['a', 'one'] instead of chained indexing)
df1['one']['a']

10

### Data Cleaning

In [None]:
# read the iris_dirty.csv data set from the workshop repository
iris_url = 'https://raw.githubusercontent.com/DJCordhose/ml-workshop/master/data/iris_dirty.csv'
iris_columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']

# header=None: column names are not read from the file but taken from `names`
df = pd.read_csv(iris_url, header=None, encoding='iso-8859-15', names=iris_columns)