# pandas

#### Primary pandas objects:

- Series
- DataFrame

## Introduction to pd.Series

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
s = pd.Series([1, 2, 3, 4])

In [None]:
print(s)

In [None]:
len(dir(s))

In [None]:
list(s) # shows that the first column printed above is just the index, not our data

In [None]:
s[1]

In [None]:
s[[0,3]]

In [None]:
s = pd.Series([1,2,3,4], index=['a', 'b', 'c', 'd'])

In [None]:
s

In [None]:
s[['a','b']]

In [None]:
# `s` is labelled 'a'..'d' here, so integer keys are positions, not labels.
# Use .iloc to select by position explicitly; plain s[[0, 1]] relies on a
# deprecated fallback of Series.__getitem__.
s.iloc[[0, 1]]

In [None]:
s.index

In [None]:
s * 2 

In [None]:
# Draw the Series as a bar chart, one bar per index label.
ax = s.plot(kind='bar')
# Keep the tick labels upright instead of the default rotated text.
ax.set_xticklabels(s.index, rotation=0)
plt.show()

In [None]:
len(dir(s))

## Introduction to pd.DataFrame

In [11]:
# Build a DataFrame from a dict whose values are different container types.
d = dict(
    A=pd.Series([10, 20, 30, 40]),           # pandas Series
    B=[0.1, 0.2, 0.3, 0.4],                  # plain list of floats
    C=np.array([3, 3, 3, 3]),                # NumPy array
    D=['jahody', 'python', 'lucie', 'fun'],  # plain list of strings
)

df = pd.DataFrame(data=d)

In [12]:
df

Unnamed: 0,A,B,C,D
0,10,0.1,3,jahody
1,20,0.2,3,python
2,30,0.3,3,lucie
3,40,0.4,3,fun


In [13]:
df.describe()

Unnamed: 0,A,B,C
count,4.0,4.0,4.0
mean,25.0,0.25,3.0
std,12.909944,0.129099,0.0
min,10.0,0.1,3.0
25%,17.5,0.175,3.0
50%,25.0,0.25,3.0
75%,32.5,0.325,3.0
max,40.0,0.4,3.0


In [14]:
# Show the Python type of each value stored in the source dict.
for column_name in ('A', 'B', 'C', 'D'):
    print(type(d[column_name]))

<class 'pandas.core.series.Series'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>


In [15]:
df.dtypes

A      int64
B    float64
C      int64
D     object
dtype: object

In [16]:
df.head(2)

Unnamed: 0,A,B,C,D
0,10,0.1,3,jahody
1,20,0.2,3,python


In [17]:
df.tail(2)

Unnamed: 0,A,B,C,D
2,30,0.3,3,lucie
3,40,0.4,3,fun


In [18]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [19]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [20]:
df.values

array([[10, 0.1, 3, 'jahody'],
       [20, 0.2, 3, 'python'],
       [30, 0.3, 3, 'lucie'],
       [40, 0.4, 3, 'fun']], dtype=object)

In [22]:
df.T

Unnamed: 0,0,1,2,3
A,10,20,30,40
B,0.1,0.2,0.3,0.4
C,3,3,3,3
D,jahody,python,lucie,fun


In [None]:
df.sort_index(axis=0, ascending=True)

In [None]:
# Display a copy sorted by column D. Sorting in place would reorder `df`
# itself and change what every later cell shows.
df.sort_values(by='D')

In [None]:
df

In [None]:
len(dir(df))

### Selection by label

In [23]:
df

Unnamed: 0,A,B,C,D
0,10,0.1,3,jahody
1,20,0.2,3,python
2,30,0.3,3,lucie
3,40,0.4,3,fun


In [24]:
df['B']

0    0.1
1    0.2
2    0.3
3    0.4
Name: B, dtype: float64

In [25]:
df.A

0    10
1    20
2    30
3    40
Name: A, dtype: int64

In [26]:
type(df['A'])

pandas.core.series.Series

In [27]:
df[0:2] # slice the rows

Unnamed: 0,A,B,C,D
0,10,0.1,3,jahody
1,20,0.2,3,python


In [28]:
# Column A is a Series with string row labels; the DataFrame built from this
# dict uses those labels as its index.
d = {
    'A': pd.Series(
        [10, 20, 30, 40],
        index=['sample1', 'sample2', 'sample3', 'sample4'],
    ),
    'B': [0.1, 0.2, 0.3, 0.4],
    'C': np.array([3, 3, 3, 3]),
    'D': ['jahody', 'python', 'lucie', 'fun'],
}

df = pd.DataFrame(data=d)

In [29]:
df

Unnamed: 0,A,B,C,D
sample1,10,0.1,3,jahody
sample2,20,0.2,3,python
sample3,30,0.3,3,lucie
sample4,40,0.4,3,fun


In [30]:
df.loc['sample2']

A        20
B       0.2
C         3
D    python
Name: sample2, dtype: object

In [None]:
type(df.loc['sample2'])

In [None]:
df.loc['sample2'][['B', 'C']]

In [None]:
df.loc[:,['A','B']]

### Selection by position

In [None]:
df

In [None]:
df.iloc[0:2, 0:2]

### Boolean Indexing

In [31]:
df.A

sample1    10
sample2    20
sample3    30
sample4    40
Name: A, dtype: int64

In [32]:
df.A > 20

sample1    False
sample2    False
sample3     True
sample4     True
Name: A, dtype: bool

In [33]:
df[df.A > 20]

Unnamed: 0,A,B,C,D
sample3,30,0.3,3,lucie
sample4,40,0.4,3,fun


In [None]:
df[[False, False, True, True]]

### Setting new column

In [None]:
s1 = pd.Series([100,200,300,400], index=['sample1', 'sample2', 'sample3', 'sample4'])

In [None]:
df['E'] = s1

In [None]:
df

### Setting new row

In [None]:
df

In [None]:
# Add the new row under an explicit label that matches the existing
# 'sampleN' index. Using len(df) as the label would create an integer label
# and append one more row every time this cell is re-run.
df.loc['sample5'] = [40, 0.5, 3, 'banana', 400]

In [None]:
df

In [None]:
df.index = ['sample1', 'sample2', 'sample3', 'sample4', 'sample5']

In [None]:
df

### Missing data

In [None]:
s2 = pd.Series([1000,np.nan,3000,np.nan], index=['sample1', 'sample2', 'sample3', 'sample4'])

In [None]:
df['F'] = s2

In [None]:
df

In [None]:
df.dropna(how='any')

In [None]:
# Fill missing values with the number 5000, not the string '5000', so the
# numeric columns stay numeric. Returns a new frame; `df` is unchanged.
df.fillna(value=5000)

In [None]:
# Boolean frame of the same shape as `df`: True where a value is missing.
pd.isnull(df)

### Operations

In [None]:
# Column D holds strings, so average only the numeric columns. Without
# numeric_only=True, pandas >= 2.0 raises a TypeError on the text column.
df.mean(numeric_only=True)

#### Apply

In [None]:
def double_plus_two(x):
    """Return twice ``x`` plus two."""
    doubled = x * 2
    return doubled + 2

In [None]:
df

In [None]:
df.A.apply(double_plus_two)

In [None]:
# Keep only the values of column A greater than 20, then transform them.
is_above_twenty = df['A'] > 20
df['A'][is_above_twenty].apply(double_plus_two)

In [None]:
df.A.value_counts()

In [None]:
df

### Reading data from different formats

In [34]:
# Load the Titanic passenger table from an Excel workbook; the path is
# relative, so the file is looked up in the current working directory.
titanic = pd.read_excel('titanic.xlsx')

In [35]:
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Read the CSV file instead; this rebinds `titanic`, replacing the frame
# loaded from the Excel workbook.
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.head(5)

Further reading: [10 minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html)