Pandas Introduction

 Numpy vs Pandas
___________________________________
NumPy:
The core data structure is the NumPy array.
NumPy arrays are an optimised alternative to Python lists.
NumPy arrays are comparable to vectors in R and MATLAB.

Pandas:
The core data structure is the DataFrame.
DataFrames can be seen as an optimised alternative to Python dictionaries.
DataFrames are comparable to spreadsheets and pivot tables in Excel.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create a pandas Series: a one-dimensional array of values with an attached index.
data = pd.Series([0.25, 0.5, 0.75, 1.0])

In [3]:
# Display the Series: index labels on the left, values on the right.
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
# .values exposes the underlying data as a NumPy array.
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [5]:
# Very important: unlike a plain NumPy array, a Series carries an explicit index.
# No index was given at construction, so it is a default integer range.
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Positional slice: rows at positions 1 and 2 (the stop position is excluded).
data.iloc[1:3]

1    0.50
2    0.75
dtype: float64

In [7]:
# Same values, but with explicit string labels instead of the default integer index.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))

In [8]:
# Dictionary-style lookup by index label.
data['b']

0.5

In [9]:
# The index can also hold integers, in any order and without being contiguous.
data2 = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])

In [10]:
# With an integer index, [] looks up the label 7, not position 7
# (the Series only has four elements). data2.loc[7] is the explicit spelling.
data2[7]

1.0

In [11]:
# Series as specialised (or optimised) dictionaries
# A dictionary is a structure that maps arbitrary keys to a set of arbitrary values.
# A Series is a structure that maps 'typed' keys to a set of 'typed' values.
# This typing is important: it lets pandas store and process a Series far more efficiently than a plain dictionary.

In [13]:
# A NumPy array holds a single data type.
# A Python dictionary has no such restriction: keys and values of different types can be mixed
# (this flexibility is the reason dictionaries are slower for bulk numeric work).
d_ = {'a': 5, 23: 'rob'}
d_

{'a': 5, 23: 'rob'}

In [14]:
# Source data for a Series: a plain dictionary mapping state code -> population.
pop_dict = dict(CAL=39, TX=27, NY=20, FL=20)

In [16]:
# Build a Series from the dictionary: keys become the index, values become the data.
# Recent pandas versions keep the dictionary's insertion order, so sort the index
# explicitly to get the labels in alphabetical order (numeric labels would be sorted ascending).
pop_series = pd.Series(pop_dict).sort_index()
pop_series

CAL    39
FL     20
NY     20
TX     27
dtype: int64

In [17]:
# A Series supports slicing by label (a dictionary does not);
# unlike positional slices, a label slice includes its end label.
pop_series.loc['CAL':'NY']

CAL    39
FL     20
NY     20
dtype: int64

In [26]:
# Ways of constructing Series objects:
# 1) from a dictionary (index sorted explicitly, since recent pandas keeps insertion order)
pop_series = pd.Series(pop_dict).sort_index()
# 2) from an existing Series plus an explicit index: the values are re-aligned to the
#    requested labels, and labels missing from the source ('e' here) become NaN
index = ['c', 'd', 'e']
pd.Series(data, index=index)

c    0.75
d    1.00
e     NaN
dtype: float64

In [18]:
# Small note: a scalar value is repeated for every label of the given index.
pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

---

Pandas Dataframes

In [19]:
# State code -> area, keyed by the same state codes as pop_dict.
area_dict = dict(CAL=43, TX=70, NY=14, FL=17)

In [20]:
# Build the area Series from its dictionary; sort the index explicitly because recent
# pandas versions keep the dictionary's insertion order rather than sorting the keys.
area_series = pd.Series(area_dict).sort_index()
area_series

CAL    43
FL     17
NY     14
TX     70
dtype: int64

In [21]:
# Population Series built earlier; it carries the same state labels as area_series.
pop_series

CAL    39
FL     20
NY     20
TX     27
dtype: int64

In [22]:
# Combine the two Series into one DataFrame: each dict key becomes a column name and
# the rows are aligned on the shared state labels.
# Keys are listed alphabetically and the rows sorted so that the column and row order
# is the same on every pandas version (recent versions keep dict insertion order).
state = pd.DataFrame({'area': area_series, 'population': pop_series}).sort_index()

In [23]:
state
# In IPython/Jupyter a DataFrame is rendered as a formatted table:

Unnamed: 0,area,population
CAL,43,39
FL,17,20
NY,14,20
TX,70,27


In [24]:
# Row labels of the DataFrame.
state.index

Index(['CAL', 'FL', 'NY', 'TX'], dtype='object')

In [25]:
# Column labels of the DataFrame (also stored as an Index object).
state.columns

Index(['area', 'population'], dtype='object')

In [26]:
# NOTE:
# a Series maps index labels to values
# a DataFrame maps column names to Series

In [27]:
# Example of constructing a DataFrame: start from a list of records (one dict per row).
# Note: this rebinds `data`, which earlier cells used for a Series.
data = [dict(a=i, b=2 * i) for i in range(3)]

In [28]:
# The list of records: every dictionary has the same keys, 'a' and 'b'.
data

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]

In [29]:
# First record of the list.
dict1=data[0]
dict1

{'a': 0, 'b': 0}

In [30]:
# Second record of the list.
dict2=data[1]
dict2

{'a': 1, 'b': 2}

In [31]:
# Third record of the list.
dict3=data[2]
dict3

{'a': 2, 'b': 4}

In [32]:
# Each dictionary becomes a row and each key becomes a column.
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [33]:
# The idea: when pd.DataFrame() is given a list of dictionaries
# that all share the same keys,
# it creates a DataFrame whose columns are the keys of the dictionaries
# and whose rows are the individual dictionaries.

---

In [34]:
# DataFrame selection: start from the state DataFrame built above.
state

Unnamed: 0,area,population
CAL,43,39
FL,17,20
NY,14,20
TX,70,27


In [35]:
# Add a density column: population divided by area, computed row by row
# (the two columns are aligned on the index).
state['density']=state['population']/state['area']

In [36]:
# The new column is appended after the existing ones.
state

Unnamed: 0,area,population,density
CAL,43,39,0.906977
FL,17,20,1.176471
NY,14,20,1.428571
TX,70,27,0.385714


In [37]:
state.values
# Returns the contents as a 2-D NumPy array (a matrix) without row/column labels;
# the integer columns come back as floats because the array holds a single dtype.

array([[ 43.        ,  39.        ,   0.90697674],
       [ 17.        ,  20.        ,   1.17647059],
       [ 14.        ,  20.        ,   1.42857143],
       [ 70.        ,  27.        ,   0.38571429]])

In [38]:
# Confirm that .values is a plain NumPy ndarray.
type(state.values)

numpy.ndarray

In [39]:
state.T
# Transpose: rows become columns and columns become rows.

Unnamed: 0,CAL,FL,NY,TX
area,43.0,17.0,14.0,70.0
population,39.0,20.0,20.0,27.0
density,0.906977,1.176471,1.428571,0.385714


In [40]:
# Slice rows by label (end label included); .loc makes it explicit that rows, not columns, are selected.
state.loc['CAL':'NY']

Unnamed: 0,area,population,density
CAL,43,39,0.906977
FL,17,20,1.176471
NY,14,20,1.428571


In [41]:
# Filter rows with a boolean mask: keep only the states whose density exceeds 0.9.
state[state['density'] > 0.9]

Unnamed: 0,area,population,density
CAL,43,39,0.906977
FL,17,20,1.176471
NY,14,20,1.428571
