# Pandas tutorial based on the book "Python for Data Analysis"

### The main data structures used in pandas are:
-  Series
-  DataFrame

In [1]:
# First we need to import the pandas library
import pandas as pd

### Series
A Series is a one-dimensional array-like object containing a sequence of values and an associated array of labels, called its index.

In [2]:
# Build a Series from a plain list; no index is passed, so pandas supplies
# the default integer one.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

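The index is shown on the left and the values on the right. Since no index was specified, a default one consisting of the integers 0 through N - 1 is created automatically.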

In [3]:
# .values returns only the data (without the index) as a NumPy array
obj.values

array([ 4,  7, -5,  3])

In [4]:
# .index returns the index object; the auto-created default is a RangeIndex
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Give every value its own label by passing an explicit index
obj2 = pd.Series(
    data=[4, 7, -3, 3],
    index=['d', 'b', 'a', 'c'],
)
obj2

d    4
b    7
a   -3
c    3
dtype: int64

In [6]:
# Import the two main classes directly so they can be used without the
# `pd.` prefix. Explicit names are used instead of `from pandas import *`:
# the wildcard floods the namespace and shadows builtins (pandas exports
# its own `eval`).
from pandas import DataFrame, Series

In [7]:
# Series was imported directly, so the `pd.` prefix is not needed here
obj3 = Series([1,2,3,4,5])

In [8]:
# Display obj3: it gets the default integer index as well
obj3

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [9]:
# An explicitly supplied index is stored as an Index of labels
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [10]:
# Unlike a plain NumPy array, a value can be selected by its index label
obj2['a']

-3

In [11]:
# Filtering with a boolean condition keeps each value paired with its index label
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [12]:
# Arithmetic is applied element-wise and the index labels are kept
obj2 ** 2

d    16
b    49
a     9
c     9
dtype: int64

In [13]:
# The same holds for a Series with the default integer index
obj ** 3

0     64
1    343
2   -125
3     27
dtype: int64

In [14]:
# A Series can be seen as a fixed-length, ordered dict: `in` tests the index labels
'b' in obj2

True

In [15]:
# A dict can be passed straight to Series; its keys become the index labels
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj4 = Series(sdata)


In [16]:
# The dict keys have become the index labels
obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [17]:
# Index labels to look up in sdata. Note that 'California' is not a key of
# sdata, and that sdata's 'Utah' key is not listed here.
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [19]:
# Passing an index together with a dict looks each label up in the dict:
# a label with no matching key ('California') gets NaN, and a key that is
# not in the index ('Utah') is left out of the result.
obj5 = Series(sdata, index=states)
obj5

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [20]:
# Missing data can be detected with pd.isnull, which returns a boolean Series
pd.isnull(obj5)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [21]:
# isnull is also available as a Series method; obj4 has no missing values
obj4.isnull()

Ohio      False
Texas     False
Oregon    False
Utah      False
dtype: bool

In [22]:
# Arithmetic aligns the operands on their index labels; a label that is
# missing (or NaN) in either operand gives NaN in the result
obj4 + obj5

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [23]:
# Both the Series itself and its index have a `name` attribute that can be assigned
obj5.name = 'population'


In [24]:
# The name is now shown when the Series is displayed
obj5

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [25]:
# Naming the index adds a header line above the labels when displayed
obj5.index.name = 'US States'
obj5

US States
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

## DataFrame

### To build DataFrames
-  using a dict
-  using NumPy arrays

In [26]:
# Column-oriented input: each key is a column name and each list holds
# that column's values (all lists have the same length).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

In [28]:
# Build a DataFrame from the dict of equal-length lists
frame = DataFrame(data)

In [29]:
# Each dict key became a column; the rows get a default integer index
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [30]:
# Passing `columns` fixes the order in which the columns are arranged
DataFrame(
    data,
    columns=['year', 'state', 'pop'],
)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [31]:
# Request an extra 'debt' column and label the rows with a custom index
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)

In [32]:
# 'debt' is not a key of data, so that column holds only missing values
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [None]:
# A column can be retrieved as a Series using dict-like (bracket) notation