### Data Manipulation with Pandas 


Pandas is a newer package built on top of NumPy that provides an efficient implementation of a DataFrame. DataFrames are essentially multidimensional arrays with attached row and column labels, often with heterogeneous types and/or missing data. As well as offering a convenient storage interface for labeled data, Pandas implements a number of powerful data operations familiar to users of both database frameworks and spreadsheet programs.

In [1]:
import pandas as pd

#### Pandas Series Object

In [17]:
# Build a Series from a plain Python list. One float literal (8.0) is enough
# for pandas to store every element as float64, as the output below shows.
data = pd.Series(data=[2, 4, 6, 8.0])
data

0    2.0
1    4.0
2    6.0
3    8.0
dtype: float64

In [20]:
# .values exposes the underlying data as a NumPy array, without the index.
data.values

array([2., 4., 6., 8.])

In [21]:
# No index was given at construction, so pandas supplied a default
# RangeIndex counting up from 0.
data.index

RangeIndex(start=0, stop=4, step=1)

In [24]:
# Square-bracket access looks a value up by its index label (here the label 0).
data[0]

2.0

In [25]:
# Slicing returns a new Series; the stop bound is excluded, so this yields
# the first two elements together with their index labels.
data[0:2]

0    2.0
1    4.0
dtype: float64

In [27]:
# An explicit index does not have to be consecutive integers: labels may be
# strings, non-sequential numbers, or (as here) a mix of both.
# Note: this rebinds `data`; the float Series created earlier is no longer
# reachable under this name.
data = pd.Series(
    data=[2, 4, 6, 8],
    index=['a', 2, 'c', 4],
)
data

a    2
2    4
c    6
4    8
dtype: int64

In [34]:
# Mapping of state name -> population count, one entry per line.
population_dic = {
    'California': 39_538_223,
    'Texas': 29_145_505,
    'Florida': 21_538_187,
    'New York': 20_201_249,
    'Pennsylvania': 13_002_700,
}
population_dic

{'California': 39538223,
 'Texas': 29145505,
 'Florida': 21538187,
 'New York': 20201249,
 'Pennsylvania': 13002700}

In [32]:
# Build a Series directly from the dict: the keys become the index labels
# and the values become the data.
population = pd.Series(population_dic)

In [33]:
# Display the Series: state names form the index, populations are int64 values.
population

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [36]:
pd.Series(5, index=[100,200,300])  # A scalar is repeated once for every index label

100    5
200    5
300    5
dtype: int64

#### Pandas DataFrame Object

In [43]:
# Mapping of state name -> area, keyed by the same five states as the
# population data so the two Series share an index.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'Florida': 170_312,
    'New York': 141_297,
    'Pennsylvania': 119_280,
}
area = pd.Series(data=area_dict)
area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64

In [44]:
# Combine the two Series into one DataFrame: each dict key becomes a column
# label and the shared state-name index becomes the row labels.
states = pd.DataFrame({'POPULATION': population, 'AREA': area})

In [45]:
# Display the DataFrame: one row per state, one column per Series.
states

Unnamed: 0,POPULATION,AREA
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [46]:
# Row labels of the DataFrame, held in an Index object.
states.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [49]:
# Column labels are stored in an Index object as well.
states.columns

Index(['POPULATION', 'AREA'], dtype='object')

In [52]:
# Dictionary-style access by column label returns that column as a Series
# whose name is the column label.
states['POPULATION']

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
Name: POPULATION, dtype: int64

In [53]:
# A single Series can be wrapped as a one-column DataFrame; `columns`
# supplies the label for that column.
# NOTE(review): the label's casing ('POPUlation_2') differs from the
# 'POPULATION' column used earlier -- confirm it is intentional.
pd.DataFrame(population, columns=['POPUlation_2'])

Unnamed: 0,POPUlation_2
California,39538223
Texas,29145505
Florida,21538187
New York,20201249
Pennsylvania,13002700


In [54]:
# A list of dicts becomes a DataFrame with one row per dict; the dict keys
# ('a', 'b') become the column labels and rows get a default integer index.
data_2 = [
    {'a': n, 'b': 2 * n}
    for n in range(5)
]
pd.DataFrame(data_2)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4
3,3,6
4,4,8


In [55]:
# Plain dict mapping column label -> Series; used below as DataFrame input.
dict_2 = {'Population': population, 'Area': area}

In [56]:
# Still an ordinary dict at this point: its repr prints each Series in full.
dict_2

{'Population': California      39538223
 Texas           29145505
 Florida         21538187
 New York        20201249
 Pennsylvania    13002700
 dtype: int64,
 'Area': California      423967
 Texas           695662
 Florida         170312
 New York        141297
 Pennsylvania    119280
 dtype: int64}

In [57]:
# A dict of Series converts to a DataFrame: the keys become column labels
# and the Series' shared index becomes the row labels.
pd.DataFrame(dict_2)

Unnamed: 0,Population,Area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280
