# Getting Started with pandas

In [2]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np

## Series
It's an array containing values and index

In [3]:
obj = Series([1, 2, 3, 4])
obj

0    1
1    2
2    3
3    4
dtype: int64

In [4]:
obj.values

array([1, 2, 3, 4])

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
obj2 = Series([1, 2, 3, 4], index = ["a", "b", "c", "d"])
obj2

a    1
b    2
c    3
d    4
dtype: int64

In [7]:
obj2["b"]

2

In [11]:
obj2[["a", "d"]]

a    1
d    4
dtype: int64

In [8]:
obj2["c"] = 100
obj2

a      1
b      2
c    100
d      4
dtype: int64

In [12]:
obj2[obj2.values > 50]

c    100
dtype: int64

In [13]:
np.exp(obj2)

a    2.718282e+00
b    7.389056e+00
c    2.688117e+43
d    5.459815e+01
dtype: float64

In [14]:
obj2[obj2 > 50]

c    100
dtype: int64

In [15]:
obj2 * 2

a      2
b      4
c    200
d      8
dtype: int64

In [16]:
pop_data = {"nevada" : 1200, "texas" : 1450, "newyork" : 20000, "ohio" : 9500}
pop_data

{'nevada': 1200, 'newyork': 20000, 'ohio': 9500, 'texas': 1450}

In [17]:
obj3 = pd.Series(pop_data)
obj3

nevada      1200
newyork    20000
ohio        9500
texas       1450
dtype: int64

In [18]:
obj3.index

Index(['nevada', 'newyork', 'ohio', 'texas'], dtype='object')

In [19]:
obj3.values

array([ 1200, 20000,  9500,  1450])

In [20]:
pd.isnull(obj3)

nevada     False
newyork    False
ohio       False
texas      False
dtype: bool

In [21]:
obj3.isnull()

nevada     False
newyork    False
ohio       False
texas      False
dtype: bool

In [22]:
obj3 + obj3

nevada      2400
newyork    40000
ohio       19000
texas       2900
dtype: int64

In [26]:
obj3.name = "population"
obj3.index.name = "state"
obj3

state
nevada      1200
newyork    20000
ohio        9500
texas       1450
Name: population, dtype: int64

In [23]:
obj3.index = ["oregon", "virginia", "ohio"]
obj3

oregon         1200
virginia    2000000
ohio           1450
Name: population, dtype: object

## DataFrame
A pandas DataFrame has both a row and column index.

In [28]:
data = {"state" : ["ohio", "oregon", "texas", "oregon", "texas"], 
       "pop" : [2000, 2002, 2003, 2002, 2003], 
       "year" : [1990, 1991, 1992, 1991, 1992]}
data

{'pop': [2000, 2002, 2003, 2002, 2003],
 'state': ['ohio', 'oregon', 'texas', 'oregon', 'texas'],
 'year': [1990, 1991, 1992, 1991, 1992]}

In [33]:
df = pd.DataFrame(data)
df

Unnamed: 0,pop,state,year
0,2000,ohio,1990
1,2002,oregon,1991
2,2003,texas,1992
3,2002,oregon,1991
4,2003,texas,1992


In [42]:
df.rename(columns = {"pop" : "population"})

Unnamed: 0,population,state,year
0,2000,ohio,1990
1,2002,oregon,1991
2,2003,texas,1992
3,2002,oregon,1991
4,2003,texas,1992


In [44]:
df.columns = ["popn", "states", "years"]
df

Unnamed: 0,popn,states,years
0,2000,ohio,1990
1,2002,oregon,1991
2,2003,texas,1992
3,2002,oregon,1991
4,2003,texas,1992


In [47]:
states = df["states"]
states

0      ohio
1    oregon
2     texas
3    oregon
4     texas
Name: states, dtype: object

In [49]:
3 ** 2

9

In [48]:
states1 = df.states
states1

0      ohio
1    oregon
2     texas
3    oregon
4     texas
Name: states, dtype: object

In [51]:
df2 = pd.DataFrame(data, columns = ["state", "pop", "year", "rainfall"])
df2

Unnamed: 0,state,pop,year,rainfall
0,ohio,2000,1990,
1,oregon,2002,1991,
2,texas,2003,1992,
3,oregon,2002,1991,
4,texas,2003,1992,


In [52]:
df2.rainfall = 20
df2

Unnamed: 0,state,pop,year,rainfall
0,ohio,2000,1990,20
1,oregon,2002,1991,20
2,texas,2003,1992,20
3,oregon,2002,1991,20
4,texas,2003,1992,20


In [53]:
df2["rainfall"] = 15
df2

Unnamed: 0,state,pop,year,rainfall
0,ohio,2000,1990,15
1,oregon,2002,1991,15
2,texas,2003,1992,15
3,oregon,2002,1991,15
4,texas,2003,1992,15


In [60]:
df2.rainfall = np.arange(5)
df2

Unnamed: 0,state,pop,year,rainfall
0,ohio,2000,1990,0
1,oregon,2002,1991,1
2,texas,2003,1992,2
3,oregon,2002,1991,3
4,texas,2003,1992,4


In [61]:
# retrieve rows by position
df2.loc[3]

state       oregon
pop           2002
year          1991
rainfall         3
Name: 3, dtype: object

In [64]:
val = Series([50, 60, 70], index = [0, 2, 4])
val

0    50
2    60
4    70
dtype: int64

In [65]:
df2.rainfall = val
df2

Unnamed: 0,state,pop,year,rainfall
0,ohio,2000,1990,50.0
1,oregon,2002,1991,
2,texas,2003,1992,60.0
3,oregon,2002,1991,
4,texas,2003,1992,70.0


In [69]:
df2["temperature"] = ""
df2

Unnamed: 0,state,pop,year,rainfall,temperature
0,ohio,2000,1990,50.0,
1,oregon,2002,1991,,
2,texas,2003,1992,60.0,
3,oregon,2002,1991,,
4,texas,2003,1992,70.0,


In [70]:
df2.temperature = df2.rainfall
df2

Unnamed: 0,state,pop,year,rainfall,temperature
0,ohio,2000,1990,50.0,50.0
1,oregon,2002,1991,,
2,texas,2003,1992,60.0,60.0
3,oregon,2002,1991,,
4,texas,2003,1992,70.0,70.0


In [71]:
del df2["temperature"]
df2

Unnamed: 0,state,pop,year,rainfall
0,ohio,2000,1990,50.0
1,oregon,2002,1991,
2,texas,2003,1992,60.0
3,oregon,2002,1991,
4,texas,2003,1992,70.0


In [72]:
# transpose
df2.T

Unnamed: 0,0,1,2,3,4
state,ohio,oregon,texas,oregon,texas
pop,2000,2002,2003,2002,2003
year,1990,1991,1992,1991,1992
rainfall,50,60,70,,


In [72]:
df2.index

RangeIndex(start=0, stop=5, step=1)

In [73]:
df2.values

array([['ohio', 2000, 1990, 50.0],
       ['oregon', 2002, 1991, nan],
       ['texas', 2003, 1992, 60.0],
       ['oregon', 2002, 1991, nan],
       ['texas', 2003, 1992, 70.0]], dtype=object)

In [77]:
# from nested dictionaries
# pandas will interpret the outer dict keys as the columns and the inner keys as the row indices
nested_dict = {"nevada" : {"rainfall" : 20, "temp" : 30, "vote" : "republican"},
              "ohio" : {}, 
              "oregon" : {}}
nested_df = pd.DataFrame(nested_dict)
nested_df

Unnamed: 0,nevada,ohio,oregon
rainfall,20,,
temp,30,,
vote,republican,,


In [78]:
nested_df.T

Unnamed: 0,rainfall,temp,vote
nevada,20.0,30.0,republican
ohio,,,
oregon,,,


In [79]:
nested_df.index.name = "states"; nested_df.columns.name = "conditions"
nested_df

conditions,nevada,ohio,oregon
states,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rainfall,20,,
temp,30,,
vote,republican,,


In [82]:
nested_df.columns

Index(['nevada', 'ohio', 'oregon'], dtype='object', name='conditions')

In [83]:
nested_df.values

array([[20, nan, nan],
       [30, nan, nan],
       ['republican', nan, nan]], dtype=object)

## Index Objects

In [84]:
obj = pd.Series(range(4), index = ["a", "b", "c", "d"])
obj

a    0
b    1
c    2
d    3
dtype: int64

In [85]:
index = obj.index
index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [86]:
index[1:]

Index(['b', 'c', 'd'], dtype='object')

In [81]:
np.arange(4)

array([0, 1, 2, 3])

In [82]:
index = pd.Index(np.arange(4))
index

Int64Index([0, 1, 2, 3], dtype='int64')

## Descriptive Statistics

In [88]:
df2

Unnamed: 0,state,pop,year,rainfall
0,ohio,2000,1990,50.0
1,oregon,2002,1991,
2,texas,2003,1992,60.0
3,oregon,2002,1991,
4,texas,2003,1992,70.0


In [87]:
df2.describe()

Unnamed: 0,pop,year,rainfall
count,5.0,5.0,3.0
mean,2002.0,1991.2,60.0
std,1.224745,0.83666,10.0
min,2000.0,1990.0,50.0
25%,2002.0,1991.0,55.0
50%,2002.0,1991.0,60.0
75%,2003.0,1992.0,65.0
max,2003.0,1992.0,70.0


In [89]:
df2.sum()

state       ohiooregontexasoregontexas
pop                              10010
year                              9956
rainfall                           180
dtype: object

In [95]:
df2.sum(axis = 1)

0    4040.0
1    3993.0
2    4055.0
3    3993.0
4    4065.0
dtype: float64

In [96]:
df2.mean()

pop         2002.0
year        1991.2
rainfall      60.0
dtype: float64

In [97]:
df2.min()

state       ohio
pop         2000
year        1990
rainfall      50
dtype: object

In [98]:
df2.max()

state       texas
pop          2003
year         1992
rainfall       70
dtype: object

In [99]:
df2.rainfall.median()

60.0