In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mixing ints with one float makes the whole Series float64.
pandas_series = pd.Series(data=[4, 8, 8.1, -4])
pandas_series

0    4.0
1    8.0
2    8.1
3   -4.0
dtype: float64

In [3]:
# Same values, this time labelled with an explicit (unsorted) string index.
pandas_series = pd.Series(data=[4, 8, 8.1, -4], index=list("acbd"))
pandas_series

a    4.0
c    8.0
b    8.1
d   -4.0
dtype: float64

In [4]:
# Look up a single value by its index label.
pandas_series.loc["a"]

4.0

In [5]:
# Positional access: the index holds string labels, so plain [0] relies on a
# deprecated integer fallback. .iloc states the positional intent explicitly.
pandas_series.iloc[0]

4.0

In [6]:
# Keep only the strictly positive entries via a boolean mask.
pandas_series.loc[pandas_series.gt(0)]

a    4.0
c    8.0
b    8.1
dtype: float64

In [7]:
# 20 rows of standard-normal draws, one column per letter A-E.
dataframe = pd.DataFrame(np.random.randn(20, 5), columns=list('ABCDE'))

In [8]:
# First five rows. print() as a function works on both Python 2 and 3.
print(dataframe.head())

          A         B         C         D         E
0 -0.290419  1.561366  0.444725 -1.740062 -0.862661
1 -0.526600  1.008816  1.122681  0.273012 -1.421712
2  0.213617 -0.542714 -0.303236  0.023847  0.591010
3  0.024423 -0.847366  0.619725 -0.950476  0.283424
4  0.564569 -1.047588  0.501380  0.803968  0.247216


In [13]:
# Last two rows.
print(dataframe.tail(n=2))

           A         B         C         D         E
18 -0.346790  1.755708 -0.454302  0.402435  1.254876
19 -0.708112  1.488652 -1.319781 -0.605020  1.189411


In [11]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
print(dataframe.describe())

               A          B          C          D          E
count  20.000000  20.000000  20.000000  20.000000  20.000000
mean    0.007123   0.185548   0.206357  -0.171091   0.043229
std     0.838434   1.092425   0.790561   1.005419   0.779385
min    -1.287685  -1.576960  -1.319781  -1.886101  -1.421712
25%    -0.571664  -0.618877  -0.244311  -1.030358  -0.601263
50%    -0.132998   0.335455   0.321518   0.117263   0.093881
75%     0.554172   1.040512   0.597837   0.476183   0.607205
max     1.611112   1.913261   1.881350   1.450415   1.254876


In [16]:
# Rows ordered by ascending value of column 'C'; show the five smallest.
print(dataframe.sort_values(by='C').head())

           A         B         C         D         E
19 -0.708112  1.488652 -1.319781 -0.605020  1.189411
6   1.611112 -0.217754 -1.309464  0.288274  0.836276
5  -1.287685 -1.195054 -0.465735 -1.361281 -0.059454
18 -0.346790  1.755708 -0.454302  0.402435  1.254876
2   0.213617 -0.542714 -0.303236  0.023847  0.591010


In [18]:
# Rows in descending index order; show the first five of that ordering.
print(dataframe.sort_index(ascending=False).head())

           A         B         C         D         E
19 -0.708112  1.488652 -1.319781 -0.605020  1.189411
18 -0.346790  1.755708 -0.454302  0.402435  1.254876
17 -0.706858  0.514375  1.186717  0.753754  1.234439
16  1.343936  1.913261  0.527243 -0.364412 -0.760375
15 -0.903748 -0.259614  0.590542 -1.270001 -0.259422


In [28]:
# Selecting a single column by name yields a Series.
print(dataframe['A'].head())

0   -0.290419
1   -0.526600
2    0.213617
3    0.024423
4    0.564569
Name: A, dtype: float64


In [30]:
# A slice inside [] selects rows by position (end-exclusive): rows 1 and 2.
print(dataframe[1:3])

          A         B         C         D         E
1 -0.526600  1.008816  1.122681  0.273012 -1.421712
2  0.213617 -0.542714 -0.303236  0.023847  0.591010


In [32]:
# A list of column names selects a sub-frame with just those columns.
print(dataframe[['A', 'B']].head())

          A         B
0 -0.290419  1.561366
1 -0.526600  1.008816
2  0.213617 -0.542714
3  0.024423 -0.847366
4  0.564569 -1.047588


In [116]:
# Small frame with a non-unique string index ('aa' appears twice), used by the
# indexing examples that follow.
df = pd.DataFrame({'A': [4, 5, 6, 7],
                   'B': [10, 20, 30, 40],
                   'C': [100, 50, -30, -50]})
df.index = ['aa', 'aa', 'bb', 'cc']
# print() as a function works on both Python 2 and 3.
print(df)

    A   B    C
aa  4  10  100
aa  5  20   50
bb  6  30  -30
cc  7  40  -50


In [121]:
# Label-based indexing: .loc returns every row whose index label matches,
# so the duplicated label 'aa' yields two rows.
print(df.loc['aa'])

    A   B    C
aa  4  10  100
aa  5  20   50


In [123]:
# Label-based selection of one row and a list of columns.
print(df.loc['bb', ['A', 'B']])

A     6
B    30
Name: bb, dtype: int64


In [125]:
# Positional indexing: .iloc slices by integer position (end-exclusive).
print(df.iloc[0:3])

    A   B    C
aa  4  10  100
aa  5  20   50
bb  6  30  -30


In [127]:
# Row at position 3, columns at positions 0 and 1.
print(df.iloc[3, [0, 1]])

A     7
B    40
Name: cc, dtype: int64


In [130]:
# Mixed lookup: row by position, column by label. The .ix indexer has been
# removed from pandas, so translate the column label to its position and
# use .iloc for both axes.
print(df.iloc[0, df.columns.get_loc('A')])

4


In [135]:
# Mixed lookup: rows by label, first two columns by position. The .ix indexer
# has been removed from pandas, so slice the column labels positionally and
# hand them to .loc.
print(df.loc['aa', df.columns[:2]])

    A   B
aa  4  10
aa  5  20


In [137]:
# merging datasets

In [141]:
# Two small frames sharing a 'key' column, used by the merge/concat examples.
df1 = pd.DataFrame({'key': ['a', 'b'], 'val1': [1, 2]})
df2 = pd.DataFrame({'key': ['a', 'b'], 'val2': [4, 5]})

# print() as a function works on both Python 2 and 3.
print("df1: ")
print(df1)

print("df 2")
print(df2)

df1: 
  key  val1
0   a     1
1   b     2
df 2
  key  val2
0   a     4
1   b     5


In [146]:
# Join the two frames on their shared 'key' column.
print(pd.merge(df1, df2, on='key'))

  key  val1  val2
0   a     1     4
1   b     2     5


In [149]:
# Side-by-side concatenation: columns are placed next to each other, so the
# 'key' column appears twice.
print(pd.concat([df1, df2], axis=1))

  key  val1 key  val2
0   a     1   a     4
1   b     2   b     5


In [151]:
# Stacked concatenation: rows are appended and columns missing from one
# frame are filled with NaN.
print(pd.concat([df1, df2], axis=0))

  key  val1  val2
0   a   1.0   NaN
1   b   2.0   NaN
0   a   NaN   4.0
1   b   NaN   5.0


In [152]:
# reshaping data

In [160]:
# Long-format table: one row per (Country, Year) pair with its GDPpC value.
long_data = pd.DataFrame({'Country': ['IN', 'IN', 'IN', 'SL', 'SL', 'SL'],
                          'Year': [1960, 1970, 1990, 1960, 1970, 1990],
                          'GDPpC': [83, 114, 375, 143, 183, 472]})
# print() as a function works on both Python 2 and 3.
print(long_data)

  Country  GDPpC  Year
0      IN     83  1960
1      IN    114  1970
2      IN    375  1990
3      SL    143  1960
4      SL    183  1970
5      SL    472  1990


In [163]:
# Long -> wide: one row per Country, one column per Year, cells hold GDPpC.
# Arguments are passed by keyword because recent pandas releases no longer
# accept them positionally.
print(long_data.pivot(index='Country', columns='Year', values='GDPpC'))

Year     1960  1970  1990
Country                  
IN         83   114   375
SL        143   183   472


In [164]:
# replacing values. 

In [166]:
# Series containing -999, the value the next cells replace or filter out.
data = pd.Series([1, -999, 0.5, 0.4])
# print() as a function works on both Python 2 and 3.
print(data)

0      1.0
1   -999.0
2      0.5
3      0.4
dtype: float64


In [169]:
# Returns a new Series with every -999 swapped for 0; `data` is not reassigned.
data.replace(to_replace=-999, value=0)

0    1.0
1    0.0
2    0.5
3    0.4
dtype: float64

In [172]:
# Keep only the entries strictly greater than -999.
data.loc[data.gt(-999)]

0    1.0
2    0.5
3    0.4
dtype: float64

In [177]:
# group by demo! 

In [180]:
# Two groups ('a' and 'b') with two values each, for the group-by examples.
dataframe = pd.DataFrame({'group': ['a', 'a', 'b', 'b'],
                          'value': [12, 10, 3, 6]})
# print() as a function works on both Python 2 and 3.
print(dataframe)

  group  value
0     a     12
1     a     10
2     b      3
3     b      6


In [184]:
# groupby returns a grouping object; printing it shows its repr rather than
# any data. Aggregations are computed in the following cells.
grouped = dataframe.groupby('group')
print(grouped)

<pandas.core.groupby.DataFrameGroupBy object at 0x0000000008D94E80>


In [187]:
# Mean of 'value' within each group.
print(grouped.mean())

       value
group       
a       11.0
b        4.5


In [189]:
# Sum of 'value' within each group.
print(grouped.sum())

       value
group       
a         22
b          9


In [222]:
# apply() runs an arbitrary function on each group; here np.mean.
print(grouped.apply(np.mean))

       value
group       
a       11.0
b        4.5


In [None]:
## Using apply directly on DataFrames.

In [228]:
# 20 x 4 frame of standard-normal draws with default integer column labels.
dataframe = pd.DataFrame(np.random.randn(20, 4))
# print() as a function works on both Python 2 and 3.
print(dataframe.head())

          0         1         2         3
0  0.314065  0.554754 -0.751661 -0.107533
1  0.016495 -1.300270 -0.628004  0.137929
2 -1.290527  0.019890  0.238195 -0.844908
3 -0.237241 -0.800329 -0.674857 -0.620853
4  0.255250 -0.629075 -0.287649 -1.005348


In [230]:
# Apply np.mean across the columns of each row: one result per row.
dataframe.apply(np.mean, axis='columns')

0     0.002406
1    -0.443462
2    -0.469337
3    -0.583320
4    -0.416705
5     0.495137
6     0.065748
7    -0.186396
8     0.213988
9    -0.088513
10    0.021008
11    0.151684
12   -0.157689
13   -0.044400
14    0.469043
15    0.231060
16   -0.167028
17    0.508348
18    0.016704
19   -0.777120
dtype: float64

In [231]:
 dataframe.apply(np.mean, axis='index')  # one result per column

0    0.220662
1   -0.041114
2   -0.175147
3   -0.236170
dtype: float64

In [232]:
## Lambda (anonymous) functions.


In [235]:
# Anonymous two-argument function bound to a name, then called.
f = lambda x, y: (x + y)
f(1, 5)

6

In [247]:
# Centre each column by subtracting its own mean (axis=0 hands the lambda
# one column at a time).
print(dataframe.apply(lambda x: x - np.mean(x), axis=0))

           0         1         2         3
0   0.093403  0.595867 -0.576514  0.128638
1  -0.204167 -1.259156 -0.452857  0.374100
2  -1.511188  0.061004  0.413342 -0.608738
3  -0.457903 -0.759215 -0.499710 -0.384683
4   0.034589 -0.587961 -0.112502 -0.769177
5   0.311239  0.215096  0.901278  0.784706
6   0.117941  0.489468  0.464024 -0.576673
7  -1.353063  0.060957  1.342331 -0.564042
8   1.730509  0.471536 -1.446010  0.331686
9  -0.191989 -0.560614  0.310937  0.319382
10  0.590336  0.179269  0.802486 -1.256292
11  1.349860 -0.133164  0.002676 -0.380866
12 -0.478232  0.905232 -1.055753  0.229768
13  0.388236 -1.322808 -1.037252  2.025995
14 -0.123654  0.597152  1.143754  0.490692
15  1.164145  0.388809 -1.528974  1.132029
16 -0.724733  1.088619 -0.123876 -0.676355
17  0.832170 -0.151447  1.062346  0.522092
18 -1.347605  0.513649  0.943147  0.189392
19 -0.219893 -0.792292 -0.552873 -1.311654
