In [3]:
import pandas as pd
import numpy as np

In [4]:
np.random.randn(2,3)   # 2x3 array of standard-normal samples; values change on every run unless a seed was set earlier

array([[ 0.27136293,  1.08026655,  0.40101736],
       [-0.18880841, -0.30730659, -0.80242815]])

# 1.Create Series
Using **pandas**, it is possible to create a Series, a specific data type in *pandas* that is similar to a Python **dictionary**: a **one-dimensional** labeled array capable of holding any data type.

In [5]:
# Build a one-dimensional labelled Series: four integer values keyed by 'a'..'d'.
s = pd.Series([3, -5, 7, 4], index=list('abcd'))
print(s)

a    3
b   -5
c    7
d    4
dtype: int64


## 1-1.Data Selection

In [6]:
print(s['b'])   # Look up a single value by its index label

-5


# 2.Create DataFrame
A **two-dimensional** labeled data structure with columns of potentially different types. It is like a *2-D* array.

In [7]:
# Column-oriented records: each key is a column name, each list holds that column's values.
data = dict(
    Country=['Belgium', 'India', 'Brazil'],
    Capital=['Brussels', 'New Delhi', 'Brasília'],
    Population=[11190846, 1303171035, 207847528],
)

In [8]:
type(data)

dict

In [9]:
print(data)

{'Country': ['Belgium', 'India', 'Brazil'], 'Capital': ['Brussels', 'New Delhi', 'Brasília'], 'Population': [11190846, 1303171035, 207847528]}


In [10]:
# Build a DataFrame from the dict; `columns` lists which keys to use and in what order.
df = pd.DataFrame(data=data, columns=["Country", "Capital","Population"])

In [11]:
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [12]:
# 4 samples x 10 features of standard-normal noise, labelled sam0..sam3 and feat0..feat9.
df1 = pd.DataFrame(
    np.random.randn(4, 10),
    index=list(map("sam{}".format, range(4))),
    columns=list(map("feat{}".format, range(10))),
)

In [13]:
df1

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam0,0.144098,0.511253,0.662973,-0.221964,0.109188,-0.474374,0.415145,1.227298,-0.129611,1.24055
sam1,0.479327,-0.334585,-1.048048,0.779508,-1.960056,-1.711061,-1.566406,0.321689,1.354121,-0.784822
sam2,1.439213,0.215784,-0.380879,-0.847516,1.592018,0.057773,1.483329,1.696284,0.933079,-0.033022
sam3,0.621442,-0.248484,-0.765125,1.043017,-1.600819,0.698592,0.770725,0.328838,-1.167603,0.467831


## 2-1. Data Selection

In [14]:
df1.iloc[1:2]   # Select rows of the DataFrame by position (here: only the second row)

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam1,0.479327,-0.334585,-1.048048,0.779508,-1.960056,-1.711061,-1.566406,0.321689,1.354121,-0.784822


## 2-2. Selecting, Boolean Indexing & Setting

### 2-2-1. By Position

In [15]:
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [16]:
df.iloc[0,0]   # Select a single value by integer position (row 0, column 0)

'Belgium'

In [17]:
df.iat[0,0]    # Same positional lookup in the DataFrame; .iat is the scalar-only accessor, faster than .iloc

'Belgium'

### 2-2-2. By Label

In [18]:
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [19]:
df.loc[0,'Country']   # Select a single value by row label and column label

'Belgium'

In [20]:
df.at[0,'Country']    # Same label-based lookup using the scalar-only accessor .at

'Belgium'

In [21]:
df1

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam0,0.144098,0.511253,0.662973,-0.221964,0.109188,-0.474374,0.415145,1.227298,-0.129611,1.24055
sam1,0.479327,-0.334585,-1.048048,0.779508,-1.960056,-1.711061,-1.566406,0.321689,1.354121,-0.784822
sam2,1.439213,0.215784,-0.380879,-0.847516,1.592018,0.057773,1.483329,1.696284,0.933079,-0.033022
sam3,0.621442,-0.248484,-0.765125,1.043017,-1.600819,0.698592,0.770725,0.328838,-1.167603,0.467831


In [22]:
df1.loc['sam0','feat0']

0.14409766224986703

### 2-2-3. Select Single Row

In [23]:
df.iloc[2]

Country          Brazil
Capital        Brasília
Population    207847528
Name: 2, dtype: object

In [24]:
df1.iloc[2]

feat0    1.439213
feat1    0.215784
feat2   -0.380879
feat3   -0.847516
feat4    1.592018
feat5    0.057773
feat6    1.483329
feat7    1.696284
feat8    0.933079
feat9   -0.033022
Name: sam2, dtype: float64

### 2-2-4. Select Single Column

In [25]:
df.loc[:, 'Capital']    # Label-based column selection; .loc replaces the .ix indexer, which was removed in pandas 1.0


0     Brussels
1    New Delhi
2     Brasília
Name: Capital, dtype: object

In [26]:
df.loc[:,'Capital']

0     Brussels
1    New Delhi
2     Brasília
Name: Capital, dtype: object

### 2-2-5. Select Single Row and Column

In [27]:
df.loc[0, 'Country']   # One cell by row label and column label; .loc replaces .ix, which was removed in pandas 1.0

'Belgium'

### 2-2-6. Select Multiple Rows and Columns

<img src="files/indexing.png">

In [28]:
df['Country']

0    Belgium
1      India
2     Brazil
Name: Country, dtype: object

In [29]:
df['Population']

0      11190846
1    1303171035
2     207847528
Name: Population, dtype: int64

In [30]:
df[['Country','Population']]

Unnamed: 0,Country,Population
0,Belgium,11190846
1,India,1303171035
2,Brazil,207847528


In [31]:
df.loc[:,['Country']]

Unnamed: 0,Country
0,Belgium
1,India
2,Brazil


In [32]:
df.loc[1:,['Country']]   # Rows from label 1 onward; passing the column as a list keeps the result a DataFrame

Unnamed: 0,Country
1,India
2,Brazil


In [33]:
df.loc[1:,['Country','Population']]

Unnamed: 0,Country,Population
1,India,1303171035
2,Brazil,207847528


In [34]:
df[:2]   # A plain slice inside [] selects rows: here the first two

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035


In [35]:
df[::-1]   # A step of -1 returns the rows in reverse order

Unnamed: 0,Country,Capital,Population
2,Brazil,Brasília,207847528
1,India,New Delhi,1303171035
0,Belgium,Brussels,11190846


### 2-2-7. Boolean Indexing

In [36]:
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


In [37]:
df[df['Population']>1200000000]   # Keep only the rows where the boolean condition is True

Unnamed: 0,Country,Capital,Population
1,India,New Delhi,1303171035


## 2-2. Dropping

In [38]:
df.drop('Country',axis=1)   # axis=1 drops a column; the result is a new DataFrame and df itself is left unchanged

Unnamed: 0,Capital,Population
0,Brussels,11190846
1,New Delhi,1303171035
2,Brasília,207847528


In [39]:
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasília,207847528


## 2-3. Sorting and Ranking

In [40]:
df1

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam0,0.144098,0.511253,0.662973,-0.221964,0.109188,-0.474374,0.415145,1.227298,-0.129611,1.24055
sam1,0.479327,-0.334585,-1.048048,0.779508,-1.960056,-1.711061,-1.566406,0.321689,1.354121,-0.784822
sam2,1.439213,0.215784,-0.380879,-0.847516,1.592018,0.057773,1.483329,1.696284,0.933079,-0.033022
sam3,0.621442,-0.248484,-0.765125,1.043017,-1.600819,0.698592,0.770725,0.328838,-1.167603,0.467831


In [41]:
df1.sort_index()   # Sort rows by their index labels (already in order here, so the output matches df1)

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam0,0.144098,0.511253,0.662973,-0.221964,0.109188,-0.474374,0.415145,1.227298,-0.129611,1.24055
sam1,0.479327,-0.334585,-1.048048,0.779508,-1.960056,-1.711061,-1.566406,0.321689,1.354121,-0.784822
sam2,1.439213,0.215784,-0.380879,-0.847516,1.592018,0.057773,1.483329,1.696284,0.933079,-0.033022
sam3,0.621442,-0.248484,-0.765125,1.043017,-1.600819,0.698592,0.770725,0.328838,-1.167603,0.467831


In [42]:
df1.sort_values('feat1')   # Sort rows by the values in feat1, ascending; df1 itself is not modified

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam1,0.479327,-0.334585,-1.048048,0.779508,-1.960056,-1.711061,-1.566406,0.321689,1.354121,-0.784822
sam3,0.621442,-0.248484,-0.765125,1.043017,-1.600819,0.698592,0.770725,0.328838,-1.167603,0.467831
sam2,1.439213,0.215784,-0.380879,-0.847516,1.592018,0.057773,1.483329,1.696284,0.933079,-0.033022
sam0,0.144098,0.511253,0.662973,-0.221964,0.109188,-0.474374,0.415145,1.227298,-0.129611,1.24055


In [43]:
df1

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam0,0.144098,0.511253,0.662973,-0.221964,0.109188,-0.474374,0.415145,1.227298,-0.129611,1.24055
sam1,0.479327,-0.334585,-1.048048,0.779508,-1.960056,-1.711061,-1.566406,0.321689,1.354121,-0.784822
sam2,1.439213,0.215784,-0.380879,-0.847516,1.592018,0.057773,1.483329,1.696284,0.933079,-0.033022
sam3,0.621442,-0.248484,-0.765125,1.043017,-1.600819,0.698592,0.770725,0.328838,-1.167603,0.467831


## 2-4. Retrieving Series/DataFrame Information

### 2-4-1. Basic Information

In [44]:
df1.shape

(4, 10)

In [45]:
df1.index

Index(['sam0', 'sam1', 'sam2', 'sam3'], dtype='object')

In [46]:
df1.columns

Index(['feat0', 'feat1', 'feat2', 'feat3', 'feat4', 'feat5', 'feat6', 'feat7',
       'feat8', 'feat9'],
      dtype='object')

In [47]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, sam0 to sam3
Data columns (total 10 columns):
feat0    4 non-null float64
feat1    4 non-null float64
feat2    4 non-null float64
feat3    4 non-null float64
feat4    4 non-null float64
feat5    4 non-null float64
feat6    4 non-null float64
feat7    4 non-null float64
feat8    4 non-null float64
feat9    4 non-null float64
dtypes: float64(10)
memory usage: 512.0+ bytes


In [48]:
df1.count()

feat0    4
feat1    4
feat2    4
feat3    4
feat4    4
feat5    4
feat6    4
feat7    4
feat8    4
feat9    4
dtype: int64

### 2-4-2. Summary

In [49]:
df1.sum()

feat0    2.684080
feat1    0.143968
feat2   -1.531078
feat3    0.753045
feat4   -1.859669
feat5   -1.429070
feat6    1.102793
feat7    3.574108
feat8    0.989987
feat9    0.890536
dtype: float64

In [50]:
df1.cumsum()   # Running total down each column; the last row equals df1.sum()

Unnamed: 0,feat0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9
sam0,0.144098,0.511253,0.662973,-0.221964,0.109188,-0.474374,0.415145,1.227298,-0.129611,1.24055
sam1,0.623425,0.176668,-0.385075,0.557544,-1.850868,-2.185435,-1.151261,1.548986,1.224511,0.455727
sam2,2.062638,0.392452,-0.765953,-0.289972,-0.258851,-2.127662,0.332068,3.24527,2.15759,0.422705
sam3,2.68408,0.143968,-1.531078,0.753045,-1.859669,-1.42907,1.102793,3.574108,0.989987,0.890536


In [51]:
df1[["feat1"]].sum()

feat1    0.143968
dtype: float64

In [52]:
df1[["feat1"]].cumsum()

Unnamed: 0,feat1
sam0,0.511253
sam1,0.176668
sam2,0.392452
sam3,0.143968


In [53]:
# Ratio of the column's largest to its smallest value (negative here because the minimum is negative).
df1[["feat1"]].max() / df1[["feat1"]].min()

feat1   -1.528021
dtype: float64

In [54]:
df1[["feat1"]].idxmin()

feat1    sam1
dtype: object

In [55]:
df1[["feat1"]].idxmax()

feat1    sam0
dtype: object

In [56]:
df1[["feat1"]].describe()

Unnamed: 0,feat1
count,4.0
mean,0.035992
std,0.398519
min,-0.334585
25%,-0.270009
50%,-0.01635
75%,0.289651
max,0.511253


In [57]:
df1[["feat1"]].mean()

feat1    0.035992
dtype: float64

In [58]:
df1[['feat1']].median()

feat1   -0.01635
dtype: float64

### Asking For Help
It is possible to find the **docstring** of a function using help()

In [59]:
help(pd.Series.loc)

Help on property:

    Purely label-location based indexer for selection by label.
    
    ``.loc[]`` is primarily label based, but may also be used with a
    boolean array.
    
    Allowed inputs are:
    
    - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
      interpreted as a *label* of the index, and **never** as an
      integer position along the index).
    - A list or array of labels, e.g. ``['a', 'b', 'c']``.
    - A slice object with labels, e.g. ``'a':'f'`` (note that contrary
      to usual python slices, **both** the start and the stop are included!).
    - A boolean array.
    - A ``callable`` function with one argument (the calling Series, DataFrame
      or Panel) and that returns valid output for indexing (one of the above)
    
    ``.loc`` will raise a ``KeyError`` when the items are not found.
    
    See more at :ref:`Selection by Label <indexing.label>`

