# 10 Minutes to pandas [link](https://pandas.pydata.org/docs/user_guide/10min.html#min)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Series: a one-dimensional labelled array; np.nan marks the missing entry
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [3]:
# DataFrame: the index labels the rows (keys), the columns label the fields (titles)
dates = pd.date_range(start='20130101', periods=6)  # six consecutive days, year-month-day
print(dates)

print()
df = pd.DataFrame(
    data=np.random.randn(6, 4),  # 6 rows x 4 columns of random samples (not seeded)
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')



Unnamed: 0,A,B,C,D
2013-01-01,0.368141,0.050987,0.283732,0.024102
2013-01-02,1.453156,-0.303845,-0.325767,-0.091905
2013-01-03,-0.489373,1.959238,0.28792,0.8343
2013-01-04,0.775866,-1.087637,-0.134552,-0.105819
2013-01-05,-0.436288,-0.913931,0.205661,1.943312
2013-01-06,-1.056582,-0.936091,0.494788,-1.429302


In [4]:
# Build a DataFrame from a dictionary: every key becomes a column and
# scalar values are repeated for each row
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.full(4, 3, dtype=np.int32),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
# dtype of each DataFrame column
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [5]:
# a single column comes back as a Series, via attribute access or via [] lookup
print(df2.C, df2['C'], sep='\n')

# row labels (the index / keys)
print('\n', df2.index)
print(df2.index.tolist())

# column labels
print('\n', df2.columns)
print(df2.columns.tolist())

0    1.0
1    1.0
2    1.0
3    1.0
Name: C, dtype: float32
0    1.0
1    1.0
2    1.0
3    1.0
Name: C, dtype: float32

 Int64Index([0, 1, 2, 3], dtype='int64')
[0, 1, 2, 3]

 Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
['A', 'B', 'C', 'D', 'E', 'F']


In [6]:
# converting a DataFrame to a NumPy array
#   - if every column has the same dtype, the result is an array of that dtype
#   - if the column dtypes differ, the result is an array of objects, so each
#     value has to be cast before it can be used; it is also a heavy operation
print(df.to_numpy())
np_object = df2.to_numpy()  # mixed column dtypes -> array of objects
type(np_object[0, 1])  # first row, second column: the element keeps its pandas type

[[ 0.36814091  0.05098719  0.28373174  0.02410217]
 [ 1.45315637 -0.30384482 -0.32576748 -0.09190474]
 [-0.4893726   1.95923781  0.2879202   0.83429992]
 [ 0.77586633 -1.08763723 -0.13455157 -0.10581946]
 [-0.43628811 -0.9139313   0.20566055  1.94331158]
 [-1.05658184 -0.93609149  0.49478791 -1.4293016 ]]


pandas._libs.tslibs.timestamps.Timestamp

In [7]:
# describing the data: per-column summary statistics
# (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.102487,-0.205213,0.135297,0.195781
std,0.930842,1.146765,0.30493,1.123002
min,-1.056582,-1.087637,-0.325767,-1.429302
25%,-0.476101,-0.930551,-0.049499,-0.102341
50%,-0.034074,-0.608888,0.244696,-0.033901
75%,0.673935,-0.037721,0.286873,0.63175
max,1.453156,1.959238,0.494788,1.943312


In [8]:
# transposing: rows and columns swap, so the dates become the column labels
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.368141,1.453156,-0.489373,0.775866,-0.436288,-1.056582
B,0.050987,-0.303845,1.959238,-1.087637,-0.913931,-0.936091
C,0.283732,-0.325767,0.28792,-0.134552,0.205661,0.494788
D,0.024102,-0.091905,0.8343,-0.105819,1.943312,-1.429302


In [9]:
# sort by the labels along one axis (the cell values are not looked at)
#   axis=0 sorts the rows by their index labels (keys)
#   axis=1 sorts the columns by their column names
# here the columns are put in descending name order: D, C, B, A
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.024102,0.283732,0.050987,0.368141
2013-01-02,-0.091905,-0.325767,-0.303845,1.453156
2013-01-03,0.8343,0.28792,1.959238,-0.489373
2013-01-04,-0.105819,-0.134552,-1.087637,0.775866
2013-01-05,1.943312,0.205661,-0.913931,-0.436288
2013-01-06,-1.429302,0.494788,-0.936091,-1.056582


In [10]:
# sort the rows by the values of one specific column (smallest first)
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,0.775866,-1.087637,-0.134552,-0.105819
2013-01-06,-1.056582,-0.936091,0.494788,-1.429302
2013-01-05,-0.436288,-0.913931,0.205661,1.943312
2013-01-02,1.453156,-0.303845,-0.325767,-0.091905
2013-01-01,0.368141,0.050987,0.283732,0.024102
2013-01-03,-0.489373,1.959238,0.28792,0.8343


# Selection

## Selection by label
### (i.e. index label(s) or column name(s)), e.g. `df.loc['20130101', 'A']`
### using `DataFrame.loc[]` or `DataFrame.at[]`

In [11]:
# one column -> a Series; [] lookup and attribute access give the same result
print(df['A'], df.A, sep='\n')

2013-01-01    0.368141
2013-01-02    1.453156
2013-01-03   -0.489373
2013-01-04    0.775866
2013-01-05   -0.436288
2013-01-06   -1.056582
Freq: D, Name: A, dtype: float64
2013-01-01    0.368141
2013-01-02    1.453156
2013-01-03   -0.489373
2013-01-04    0.775866
2013-01-05   -0.436288
2013-01-06   -1.056582
Freq: D, Name: A, dtype: float64


In [12]:
# selecting rows with [] slicing -> returns a DataFrame
# NOTE: df[0] is not allowed (a bare key is looked up as a column label);
#       use df.iloc[0] for one row by position, or df.loc[label] by label
# The first slice is printed because only the last expression of a cell is
# displayed automatically.
print(df[0:3])  # positional slice: end=3 is not included (normal Python slicing)
df['20130101':'20130103']  # label slice: the end label is included, unlike Python slicing

Unnamed: 0,A,B,C,D
2013-01-01,0.368141,0.050987,0.283732,0.024102
2013-01-02,1.453156,-0.303845,-0.325767,-0.091905
2013-01-03,-0.489373,1.959238,0.28792,0.8343


In [13]:
# a one-row slice still returns a DataFrame
df[:1]

Unnamed: 0,A,B,C,D
2013-01-01,0.368141,0.050987,0.283732,0.024102


In [14]:
# selecting rows by label with .loc returns
#   - a Series when given a single index label
#   - a DataFrame when given multiple index labels
indcies = df.index
print(df.loc[indcies[0]])  # one label -> a Series
df.loc[indcies[0:3]]  # several labels -> a DataFrame

A    0.368141
B    0.050987
C    0.283732
D    0.024102
Name: 2013-01-01 00:00:00, dtype: float64


Unnamed: 0,A,B,C,D
2013-01-01,0.368141,0.050987,0.283732,0.024102
2013-01-02,1.453156,-0.303845,-0.325767,-0.091905
2013-01-03,-0.489373,1.959238,0.28792,0.8343


In [15]:
# selecting on both axes by label: every row, columns A and B only
df.loc[:,['A', 'B']] # input is (row labels, list of column labels)

Unnamed: 0,A,B
2013-01-01,0.368141,0.050987
2013-01-02,1.453156,-0.303845
2013-01-03,-0.489373,1.959238
2013-01-04,0.775866,-1.087637
2013-01-05,-0.436288,-0.913931
2013-01-06,-1.056582,-0.936091


In [18]:
# selecting by a slice of index labels and a list of columns
# (the end label of the slice is included in the result)
df.loc['20130101': '20130104',['A', 'B']] # input is (row label slice, list of column labels)

Unnamed: 0,A,B
2013-01-01,0.368141,0.050987
2013-01-02,1.453156,-0.303845
2013-01-03,-0.489373,1.959238
2013-01-04,0.775866,-1.087637


In [19]:
# selecting a specific set of rows (only the listed labels, not the range between them)
df.loc[['20130101', '20130104'],['A', 'B']] # input is (list of row labels, list of column labels)

Unnamed: 0,A,B
2013-01-01,0.368141,0.050987
2013-01-04,0.775866,-1.087637


In [21]:
# getting a single cell: one row label plus one column label gives a scalar
val = df.loc['20130101', 'A']
print(type(val))
val

<class 'numpy.float64'>


0.3681409078108381

In [22]:
# .at: fast access to a single cell by label, same value as the .loc lookup above
df.at['20130101', 'A']

0.3681409078108381

## Selecting by position
## (i.e. integer position(s) of the rows and columns), e.g. `df.iloc[0, 0]`
### using `DataFrame.iloc[]` or `DataFrame.iat[]`

In [23]:
# selecting by position with .iloc returns
#   - a Series when a single row is selected
#   - a DataFrame when multiple rows or columns are selected
df.iloc[3]  # fourth row (positions start at 0)

A    0.775866
B   -1.087637
C   -0.134552
D   -0.105819
Name: 2013-01-04 00:00:00, dtype: float64

In [26]:
# selecting multiple rows and columns using integer slicing:
# the first four rows and the first three columns (end positions are excluded)
df.iloc[:4, :3]

Unnamed: 0,A,B,C
2013-01-01,0.368141,0.050987,0.283732
2013-01-02,1.453156,-0.303845,-0.325767
2013-01-03,-0.489373,1.959238,0.28792
2013-01-04,0.775866,-1.087637,-0.134552


In [27]:
# specific rows and columns, picked by lists of integer positions
df.iloc[[0, 4], [0, 3]]

Unnamed: 0,A,D
2013-01-01,0.368141,0.024102
2013-01-05,-0.436288,1.943312


In [30]:
# specific rows and all columns
df.iloc[[0, 4], :]

Unnamed: 0,A,B,C,D
2013-01-01,0.368141,0.050987,0.283732,0.024102
2013-01-05,-0.436288,-0.913931,0.205661,1.943312


In [33]:
# all rows and specific columns
df.iloc[:, [0, 3]]

Unnamed: 0,A,D
2013-01-01,0.368141,0.024102
2013-01-02,1.453156,-0.091905
2013-01-03,-0.489373,0.8343
2013-01-04,0.775866,-0.105819
2013-01-05,-0.436288,1.943312
2013-01-06,-1.056582,-1.429302


In [36]:
# access a single cell (i.e. a scalar) by row position and column position
df.iloc[0, 0]

0.3681409078108381

In [37]:
# .iat: fast access to a scalar by position
df.iat[0, 0]

0.3681409078108381

## Boolean Indexing (same as numpy)

In [50]:
# boolean indexing on a plain NumPy array: a True/False mask of the same
# shape selects the matching values and returns them as a flat 1-D array
x = np.random.rand(4, 4, 3)  # random values, not seeded
print(x)
above_threshold = x > .9
x[above_threshold]

[[[0.01304764 0.04824457 0.20301216]
  [0.06254942 0.5453693  0.30290542]
  [0.41711243 0.46400275 0.20920544]
  [0.95085339 0.94742904 0.68949172]]

 [[0.74238752 0.1354469  0.783307  ]
  [0.78105288 0.1814457  0.51661039]
  [0.75502123 0.85303588 0.1865035 ]
  [0.2685296  0.29158585 0.27267997]]

 [[0.96542622 0.19889912 0.61688856]
  [0.83452814 0.9347268  0.24858218]
  [0.65217638 0.23335866 0.51743565]
  [0.59888006 0.60215339 0.71273818]]

 [[0.72222566 0.80660856 0.83924984]
  [0.96910775 0.49223105 0.07403595]
  [0.37179724 0.45649831 0.95658191]
  [0.65061053 0.81759832 0.08772979]]]


array([0.95085339, 0.94742904, 0.96542622, 0.9347268 , 0.96910775,
       0.95658191])