# 10 Minutes to pandas ([official guide](https://pandas.pydata.org/docs/user_guide/10min.html#min))

In [1]:
import pandas as pd
import numpy as np

In [8]:
# Series: a one-dimensional labelled array; np.nan marks a missing value
# and forces the whole Series to a float dtype
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [14]:
# DataFrame: the index holds the row labels (keys), the columns hold the titles
dates = pd.date_range('20130101', periods=6)  # year-month-day, one entry per day
print(dates)

print()
# Seeded local generator so that re-running the notebook reproduces the same
# values (an unseeded np.random.randn gives a different frame on every run)
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
df

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')



Unnamed: 0,A,B,C,D
2013-01-01,-1.249903,0.969349,0.247641,1.412217
2013-01-02,-0.130585,-0.913925,-1.218588,-0.215834
2013-01-03,-0.004108,0.202386,-0.058036,0.258611
2013-01-04,-0.4859,0.377014,0.130194,1.053947
2013-01-05,-0.760518,0.042109,-0.441259,0.023702
2013-01-06,0.228507,-0.115093,0.484268,-0.785581


In [19]:
# Creating a DataFrame from a dictionary: each key becomes a column name,
# and scalar values are repeated for every row
column_data = {
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype=np.int32),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
}
df2 = pd.DataFrame(column_data)

# Every column keeps its own dtype
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [54]:
# A single column comes back as a Series, whether it is reached
# as an attribute or with bracket notation
column_by_attribute = df2.C
column_by_key = df2['C']
print(column_by_attribute)
print(column_by_key)

# Row labels (index) first, then column labels; each is shown as the
# pandas Index object and again as a plain Python list
for axis_labels in (df2.index, df2.columns):
    print('\n', axis_labels)
    print(list(axis_labels))

0    1.0
1    1.0
2    1.0
3    1.0
Name: C, dtype: float32
0    1.0
1    1.0
2    1.0
3    1.0
Name: C, dtype: float32

 Int64Index([0, 1, 2, 3], dtype='int64')
[0, 1, 2, 3]

 Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
['A', 'B', 'C', 'D', 'E', 'F']


In [55]:
# Converting a DataFrame to a NumPy array
#   - when every column shares one dtype (df), the array has that same dtype
#   - when the column dtypes differ (df2), the result is an array of objects,
#     so each column has to be cast before it can be used numerically;
#     it is also a heavy operation
print(df.to_numpy())
np_object = df2.to_numpy()  # array of objects
type(np_object[0, 1])  # element from column 'B' of the first row

[[-1.24990261  0.96934885  0.24764072  1.41221738]
 [-0.13058489 -0.913925   -1.21858786 -0.2158337 ]
 [-0.00410828  0.20238575 -0.05803597  0.25861126]
 [-0.48590023  0.37701357  0.13019446  1.05394663]
 [-0.76051751  0.04210866 -0.44125856  0.02370169]
 [ 0.22850748 -0.1150933   0.48426831 -0.78558056]]


pandas._libs.tslibs.timestamps.Timestamp

In [56]:
# Summary statistics per column: count, mean, std, min, quartiles and max
summary_stats = df.describe()
summary_stats

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.400418,0.09364,-0.14263,0.291177
std,0.544797,0.619743,0.612171,0.815796
min,-1.249903,-0.913925,-1.218588,-0.785581
25%,-0.691863,-0.075793,-0.345453,-0.15595
50%,-0.308243,0.122247,0.036079,0.141156
75%,-0.035727,0.333357,0.218279,0.855113
max,0.228507,0.969349,0.484268,1.412217


In [57]:
# Transpose: rows become columns and columns become rows
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.249903,-0.130585,-0.004108,-0.4859,-0.760518,0.228507
B,0.969349,-0.913925,0.202386,0.377014,0.042109,-0.115093
C,0.247641,-1.218588,-0.058036,0.130194,-0.441259,0.484268
D,1.412217,-0.215834,0.258611,1.053947,0.023702,-0.785581


In [58]:
# Sort by axis labels (sort_index never looks at the cell values):
#   axis=0 -> reorder the rows by their index labels (keys)
#   axis=1 -> reorder the columns by their column names
# With ascending=False the columns come out in reverse label order: D, C, B, A
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.412217,0.247641,0.969349,-1.249903
2013-01-02,-0.215834,-1.218588,-0.913925,-0.130585
2013-01-03,0.258611,-0.058036,0.202386,-0.004108
2013-01-04,1.053947,0.130194,0.377014,-0.4859
2013-01-05,0.023702,-0.441259,0.042109,-0.760518
2013-01-06,-0.785581,0.484268,-0.115093,0.228507


In [59]:
# Order the rows by the values held in one specific column (smallest first)
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-02,-0.130585,-0.913925,-1.218588,-0.215834
2013-01-06,0.228507,-0.115093,0.484268,-0.785581
2013-01-05,-0.760518,0.042109,-0.441259,0.023702
2013-01-03,-0.004108,0.202386,-0.058036,0.258611
2013-01-04,-0.4859,0.377014,0.130194,1.053947
2013-01-01,-1.249903,0.969349,0.247641,1.412217


# Selection

In [60]:
# Pulling out one column returns a Series; bracket and attribute
# access give the same result
column_by_key = df['A']
column_by_attribute = df.A
print(column_by_key)
print(column_by_attribute)

2013-01-01   -1.249903
2013-01-02   -0.130585
2013-01-03   -0.004108
2013-01-04   -0.485900
2013-01-05   -0.760518
2013-01-06    0.228507
Freq: D, Name: A, dtype: float64
2013-01-01   -1.249903
2013-01-02   -0.130585
2013-01-03   -0.004108
2013-01-04   -0.485900
2013-01-05   -0.760518
2013-01-06    0.228507
Freq: D, Name: A, dtype: float64


In [66]:
# Selecting rows with a slice inside [] returns a DataFrame.
# NOTE: a scalar such as df[0] is treated as a column lookup and raises a
# KeyError on this frame; use df.iloc[0] (position) or df.loc[label] instead.
# The first slice is printed explicitly: a notebook cell only displays its
# last expression, so a bare df[0:3] here would be silently discarded.
print(df[0:3])  # positional slice, the end (3) is not included, as in plain Python
df['20130101':'20130103']  # label slice, the end label IS included

Unnamed: 0,A,B,C,D
2013-01-01,-1.249903,0.969349,0.247641,1.412217
2013-01-02,-0.130585,-0.913925,-1.218588,-0.215834
2013-01-03,-0.004108,0.202386,-0.058036,0.258611


In [68]:
# Positional slice holding only the first row (still a DataFrame, not a Series)
df[0:1]

Unnamed: 0,A,B,C,D
2013-01-01,-1.249903,0.969349,0.247641,1.412217


In [85]:
# Selecting rows by label with .loc returns:
#   a Series    when given a single index label
#   a DataFrame when given several index labels
indcies = df.index
first_label = indcies[0]
print(df.loc[first_label])  # a Series
df.loc[indcies[:3]]  # a DataFrame

A   -1.249903
B    0.969349
C    0.247641
D    1.412217
Name: 2013-01-01 00:00:00, dtype: float64


Unnamed: 0,A,B,C,D
2013-01-01,-1.249903,0.969349,0.247641,1.412217
2013-01-02,-0.130585,-0.913925,-1.218588,-0.215834
2013-01-03,-0.004108,0.202386,-0.058036,0.258611


In [87]:
# Selecting on both axes by label: .loc takes (row labels, column labels);
# ':' keeps every row while the list names the columns to keep
wanted_columns = ['A', 'B']
df.loc[:, wanted_columns]

Unnamed: 0,A,B
2013-01-01,-1.249903,0.969349
2013-01-02,-0.130585,-0.913925
2013-01-03,-0.004108,0.202386
2013-01-04,-0.4859,0.377014
2013-01-05,-0.760518,0.042109
2013-01-06,0.228507,-0.115093


In [96]:
# Selecting by index labels and column labels at once:
# .loc takes (row label slice, list of columns); both slice endpoints are included
df.loc['20130101':'20130103', ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.249903,0.969349
2013-01-02,-0.130585,-0.913925
2013-01-03,-0.004108,0.202386
