# 10 Minutes to Pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A Series built from a plain list gets a default integer index; the NaN
# entry makes the values float64.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [50]:
# Six consecutive calendar days starting at 2013-01-01.
dates = pd.date_range(
    start="20130101",
    periods=6,
)
dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [51]:
# 6x4 frame of standard-normal values, indexed by `dates`, columns A-D.
# A seeded local generator keeps the values identical on every
# Restart & Run All; re-run the notebook once to refresh the saved outputs.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)
df


Unnamed: 0,A,B,C,D
2013-01-01,-0.303828,0.373599,0.975724,-2.17955
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275
2013-01-05,-0.521499,-0.195446,0.069481,-0.589345
2013-01-06,0.398301,-0.247039,-1.165883,2.162062


In [52]:
# A frame built from a dict: scalars are repeated for every row, while the
# Series / array / Categorical entries supply four values each. Every column
# keeps its own dtype.
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(data=1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [53]:
# Each column of a DataFrame can hold its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [54]:
# Concise summary: index type, per-column non-null counts and dtypes, memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


In [55]:
# Row labels of df (the DatetimeIndex it was built with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [56]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [57]:
# Values of df as a plain NumPy array; the index and column labels are not included.
df.to_numpy()

array([[-0.30382842,  0.37359939,  0.97572421, -2.17954956],
       [ 0.68808721,  0.67864075, -2.41076816,  0.21360203],
       [-1.18870193, -1.49897287, -0.13787466, -1.88121303],
       [ 1.02749416, -0.85092644, -0.72740853, -1.13274971],
       [-0.5214994 , -0.19544625,  0.06948117, -0.58934489],
       [ 0.39830079, -0.24703926, -1.16588329,  2.16206209]])

In [58]:
# Show df again for reference.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.303828,0.373599,0.975724,-2.17955
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275
2013-01-05,-0.521499,-0.195446,0.069481,-0.589345
2013-01-06,0.398301,-0.247039,-1.165883,2.162062


In [59]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.016642,-0.290024,-0.566122,-0.567866
std,0.832283,0.795697,1.162164,1.594061
min,-1.188702,-1.498973,-2.410768,-2.17955
25%,-0.467082,-0.699955,-1.056265,-1.694097
50%,0.047236,-0.221243,-0.432642,-0.861047
75%,0.615641,0.231338,0.017642,0.012865
max,1.027494,0.678641,0.975724,2.162062


In [60]:
# Transpose: the column labels become the index and the dates become columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.303828,0.688087,-1.188702,1.027494,-0.521499,0.398301
B,0.373599,0.678641,-1.498973,-0.850926,-0.195446,-0.247039
C,0.975724,-2.410768,-0.137875,-0.727409,0.069481,-1.165883
D,-2.17955,0.213602,-1.881213,-1.13275,-0.589345,2.162062


In [61]:
# Order the rows by their index labels, smallest first.
df.sort_index(
    axis="index",
    ascending=True,
)

Unnamed: 0,A,B,C,D
2013-01-01,-0.303828,0.373599,0.975724,-2.17955
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275
2013-01-05,-0.521499,-0.195446,0.069481,-0.589345
2013-01-06,0.398301,-0.247039,-1.165883,2.162062


In [62]:
# Order the rows by the values in column "A", smallest first.
df.sort_values(
    by="A",
    ascending=True,
)

Unnamed: 0,A,B,C,D
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-05,-0.521499,-0.195446,0.069481,-0.589345
2013-01-01,-0.303828,0.373599,0.975724,-2.17955
2013-01-06,0.398301,-0.247039,-1.165883,2.162062
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275


In [63]:
# A list of column labels inside [] selects those columns as a new DataFrame.
df[["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.303828,0.373599
2013-01-02,0.688087,0.678641
2013-01-03,-1.188702,-1.498973
2013-01-04,1.027494,-0.850926
2013-01-05,-0.521499,-0.195446
2013-01-06,0.398301,-0.247039


In [64]:
# First four rows by integer position (the end of the slice is excluded).
# .iloc makes the positional intent explicit.
df.iloc[0:4]

Unnamed: 0,A,B,C,D
2013-01-01,-0.303828,0.373599,0.975724,-2.17955
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275


In [65]:
# Slice rows by date label; both endpoints are included in the result.
df.loc["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275


In [66]:
# Number of axes (2 for a DataFrame: rows and columns).
df.ndim

2

In [67]:
# (number of rows, number of columns)
df.shape

(6, 4)

In [68]:
# Show the date labels that form df's index; they are used for label lookups.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [71]:
# Scalar lookup by label: the row for the second date, column "A".
df.loc[dates[1], "A"]


0.68808721042616

In [70]:
# Show df again for reference.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.303828,0.373599,0.975724,-2.17955
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275
2013-01-05,-0.521499,-0.195446,0.069481,-0.589345
2013-01-06,0.398301,-0.247039,-1.165883,2.162062


In [72]:
# Select one row by its label; the result is a Series indexed by column name.
df.loc[dates[0]]

A   -0.303828
B    0.373599
C    0.975724
D   -2.179550
Name: 2013-01-01 00:00:00, dtype: float64

In [76]:
# All rows (":"), restricted to columns "A" and "B" by label.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.303828,0.373599
2013-01-02,0.688087,0.678641
2013-01-03,-1.188702,-1.498973
2013-01-04,1.027494,-0.850926
2013-01-05,-0.521499,-0.195446
2013-01-06,0.398301,-0.247039


In [77]:
# Label slices include both endpoints, so 2013-01-04 is part of the result.
df.loc["20130102":"20130104", ["A", "B"]]


Unnamed: 0,A,B
2013-01-02,0.688087,0.678641
2013-01-03,-1.188702,-1.498973
2013-01-04,1.027494,-0.850926


In [78]:
# A single row label plus a list of columns returns a Series.
df.loc["20130102", ["A", "B"]]

A    0.688087
B    0.678641
Name: 2013-01-02 00:00:00, dtype: float64

In [79]:
# Select one row by integer position (position 3 is the fourth row).
df.iloc[3]

A    1.027494
B   -0.850926
C   -0.727409
D   -1.132750
Name: 2013-01-04 00:00:00, dtype: float64

In [80]:
# Scalar access by label with .at (row label, column label).
df.at[dates[0], "A"]

-0.30382842350128264

In [81]:
# Positional slices exclude the end: rows at positions 3-4, columns at positions 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.027494,-0.850926
2013-01-05,-0.521499,-0.195446


In [82]:
# Show df again for reference.
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.303828,0.373599,0.975724,-2.17955
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275
2013-01-05,-0.521499,-0.195446,0.069481,-0.589345
2013-01-06,0.398301,-0.247039,-1.165883,2.162062


In [83]:
# Lists of integer positions pick specific rows (1, 2, 4) and columns (0, 2).
df.iloc[[1, 2, 4], [0, 2]]


Unnamed: 0,A,C
2013-01-02,0.688087,-2.410768
2013-01-03,-1.188702,-0.137875
2013-01-05,-0.521499,0.069481


In [84]:
# Rows at positions 1-2 with every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-03,-1.188702,-1.498973,-0.137875,-1.881213


In [85]:
# NOTE(review): imported mid-notebook; consider moving this to the import cell at the top.
import seaborn as sns

In [86]:
# Load seaborn's "titanic" example dataset into a DataFrame.
# NOTE(review): presumably fetched/cached on first use — confirm it is available offline.
titanic = sns.load_dataset("titanic")

In [87]:
# Preview the first five rows of the dataset.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [88]:
# Draw 100 random rows; random_state pins the draw so the same rows come
# back on every run.
titanic.sample(100, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
866,1,2,female,27.0,1,0,13.8583,C,Second,woman,False,,Cherbourg,yes,False
524,0,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
608,1,2,female,22.0,1,2,41.5792,C,Second,woman,False,,Cherbourg,yes,False
724,1,1,male,27.0,1,0,53.1000,S,First,man,True,E,Southampton,yes,False
482,0,3,male,50.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
840,0,3,male,20.0,0,0,7.9250,S,Third,man,True,,Southampton,no,True
861,0,2,male,21.0,1,0,11.5000,S,Second,man,True,,Southampton,no,False
438,0,1,male,64.0,1,4,263.0000,S,First,man,True,C,Southampton,no,False
571,1,1,female,53.0,2,0,51.4792,S,First,woman,False,C,Southampton,yes,False


In [89]:
# Keep only the rows where column "A" is positive.
df.loc[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.688087,0.678641,-2.410768,0.213602
2013-01-04,1.027494,-0.850926,-0.727409,-1.13275
2013-01-06,0.398301,-0.247039,-1.165883,2.162062


In [92]:
# Passengers whose fare is below 5.
titanic.loc[titanic["fare"] < 5]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
263,0,1,male,40.0,0,0,0.0,S,First,man,True,B,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
277,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True
413,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
466,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
481,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
597,0,3,male,49.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
