# 10 minutes to pandas


#### Basic data structures:

Series: a one-dimensional labeled array holding data of any type.

DataFrame: a two-dimensional data structure that holds data like a
two-dimensional array or a table with rows and columns.


## Object Creation

In [4]:
import pandas as pd
import numpy as np

# np.nan marks a missing entry; the resulting Series has dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [8]:
# Seven consecutive daily timestamps starting on 2024-12-10; used as the row index.
dates = pd.date_range(start="20241210", periods=7)
print(dates)

# 7 rows x 4 columns of standard-normal draws. No seed is set in this cell,
# so the values change on every run.
df = pd.DataFrame(
    data=np.random.randn(7, 4),
    index=dates,
    columns=list("ABCD"),
)
df

DatetimeIndex(['2024-12-10', '2024-12-11', '2024-12-12', '2024-12-13',
               '2024-12-14', '2024-12-15', '2024-12-16'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2024-12-10,0.917531,0.927427,0.516586,-0.243878
2024-12-11,0.670971,-0.265463,0.973907,0.999662
2024-12-12,0.24336,-1.555428,0.295518,-0.38908
2024-12-13,0.535674,0.909836,0.387116,-0.390223
2024-12-14,-1.054933,-0.659999,1.274152,-1.36215
2024-12-15,1.47054,0.537999,-0.681301,-0.369189
2024-12-16,1.412192,0.376676,-0.357032,1.272037


In [11]:
# Build a frame from a dict of columns. The scalar entries ("A", "B", "F")
# are repeated on every row; the array-like entries supply the four rows.
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
print(df2)
# Each column keeps its own dtype.
df2.dtypes

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

## Viewing data

In [18]:
# Printed in order, one per line group: the first five rows, the last two rows,
# the row index, the column labels, and the underlying NumPy array.
print(
    df.head(),
    df.tail(2),
    df.index,
    df.columns,
    df.to_numpy(),
    sep="\n",
)

                   A         B         C         D
2024-12-10  0.917531  0.927427  0.516586 -0.243878
2024-12-11  0.670971 -0.265463  0.973907  0.999662
2024-12-12  0.243360 -1.555428  0.295518 -0.389080
2024-12-13  0.535674  0.909836  0.387116 -0.390223
2024-12-14 -1.054933 -0.659999  1.274152 -1.362150
                   A         B         C         D
2024-12-15  1.470540  0.537999 -0.681301 -0.369189
2024-12-16  1.412192  0.376676 -0.357032  1.272037
DatetimeIndex(['2024-12-10', '2024-12-11', '2024-12-12', '2024-12-13',
               '2024-12-14', '2024-12-15', '2024-12-16'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
[[ 0.91753108  0.92742701  0.51658615 -0.24387786]
 [ 0.67097067 -0.26546282  0.97390701  0.99966211]
 [ 0.24335956 -1.55542777  0.29551815 -0.38907975]
 [ 0.53567365  0.90983602  0.38711573 -0.39022256]
 [-1.05493291 -0.65999928  1.27415249 -1.36214969]
 [ 1.4705401   0.53799856 -0.68130073 -0.36918889]
 [ 1.41219195 

In [20]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,0.599333,0.038721,0.344135,-0.068974
std,0.855715,0.915986,0.687829,0.907284
min,-1.054933,-1.555428,-0.681301,-1.36215
25%,0.389517,-0.462731,-0.030757,-0.389651
50%,0.670971,0.376676,0.387116,-0.369189
75%,1.164862,0.723917,0.745247,0.377892
max,1.47054,0.927427,1.274152,1.272037


In [21]:
# Transpose: the column labels become the rows and the dates become the columns.
df.transpose()

Unnamed: 0,2024-12-10,2024-12-11,2024-12-12,2024-12-13,2024-12-14,2024-12-15,2024-12-16
A,0.917531,0.670971,0.24336,0.535674,-1.054933,1.47054,1.412192
B,0.927427,-0.265463,-1.555428,0.909836,-0.659999,0.537999,0.376676
C,0.516586,0.973907,0.295518,0.387116,1.274152,-0.681301,-0.357032
D,-0.243878,0.999662,-0.38908,-0.390223,-1.36215,-0.369189,1.272037


In [30]:
# axis="index" (same as 0) sorts by the row labels; axis=1 would sort by column labels.
df.sort_index(axis="index", ascending=True)

Unnamed: 0,A,B,C,D
2024-12-10,0.917531,0.927427,0.516586,-0.243878
2024-12-11,0.670971,-0.265463,0.973907,0.999662
2024-12-12,0.24336,-1.555428,0.295518,-0.38908
2024-12-13,0.535674,0.909836,0.387116,-0.390223
2024-12-14,-1.054933,-0.659999,1.274152,-1.36215
2024-12-15,1.47054,0.537999,-0.681301,-0.369189
2024-12-16,1.412192,0.376676,-0.357032,1.272037


In [35]:
# Order the rows by the values in column "B", largest first.
df.sort_values("B", ascending=False)

Unnamed: 0,A,B,C,D
2024-12-10,0.917531,0.927427,0.516586,-0.243878
2024-12-13,0.535674,0.909836,0.387116,-0.390223
2024-12-15,1.47054,0.537999,-0.681301,-0.369189
2024-12-16,1.412192,0.376676,-0.357032,1.272037
2024-12-11,0.670971,-0.265463,0.973907,0.999662
2024-12-14,-1.054933,-0.659999,1.274152,-1.36215
2024-12-12,0.24336,-1.555428,0.295518,-0.38908


# Selection

The pandas documentation recommends the optimized data access methods `.at`, `.iat`, `.loc` and `.iloc` for production code.

In [37]:
# Getitem ([]): passing a single column label returns that column as a Series.
df["A"]

2024-12-10    0.917531
2024-12-11    0.670971
2024-12-12    0.243360
2024-12-13    0.535674
2024-12-14   -1.054933
2024-12-15    1.470540
2024-12-16    1.412192
Freq: D, Name: A, dtype: float64

In [47]:
# Slicing with [] selects rows: an integer slice goes by position (first three rows here).
print(df[0:3])
# A slice of index labels also selects rows; here both endpoint dates are included.
df["20241212":"20241215"]

                   A         B         C         D
2024-12-10  0.917531  0.927427  0.516586 -0.243878
2024-12-11  0.670971 -0.265463  0.973907  0.999662
2024-12-12  0.243360 -1.555428  0.295518 -0.389080


Unnamed: 0,A,B,C,D
2024-12-12,0.24336,-1.555428,0.295518,-0.38908
2024-12-13,0.535674,0.909836,0.387116,-0.390223
2024-12-14,-1.054933,-0.659999,1.274152,-1.36215
2024-12-15,1.47054,0.537999,-0.681301,-0.369189


In [None]:
# Selection by label
# A single row label returns that row as a Series.
print(df.loc[dates[0]])
# Every row (:), restricted to the listed column labels.
print(df.loc[:, ["A", "B"]])
# Label slices include both endpoints.
print(df.loc["20241212":"20241215", ["A", "B"]])

A    0.917531
B    0.927427
C    0.516586
D   -0.243878
Name: 2024-12-10 00:00:00, dtype: float64
                   A         B
2024-12-10  0.917531  0.927427
2024-12-11  0.670971 -0.265463
2024-12-12  0.243360 -1.555428
2024-12-13  0.535674  0.909836
2024-12-14 -1.054933 -0.659999
2024-12-15  1.470540  0.537999
2024-12-16  1.412192  0.376676
                   A         B
2024-12-12  0.243360 -1.555428
2024-12-13  0.535674  0.909836
2024-12-14 -1.054933 -0.659999
2024-12-15  1.470540  0.537999


In [57]:
# A (row label, column label) pair returns a single scalar.
print(df.loc[dates[0], "A"])
# .at returns the same scalar.
print(df.at[dates[0], "A"])

0.9175310777069252
0.9175310777069252


In [58]:
# Selection by position: a single integer returns that row (here the fourth) as a Series.
df.iloc[3]

A    0.535674
B    0.909836
C    0.387116
D   -0.390223
Name: 2024-12-13 00:00:00, dtype: float64

In [67]:
# Integer slices behave like NumPy/Python slices (the end position is excluded).
print(df.iloc[3:5, 2:4])
# Lists of integer positions pick specific rows and columns.
print(df.iloc[[1, 2, 4], [0, 2]])
# Slice rows only, keeping every column.
print(df.iloc[1:3, :])
# Slice columns only, keeping every row.
print(df.iloc[:, 1:3])
# A pair of integer positions returns a single value.
print(df.iloc[2, 2])
# .iat returns the same scalar.
print(df.iat[2, 2])

                   C         D
2024-12-13  0.387116 -0.390223
2024-12-14  1.274152 -1.362150
                   A         C
2024-12-11  0.670971  0.973907
2024-12-12  0.243360  0.295518
2024-12-14 -1.054933  1.274152
                   A         B         C         D
2024-12-11  0.670971 -0.265463  0.973907  0.999662
2024-12-12  0.243360 -1.555428  0.295518 -0.389080
                   B         C
2024-12-10  0.927427  0.516586
2024-12-11 -0.265463  0.973907
2024-12-12 -1.555428  0.295518
2024-12-13  0.909836  0.387116
2024-12-14 -0.659999  1.274152
2024-12-15  0.537999 -0.681301
2024-12-16  0.376676 -0.357032
0.2955181504380651
0.2955181504380651


## Boolean indexing

In [68]:
# Keep only the rows where column "A" is greater than zero.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2024-12-10,0.917531,0.927427,0.516586,-0.243878
2024-12-11,0.670971,-0.265463,0.973907,0.999662
2024-12-12,0.24336,-1.555428,0.295518,-0.38908
2024-12-13,0.535674,0.909836,0.387116,-0.390223
2024-12-15,1.47054,0.537999,-0.681301,-0.369189
2024-12-16,1.412192,0.376676,-0.357032,1.272037


In [69]:
# A boolean frame as the key keeps values where the condition holds; all other cells become missing.
df[df < 0]

Unnamed: 0,A,B,C,D
2024-12-10,,,,-0.243878
2024-12-11,,-0.265463,,
2024-12-12,,-1.555428,,-0.38908
2024-12-13,,,,-0.390223
2024-12-14,-1.054933,-0.659999,,-1.36215
2024-12-15,,,-0.681301,-0.369189
2024-12-16,,,-0.357032,


In [76]:
# isin() filtering
# assign() returns a new frame with the extra "X" column; df itself is not modified.
df2 = df.assign(X=["one", "one", "two", "three", "four", "three", "five"])

print(df2)

# Keep the rows whose "X" value is one of the listed labels.
df2[df2["X"].isin(["two", "five"])]

                   A         B         C         D      X
2024-12-10  0.917531  0.927427  0.516586 -0.243878    one
2024-12-11  0.670971 -0.265463  0.973907  0.999662    one
2024-12-12  0.243360 -1.555428  0.295518 -0.389080    two
2024-12-13  0.535674  0.909836  0.387116 -0.390223  three
2024-12-14 -1.054933 -0.659999  1.274152 -1.362150   four
2024-12-15  1.470540  0.537999 -0.681301 -0.369189  three
2024-12-16  1.412192  0.376676 -0.357032  1.272037   five


Unnamed: 0,A,B,C,D,X
2024-12-12,0.24336,-1.555428,0.295518,-0.38908,two
2024-12-16,1.412192,0.376676,-0.357032,1.272037,five


## Setting

In [81]:
# One integer per day, indexed by the same seven dates as df.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6, 7],
    index=pd.date_range(start="20241210", periods=7),
)
print(s1)

# Adds column "F" to df in place.
df["F"] = s1
df

2024-12-10    1
2024-12-11    2
2024-12-12    3
2024-12-13    4
2024-12-14    5
2024-12-15    6
2024-12-16    7
Freq: D, dtype: int64


Unnamed: 0,A,B,C,D,F
2024-12-10,0.917531,0.927427,0.516586,-0.243878,1
2024-12-11,0.670971,-0.265463,0.973907,0.999662,2
2024-12-12,0.24336,-1.555428,0.295518,-0.38908,3
2024-12-13,0.535674,0.909836,0.387116,-0.390223,4
2024-12-14,-1.054933,-0.659999,1.274152,-1.36215,5
2024-12-15,1.47054,0.537999,-0.681301,-0.369189,6
2024-12-16,1.412192,0.376676,-0.357032,1.272037,7


In [82]:
# Setting values by label
# Overwrites the single cell at (first date, column "A"); df is changed in place.
df.at[dates[0], "A"] = 0
df

Unnamed: 0,A,B,C,D,F
2024-12-10,0.0,0.927427,0.516586,-0.243878,1
2024-12-11,0.670971,-0.265463,0.973907,0.999662,2
2024-12-12,0.24336,-1.555428,0.295518,-0.38908,3
2024-12-13,0.535674,0.909836,0.387116,-0.390223,4
2024-12-14,-1.054933,-0.659999,1.274152,-1.36215,5
2024-12-15,1.47054,0.537999,-0.681301,-0.369189,6
2024-12-16,1.412192,0.376676,-0.357032,1.272037,7


In [85]:
# Setting values by position
# Row 0, column 2 (column "C"); df is changed in place.
df.iat[0, 2] = 0
df

Unnamed: 0,A,B,C,D,F
2024-12-10,0.0,0.927427,0.0,-0.243878,1
2024-12-11,0.670971,-0.265463,0.973907,0.999662,2
2024-12-12,0.24336,-1.555428,0.295518,-0.38908,3
2024-12-13,0.535674,0.909836,0.387116,-0.390223,4
2024-12-14,-1.054933,-0.659999,1.274152,-1.36215,5
2024-12-15,1.47054,0.537999,-0.681301,-0.369189,6
2024-12-16,1.412192,0.376676,-0.357032,1.272037,7


In [87]:
# Setting by assigning with a NumPy array
# One 5 per row, written into every row of column "D".
df.loc[:, "D"] = np.full(len(df), 5)
df

Unnamed: 0,A,B,C,D,F
2024-12-10,0.0,0.927427,0.0,5.0,1
2024-12-11,0.670971,-0.265463,0.973907,5.0,2
2024-12-12,0.24336,-1.555428,0.295518,5.0,3
2024-12-13,0.535674,0.909836,0.387116,5.0,4
2024-12-14,-1.054933,-0.659999,1.274152,5.0,5
2024-12-15,1.47054,0.537999,-0.681301,5.0,6
2024-12-16,1.412192,0.376676,-0.357032,5.0,7


In [89]:
# A "where" operation with setting
# Work on a copy so that df itself is left untouched.
df2 = df.copy()
# Replace every positive entry with its negation; the other entries are unchanged.
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2024-12-10,0.0,-0.927427,0.0,-5.0,-1
2024-12-11,-0.670971,-0.265463,-0.973907,-5.0,-2
2024-12-12,-0.24336,-1.555428,-0.295518,-5.0,-3
2024-12-13,-0.535674,-0.909836,-0.387116,-5.0,-4
2024-12-14,-1.054933,-0.659999,-1.274152,-5.0,-5
2024-12-15,-1.47054,-0.537999,-0.681301,-5.0,-6
2024-12-16,-1.412192,-0.376676,-0.357032,-5.0,-7


# Missing Data