<div> <h1> Pandas Library </h1>
</div>
<a href = "https://pandas.pydata.org/" > Reference</a>

<div> <h3> Pandas is a powerful and widely-used open-source library in Python for data manipulation and analysis. It provides two main data structures: Series (1-dimensional) and DataFrame (2-dimensional). 
</h3>
</div>

In [382]:
import numpy as np
import pandas as pd

In [384]:
#Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [390]:
#my_df = pd.DataFrame(data=None, index: 'Axes | None' = None, 
#                     columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, 
#                    copy: 'bool | None' = None)

In [392]:
#DataFrame & List
dates = pd.date_range("20250101", periods=10)
dates

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06', '2025-01-07', '2025-01-08',
               '2025-01-09', '2025-01-10'],
              dtype='datetime64[ns]', freq='D')

In [394]:
df = pd.DataFrame(np.random.randn(10, 3), index=dates, columns=list("XYZ"))
df

Unnamed: 0,X,Y,Z
2025-01-01,1.255237,0.745816,0.044265
2025-01-02,-0.638625,-2.288898,0.976653
2025-01-03,1.389224,-1.918962,0.257473
2025-01-04,0.549633,-0.598228,-0.904476
2025-01-05,0.232104,-0.818591,0.711248
2025-01-06,0.926906,-1.657328,-0.976052
2025-01-07,2.28209,0.178209,-1.503018
2025-01-08,1.488978,0.335247,1.229124
2025-01-09,-0.4614,-1.761327,-1.595415
2025-01-10,-1.405313,-0.263177,0.077077


In [396]:
#DataFrame & Dictionary
df2 = pd.DataFrame(
    {
        "Country": ["Iran", "USA", "UK", "Germany", "Turkey"],
        "Date": pd.Timestamp("20250102"),
        "Capital": ["Tehran", "NY", "London", "Berlin", "Istanbul"],
        "D": np.array([3] * 5, dtype="int32"),
        "Category": pd.Categorical(["test", "train", "test", "train", "Train"]),
    }
)
df2

Unnamed: 0,Country,Date,Capital,D,Category
0,Iran,2025-01-02,Tehran,3,test
1,USA,2025-01-02,NY,3,train
2,UK,2025-01-02,London,3,test
3,Germany,2025-01-02,Berlin,3,train
4,Turkey,2025-01-02,Istanbul,3,Train


In [398]:
df2.index

RangeIndex(start=0, stop=5, step=1)

In [400]:
df.columns

Index(['X', 'Y', 'Z'], dtype='object')

In [402]:
#statistic Summary
df.describe()

Unnamed: 0,X,Y,Z
count,10.0,10.0,10.0
mean,0.561884,-0.804724,-0.168312
std,1.134518,1.058946,1.018525
min,-1.405313,-2.288898,-1.595415
25%,-0.288024,-1.735327,-0.958158
50%,0.73827,-0.708409,0.060671
75%,1.355727,0.067863,0.597804
max,2.28209,0.745816,1.229124


In [410]:
df.sort_index(axis=0, ascending = False)

Unnamed: 0,X,Y,Z
2025-01-10,-1.405313,-0.263177,0.077077
2025-01-09,-0.4614,-1.761327,-1.595415
2025-01-08,1.488978,0.335247,1.229124
2025-01-07,2.28209,0.178209,-1.503018
2025-01-06,0.926906,-1.657328,-0.976052
2025-01-05,0.232104,-0.818591,0.711248
2025-01-04,0.549633,-0.598228,-0.904476
2025-01-03,1.389224,-1.918962,0.257473
2025-01-02,-0.638625,-2.288898,0.976653
2025-01-01,1.255237,0.745816,0.044265


In [412]:
df2.sort_values

<bound method DataFrame.sort_values of    Country       Date   Capital  D Category
0     Iran 2025-01-02    Tehran  3     test
1      USA 2025-01-02        NY  3    train
2       UK 2025-01-02    London  3     test
3  Germany 2025-01-02    Berlin  3    train
4   Turkey 2025-01-02  Istanbul  3    Train>

In [414]:
df.sort_values(by="X")

Unnamed: 0,X,Y,Z
2025-01-10,-1.405313,-0.263177,0.077077
2025-01-02,-0.638625,-2.288898,0.976653
2025-01-09,-0.4614,-1.761327,-1.595415
2025-01-05,0.232104,-0.818591,0.711248
2025-01-04,0.549633,-0.598228,-0.904476
2025-01-06,0.926906,-1.657328,-0.976052
2025-01-01,1.255237,0.745816,0.044265
2025-01-03,1.389224,-1.918962,0.257473
2025-01-08,1.488978,0.335247,1.229124
2025-01-07,2.28209,0.178209,-1.503018


In [416]:
# how to select a column
xx = df.X
xx

2025-01-01    1.255237
2025-01-02   -0.638625
2025-01-03    1.389224
2025-01-04    0.549633
2025-01-05    0.232104
2025-01-06    0.926906
2025-01-07    2.282090
2025-01-08    1.488978
2025-01-09   -0.461400
2025-01-10   -1.405313
Freq: D, Name: X, dtype: float64

In [418]:
type(xx)

pandas.core.series.Series

In [420]:
df[4:]

Unnamed: 0,X,Y,Z
2025-01-05,0.232104,-0.818591,0.711248
2025-01-06,0.926906,-1.657328,-0.976052
2025-01-07,2.28209,0.178209,-1.503018
2025-01-08,1.488978,0.335247,1.229124
2025-01-09,-0.4614,-1.761327,-1.595415
2025-01-10,-1.405313,-0.263177,0.077077


<div>
    <h2>Selection by label</h2>
</div>

In [423]:
i = df.loc[dates[0]]
i

X    1.255237
Y    0.745816
Z    0.044265
Name: 2025-01-01 00:00:00, dtype: float64

In [425]:
df2.loc[2]

Country                      UK
Date        2025-01-02 00:00:00
Capital                  London
D                             3
Category                   test
Name: 2, dtype: object

In [427]:
df2.loc[2:2]

Unnamed: 0,Country,Date,Capital,D,Category
2,UK,2025-01-02,London,3,test


In [429]:
df2.loc[0:2]

Unnamed: 0,Country,Date,Capital,D,Category
0,Iran,2025-01-02,Tehran,3,test
1,USA,2025-01-02,NY,3,train
2,UK,2025-01-02,London,3,test


In [431]:
df2.loc[:,'Country']

0       Iran
1        USA
2         UK
3    Germany
4     Turkey
Name: Country, dtype: object

In [433]:
df2.loc[:, ["Country","Category"]]

Unnamed: 0,Country,Category
0,Iran,test
1,USA,train
2,UK,test
3,Germany,train
4,Turkey,Train


<div>
    <h2>Selection by position</h2>
</div>

In [436]:
df.iloc[3]

X    0.549633
Y   -0.598228
Z   -0.904476
Name: 2025-01-04 00:00:00, dtype: float64

In [438]:
df2.iloc[0]

Country                    Iran
Date        2025-01-02 00:00:00
Capital                  Tehran
D                             3
Category                   test
Name: 0, dtype: object

In [440]:
df2.iloc[2:]

Unnamed: 0,Country,Date,Capital,D,Category
2,UK,2025-01-02,London,3,test
3,Germany,2025-01-02,Berlin,3,train
4,Turkey,2025-01-02,Istanbul,3,Train


In [442]:
df2.iloc[:,2]

0      Tehran
1          NY
2      London
3      Berlin
4    Istanbul
Name: Capital, dtype: object

In [444]:
df2.iloc[:,2:4]

Unnamed: 0,Capital,D
0,Tehran,3
1,NY,3
2,London,3
3,Berlin,3
4,Istanbul,3


<div>
    <h2>Boolean indexing</h2>
</div>

In [447]:
df>0

Unnamed: 0,X,Y,Z
2025-01-01,True,True,True
2025-01-02,False,False,True
2025-01-03,True,False,True
2025-01-04,True,False,False
2025-01-05,True,False,True
2025-01-06,True,False,False
2025-01-07,True,True,False
2025-01-08,True,True,True
2025-01-09,False,False,False
2025-01-10,False,False,True


In [449]:
df[df>0]

Unnamed: 0,X,Y,Z
2025-01-01,1.255237,0.745816,0.044265
2025-01-02,,,0.976653
2025-01-03,1.389224,,0.257473
2025-01-04,0.549633,,
2025-01-05,0.232104,,0.711248
2025-01-06,0.926906,,
2025-01-07,2.28209,0.178209,
2025-01-08,1.488978,0.335247,1.229124
2025-01-09,,,
2025-01-10,,,0.077077


In [451]:
df[df.X>0]

Unnamed: 0,X,Y,Z
2025-01-01,1.255237,0.745816,0.044265
2025-01-03,1.389224,-1.918962,0.257473
2025-01-04,0.549633,-0.598228,-0.904476
2025-01-05,0.232104,-0.818591,0.711248
2025-01-06,0.926906,-1.657328,-0.976052
2025-01-07,2.28209,0.178209,-1.503018
2025-01-08,1.488978,0.335247,1.229124


In [453]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,X,Y,Z,E
2025-01-01,1.255237,0.745816,0.044265,1.0
2025-01-02,-0.638625,-2.288898,0.976653,1.0
2025-01-03,1.389224,-1.918962,0.257473,
2025-01-04,0.549633,-0.598228,-0.904476,


<div> <h1> Missing Data in numpy </h1> </div>

In [456]:
#adding NaN (Null data) to the table:
df3 = df.reindex(index = dates[0:5], columns = list(df.columns)+ ["W"])
df3.iloc[0:2,3]=1
df3

Unnamed: 0,X,Y,Z,W
2025-01-01,1.255237,0.745816,0.044265,1.0
2025-01-02,-0.638625,-2.288898,0.976653,1.0
2025-01-03,1.389224,-1.918962,0.257473,
2025-01-04,0.549633,-0.598228,-0.904476,
2025-01-05,0.232104,-0.818591,0.711248,


In [458]:
df3.dropna()

Unnamed: 0,X,Y,Z,W
2025-01-01,1.255237,0.745816,0.044265,1.0
2025-01-02,-0.638625,-2.288898,0.976653,1.0


In [460]:
df3.fillna(value= 2)

Unnamed: 0,X,Y,Z,W
2025-01-01,1.255237,0.745816,0.044265,1.0
2025-01-02,-0.638625,-2.288898,0.976653,1.0
2025-01-03,1.389224,-1.918962,0.257473,2.0
2025-01-04,0.549633,-0.598228,-0.904476,2.0
2025-01-05,0.232104,-0.818591,0.711248,2.0


<div>
    <h1>Merge in pandas</h1>
</div>

In [463]:
#Concat
df4 = pd.DataFrame(np.random.randn(5, 3))
df5 = pd.DataFrame(np.random.randn(2,3))
df5

Unnamed: 0,0,1,2
0,0.566338,1.21911,0.696935
1,1.325992,-0.986288,1.222049


In [465]:
pd.concat((df4,df5),  ignore_index = True)

Unnamed: 0,0,1,2
0,0.821342,-1.752537,0.351778
1,-1.016091,0.42163,0.052094
2,0.712185,-0.957416,0.959354
3,0.074432,-0.037681,-1.15312
4,0.483733,0.649714,-0.392137
5,0.566338,1.21911,0.696935
6,1.325992,-0.986288,1.222049


In [467]:
#join --- merge()
group1 = pd.DataFrame({
    "country": ["Iran", "Turkey"],
    "group":["A","B"]            
})

group2 = pd.DataFrame({
    "country": ["Iran", "Turkey"],
    "group":["C","D"]            
})
pd.merge(group1, group2, on=("country"))

Unnamed: 0,country,group_x,group_y
0,Iran,A,C
1,Turkey,B,D


<div>
    <h1>Grouping</h1>
</div>

In [476]:
df6= pd.DataFrame({
    "group":["A", "B","c","B","A","A","C","C"],
    "gold":[1, 2, 4, 2, 3, 4, 5,6],
    "silver":[2, 3, 1, 4, 6, 7, 8, 9],
    "bronze":[6, 3, 1, 1, 3, 5, 9, 6]
}, index = dates[0:8])
df6

Unnamed: 0,group,gold,silver,bronze
2025-01-01,A,1,2,6
2025-01-02,B,2,3,3
2025-01-03,c,4,1,1
2025-01-04,B,2,4,1
2025-01-05,A,3,6,3
2025-01-06,A,4,7,5
2025-01-07,C,5,8,9
2025-01-08,C,6,9,6


In [478]:
df6.groupby("group").sum()

Unnamed: 0_level_0,gold,silver,bronze
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,8,15,14
B,4,7,4
C,11,17,15
c,4,1,1
