# Pandas tutorial

In [1347]:
                 #install libraries
#pip install pandas
#pip install numpy

In [1348]:
#import libraries
import pandas as pd
import numpy as np

In [1349]:
# Object creation
              # 'np.nan' is to create an empty cell (NaN)
s = pd.Series([1, 3, np.nan, 5, 7, 8, 9])
s

0    1.0
1    3.0
2    NaN
3    5.0
4    7.0
5    8.0
6    9.0
dtype: float64

In [1350]:
# (year-month-day)
dates = pd.date_range("20230426", periods=9)
dates

DatetimeIndex(['2023-04-26', '2023-04-27', '2023-04-28', '2023-04-29',
               '2023-04-30', '2023-05-01', '2023-05-02', '2023-05-03',
               '2023-05-04'],
              dtype='datetime64[ns]', freq='D')

In [1351]:
# convert to data-frame the previous date series
dates = pd.date_range("20230426", periods=9)
dates
df = pd.DataFrame(np.random.randn(9, 4), index=dates, columns=list("ABCD"))  #(rows, columns)
df

Unnamed: 0,A,B,C,D
2023-04-26,0.426277,-0.031553,-0.140815,-0.417875
2023-04-27,1.034757,0.369721,1.729381,0.652053
2023-04-28,0.413558,0.166273,0.23988,0.427749
2023-04-29,0.494476,-0.210542,1.344271,-0.786463
2023-04-30,1.292955,0.651874,0.05762,0.968924
2023-05-01,0.54436,-0.374813,0.228051,0.113526
2023-05-02,0.428562,0.04433,0.137393,1.281092
2023-05-03,1.448846,1.307498,-1.968851,-1.984857
2023-05-04,-0.705086,0.009656,-0.866359,-0.720216


In [1352]:
df2 = pd.DataFrame(
    {
     "A": 1.0,
     "B": pd.Timestamp("20230504"),
     "C": pd.Series(1, index=list(range(4)), dtype="float32"),
     "D": np.array([3] * 4, dtype="int32"),
     "E": pd.Categorical(["girl", "woman", "girl", "womann"]),
     "F": "females"
     }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-05-04,1.0,3,girl,females
1,1.0,2023-05-04,1.0,3,woman,females
2,1.0,2023-05-04,1.0,3,girl,females
3,1.0,2023-05-04,1.0,3,womann,females


In [1353]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [1354]:
df.head(2)

Unnamed: 0,A,B,C,D
2023-04-26,0.426277,-0.031553,-0.140815,-0.417875
2023-04-27,1.034757,0.369721,1.729381,0.652053


In [1355]:
df.tail(2)

Unnamed: 0,A,B,C,D
2023-05-03,1.448846,1.307498,-1.968851,-1.984857
2023-05-04,-0.705086,0.009656,-0.866359,-0.720216


In [1356]:
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [1357]:
# to convert dataframe(df) to numpy array
df.to_numpy()

array([[ 0.4262774 , -0.03155269, -0.14081524, -0.41787537],
       [ 1.03475661,  0.36972093,  1.72938056,  0.6520531 ],
       [ 0.41355775,  0.16627325,  0.2398803 ,  0.42774901],
       [ 0.49447617, -0.21054171,  1.34427115, -0.78646288],
       [ 1.29295523,  0.65187383,  0.05761974,  0.96892397],
       [ 0.54436029, -0.37481338,  0.22805123,  0.11352618],
       [ 0.42856156,  0.0443302 ,  0.1373926 ,  1.2810924 ],
       [ 1.44884589,  1.3074978 , -1.96885094, -1.98485691],
       [-0.70508558,  0.00965555, -0.86635908, -0.72021591]])

In [1358]:
# to convert dataframe(df2) to numpy array
df2.to_numpy()

array([[1.0, Timestamp('2023-05-04 00:00:00'), 1.0, 3, 'girl', 'females'],
       [1.0, Timestamp('2023-05-04 00:00:00'), 1.0, 3, 'woman',
        'females'],
       [1.0, Timestamp('2023-05-04 00:00:00'), 1.0, 3, 'girl', 'females'],
       [1.0, Timestamp('2023-05-04 00:00:00'), 1.0, 3, 'womann',
        'females']], dtype=object)

In [1359]:
# to view all values/data detail statistically, like mean, med, mode, sd e.t.c.
df.describe()
# &
#df2.describe()

Unnamed: 0,A,B,C,D
count,9.0,9.0,9.0,9.0
mean,0.597634,0.214716,0.084508,-0.051785
std,0.632604,0.508892,1.090417,1.026427
min,-0.705086,-0.374813,-1.968851,-1.984857
25%,0.426277,-0.031553,-0.140815,-0.720216
50%,0.494476,0.04433,0.137393,0.113526
75%,1.034757,0.369721,0.23988,0.652053
max,1.448846,1.307498,1.729381,1.281092


In [1360]:
# to transpose data(rows & columns interchange)
df.T
# &
df2.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2023-05-04 00:00:00,2023-05-04 00:00:00,2023-05-04 00:00:00,2023-05-04 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,girl,woman,girl,womann
F,females,females,females,females


In [1361]:
dates = pd.date_range("20230426", periods=9)
dates
df = pd.DataFrame(np.random.randn(9, 4), index=dates, columns=list("ABCD"))  #(rows, columns)
df

Unnamed: 0,A,B,C,D
2023-04-26,0.279159,-0.254968,0.809613,-0.736548
2023-04-27,1.084352,-0.174635,0.20315,-0.739705
2023-04-28,-0.379782,-0.525875,-0.907584,-0.572448
2023-04-29,1.226583,-0.207588,-2.042046,1.26187
2023-04-30,-0.001733,1.232367,-0.461328,1.539786
2023-05-01,-2.89628,2.036649,0.664291,-0.873529
2023-05-02,-0.245465,1.007117,-0.028038,-0.233738
2023-05-03,1.699222,0.279084,0.419497,1.180213
2023-05-04,-1.164344,-2.232611,0.828305,-0.39863


In [1362]:
                        # to sort by index
# if axis 0 or 1, ascending True is same, no change
# if axis 0, ascending False = index order changed up to down
# if axis 1, ascending False = column order changed right to left
df.sort_index(axis=0, ascending=True)
# or
#df.sort_index(axis=1, ascending=True)
# and
#df.sort_index(axis=0, ascending=False)
# or
#df.sort_index(axis=1, ascending=False)

Unnamed: 0,A,B,C,D
2023-04-26,0.279159,-0.254968,0.809613,-0.736548
2023-04-27,1.084352,-0.174635,0.20315,-0.739705
2023-04-28,-0.379782,-0.525875,-0.907584,-0.572448
2023-04-29,1.226583,-0.207588,-2.042046,1.26187
2023-04-30,-0.001733,1.232367,-0.461328,1.539786
2023-05-01,-2.89628,2.036649,0.664291,-0.873529
2023-05-02,-0.245465,1.007117,-0.028038,-0.233738
2023-05-03,1.699222,0.279084,0.419497,1.180213
2023-05-04,-1.164344,-2.232611,0.828305,-0.39863


In [1363]:
                       # to sort by values
#df.sort_values(by="C")                    # this & 2nd is same,makes "C" values -ve to +ve
df.sort_values(by="C", ascending=True)
df.sort_values(by="C", ascending=False)  # it makes a column values positive to negative

Unnamed: 0,A,B,C,D
2023-05-04,-1.164344,-2.232611,0.828305,-0.39863
2023-04-26,0.279159,-0.254968,0.809613,-0.736548
2023-05-01,-2.89628,2.036649,0.664291,-0.873529
2023-05-03,1.699222,0.279084,0.419497,1.180213
2023-04-27,1.084352,-0.174635,0.20315,-0.739705
2023-05-02,-0.245465,1.007117,-0.028038,-0.233738
2023-04-30,-0.001733,1.232367,-0.461328,1.539786
2023-04-28,-0.379782,-0.525875,-0.907584,-0.572448
2023-04-29,1.226583,-0.207588,-2.042046,1.26187


In [1364]:
# to select a specific column only
df["D"]

2023-04-26   -0.736548
2023-04-27   -0.739705
2023-04-28   -0.572448
2023-04-29    1.261870
2023-04-30    1.539786
2023-05-01   -0.873529
2023-05-02   -0.233738
2023-05-03    1.180213
2023-05-04   -0.398630
Freq: D, Name: D, dtype: float64

In [1365]:
# row wise selection
df[0 : 3]
# &
#df[1 : 3]

Unnamed: 0,A,B,C,D
2023-04-26,0.279159,-0.254968,0.809613,-0.736548
2023-04-27,1.084352,-0.174635,0.20315,-0.739705
2023-04-28,-0.379782,-0.525875,-0.907584,-0.572448


# Data selection with 'loc'

In [1366]:
# selection by labels
df.loc[dates[0]]      #  0 means first date ka data
#df.loc[dates[3:6]]   # 3 to 6 means 4th, 5th, 6th date ka data

A    0.279159
B   -0.254968
C    0.809613
D   -0.736548
Name: 2023-04-26 00:00:00, dtype: float64

In [1367]:
df.loc[:, ["A", "B", "C"]]    # ':' is rows

Unnamed: 0,A,B,C
2023-04-26,0.279159,-0.254968,0.809613
2023-04-27,1.084352,-0.174635,0.20315
2023-04-28,-0.379782,-0.525875,-0.907584
2023-04-29,1.226583,-0.207588,-2.042046
2023-04-30,-0.001733,1.232367,-0.461328
2023-05-01,-2.89628,2.036649,0.664291
2023-05-02,-0.245465,1.007117,-0.028038
2023-05-03,1.699222,0.279084,0.419497
2023-05-04,-1.164344,-2.232611,0.828305


In [1368]:
#df.loc["20230426":"20230428",["A", "B", "C"]]     # ':' means 26 se 28 date tak ka
# &
df.loc[["20230426","20230428"],["A", "B", "C"]]  #  ',' means specifically 26 & 28 ka

Unnamed: 0,A,B,C
2023-04-26,0.279159,-0.254968,0.809613
2023-04-28,-0.379782,-0.525875,-0.907584


In [1369]:
# same as above but here you selected only 1 date '28'.
df.loc["20230428", ["A", "B", "D"]]

A   -0.379782
B   -0.525875
D   -0.572448
Name: 2023-04-28 00:00:00, dtype: float64

In [1370]:
# specify a day & a column to get data detail by using "at"
df.at[dates[2], "C"]

-0.9075841905197807

In [1371]:
#Note:row last(0:4) me nhi aye gi 1 kam ayegi & column b aik kam
df.iloc[0]  # first date ki row & all columns
df.iloc[1:5] # selected rows & all columns
#df.iloc[: , :]  # all rows & all columns
#df.iloc[0:4 , :] # selected rows & all columns
#df.iloc[: , 2:4] # all rows & selected columns
df.iloc[0:4, 0:3] # mixed selection of rows&columns
#df.iloc[1:4, 1:3 ] # left side is row, right side is column always

Unnamed: 0,A,B,C
2023-04-26,0.279159,-0.254968,0.809613
2023-04-27,1.084352,-0.174635,0.20315
2023-04-28,-0.379782,-0.525875,-0.907584
2023-04-29,1.226583,-0.207588,-2.042046


In [1372]:
# selection of date by 'Boolian'
df[df["A"] > 1.5]    # to get 'column A' values only greater than 1.5

Unnamed: 0,A,B,C,D
2023-05-03,1.699222,0.279084,0.419497,1.180213


In [1373]:
# to get two column values by different choice, you could select even more
df[(df["A"] > 0) & (df["C"] < 1.3)]

Unnamed: 0,A,B,C,D
2023-04-26,0.279159,-0.254968,0.809613,-0.736548
2023-04-27,1.084352,-0.174635,0.20315,-0.739705
2023-04-29,1.226583,-0.207588,-2.042046,1.26187
2023-05-03,1.699222,0.279084,0.419497,1.180213


In [1374]:
# to view the whole data values greater than or less than
df[df > 0]
# or
#df[df < 1]     # NaN = empty cell

Unnamed: 0,A,B,C,D
2023-04-26,0.279159,,0.809613,
2023-04-27,1.084352,,0.20315,
2023-04-28,,,,
2023-04-29,1.226583,,,1.26187
2023-04-30,,1.232367,,1.539786
2023-05-01,,2.036649,0.664291,
2023-05-02,,1.007117,,
2023-05-03,1.699222,0.279084,0.419497,1.180213
2023-05-04,,,0.828305,


# Adding new column

In [1375]:
# 1
# df2 = df2 b hoskta hy,(i'm commenting out these codes to keep my data origional)

# df2 = df.copy()                 # it means k df2 k ander df copy ban k agya hy
# df2["Mahi"] = [1, 3, 5, 7, 8, 9, 7, 7, 6]      # here a column is added in df2
# df2

In [1376]:
# 2
                            # adding 1 more column
            # (i'm commenting out these codes to keep my data origional)
# df2["Hadia"] = [1, 3, 5, 7, 8, 9, 7, 7, 6]
# df2

In [1377]:
# as 2 examples above u have [A,B,C,D,Mahi,Hadia] columns, but finally u want to have
           # only [A,B,C,D,& Mahi] formula is as under;
# df2 = df2.iloc[: , 0:5]
# df2

In [1378]:
    # To take the 'mean' of [A,B,C,D] of 'df' and putting it in a new column "Mean".
df["Mean"] = df.mean(axis=1)
df

Unnamed: 0,A,B,C,D,Mean
2023-04-26,0.279159,-0.254968,0.809613,-0.736548,0.024314
2023-04-27,1.084352,-0.174635,0.20315,-0.739705,0.093291
2023-04-28,-0.379782,-0.525875,-0.907584,-0.572448,-0.596422
2023-04-29,1.226583,-0.207588,-2.042046,1.26187,0.059705
2023-04-30,-0.001733,1.232367,-0.461328,1.539786,0.577273
2023-05-01,-2.89628,2.036649,0.664291,-0.873529,-0.267217
2023-05-02,-0.245465,1.007117,-0.028038,-0.233738,0.124969
2023-05-03,1.699222,0.279084,0.419497,1.180213,0.894504
2023-05-04,-1.164344,-2.232611,0.828305,-0.39863,-0.74182


In [1379]:
# To take the 'std' of [A,B,C,D] of 'df' and putting it in a new column "Std".
df["Std"] = df.std(axis=1)
df

Unnamed: 0,A,B,C,D,Mean,Std
2023-04-26,0.279159,-0.254968,0.809613,-0.736548,0.024314,0.578479
2023-04-27,1.084352,-0.174635,0.20315,-0.739705,0.093291,0.663313
2023-04-28,-0.379782,-0.525875,-0.907584,-0.572448,-0.596422,0.193201
2023-04-29,1.226583,-0.207588,-2.042046,1.26187,0.059705,1.350519
2023-04-30,-0.001733,1.232367,-0.461328,1.539786,0.577273,0.832094
2023-05-01,-2.89628,2.036649,0.664291,-0.873529,-0.267217,1.834059
2023-05-02,-0.245465,1.007117,-0.028038,-0.233738,0.124969,0.516597
2023-05-03,1.699222,0.279084,0.419497,1.180213,0.894504,0.577402
2023-05-04,-1.164344,-2.232611,0.828305,-0.39863,-0.74182,1.11625
