# Pandas

Panda is a fast,powerful,flexible and easy to use open source data analysis and manipulation tool built on top of the Python programming language.

In [3]:
import pandas as pd 
import numpy as np

## Basic Data Structures in pandas

Series: a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc.

DataFrame: a two-dimensional data structure that holds data like a two-dimension array or a table with rows and columns.

In [6]:
df=pd.DataFrame({"Roll No":[1,2,3,],"Stu Name":['Vikas','Tarun','Abhishek']})
df

Unnamed: 0,Roll No,Stu Name
0,1,Vikas
1,2,Tarun
2,3,Abhishek


In [7]:
# Defining the type of dataframe 
type(df)

pandas.core.frame.DataFrame

## What is NaN ?
#NumPy NAN stands for not a number and is defined as a substitute for declaring value which are numerical values that are missing values in an array as NumPy is used to deal with arrays in Python and this can be initialized using numpy.

## Difference between NaN & Null
`NaN` is a special floating-point value used in numeric computations, particularly with the `numpy` and `pandas` libraries
 Null is often used in situations where the absence of a value is a valid and meaningful concept

In [10]:
#why roll number changes to float value when we replace 2 with np.nan
df=pd.DataFrame({"Roll No":[1,np.nan,3],"Stu Name":['Vikas','Tarun','Abhishek']},)
df

Unnamed: 0,Roll No,Stu Name
0,1.0,Vikas
1,,Tarun
2,3.0,Abhishek


In [11]:
# Change: data type of values in the columns changes to float 

## Why Roll No changes to float
`NaN` is a special floating-point value used in numeric computations, particularly with the `numpy` and `pandas` libraries
 Null is often used in situations where the absence of a value is a valid and meaningful concept

In [13]:
# is it posssible to convert value into int while using nan and how it is done? 
df['Roll No']=df['Roll No']
df

Unnamed: 0,Roll No,Stu Name
0,1.0,Vikas
1,,Tarun
2,3.0,Abhishek


## How to convert roll no into integer value with np.nan?
fillna(-1): Fills NaN values with -1. You can choose any placeholder value that makes sense for your context.

astype(int): Converts the column to an integer type (int64)

In [15]:
# method 1
df['Roll No']=df['Roll No'].fillna(-1).astype(int)
df

Unnamed: 0,Roll No,Stu Name
0,1,Vikas
1,-1,Tarun
2,3,Abhishek


In [16]:
# Method 2
df = pd.DataFrame({'Roll Number': [1,np.nan,3,4]})
df['Roll Number'] = df['Roll Number'].astype(pd.Int64Dtype())
df

Unnamed: 0,Roll Number
0,1.0
1,
2,3.0
3,4.0


In [17]:
# Series
ad = pd.Series([1,3,5,6])
ad

0    1
1    3
2    5
3    6
dtype: int64

In [18]:
# object creation
dates=pd.date_range("20240102",periods=4)

In [19]:
#generating dataframe with random num
sd=pd.DataFrame(np.random.randn(4,4))
sd

Unnamed: 0,0,1,2,3
0,1.220796,1.517718,2.825138,-0.812552
1,0.772713,1.262194,-1.197713,-1.282454
2,-0.099432,-0.491567,-0.893624,0.31339
3,-1.23301,0.312502,0.329755,-0.608924


In [20]:
# replacing index woth dates
sd=pd.DataFrame(np.random.randn(4,4),index=dates)
sd

Unnamed: 0,0,1,2,3
2024-01-02,-1.015675,0.862496,-0.615028,1.081127
2024-01-03,1.166231,-0.778889,-0.923816,-1.049614
2024-01-04,0.670347,1.954901,0.066601,0.196276
2024-01-05,-1.291988,-0.431571,1.413895,-0.055166


In [21]:
# replacing columns with list
sd=pd.DataFrame(np.random.randn(4,4),index=dates,columns=list("ABCD"))#columns as slot 1 ,slot2 ?
sd

Unnamed: 0,A,B,C,D
2024-01-02,-0.803491,0.954266,0.098717,-1.776829
2024-01-03,-0.03501,-0.570196,0.860101,-0.57023
2024-01-04,-0.307471,-1.070417,1.628922,0.081953
2024-01-05,1.024026,0.583492,0.081079,-0.335364


In [22]:
# Replacing column name as slot 1,slot 2,slot 3,slot 4
sd=pd.DataFrame(np.random.randn(4,4),index=dates,columns=list(["Slot 1","Slot 2","Slot 3","Slot 4"]))#columns as slot 1 ,slot2 ?
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-02,-0.904615,-0.619078,1.444048,1.523514
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893
2024-01-04,0.338855,-0.832314,-0.534194,0.699966
2024-01-05,0.375561,-0.177729,-1.637717,0.873151


In [23]:
# converting to numpy array
sd.to_numpy()      

array([[-0.90461536, -0.61907841,  1.44404848,  1.52351351],
       [-1.14365018,  0.48420786, -0.49373647, -0.14789296],
       [ 0.33885502, -0.83231415, -0.53419378,  0.69996586],
       [ 0.37556075, -0.17772883, -1.63771666,  0.87315126]])

In [24]:
# Index of dataframe
sd.index

DatetimeIndex(['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'], dtype='datetime64[ns]', freq='D')

In [25]:
# Columns of dataframe
sd.columns

Index(['Slot 1', 'Slot 2', 'Slot 3', 'Slot 4'], dtype='object')

## Sorting by axis and value

In [27]:
sd.sort_index(axis=1,ascending=False)

Unnamed: 0,Slot 4,Slot 3,Slot 2,Slot 1
2024-01-02,1.523514,1.444048,-0.619078,-0.904615
2024-01-03,-0.147893,-0.493736,0.484208,-1.14365
2024-01-04,0.699966,-0.534194,-0.832314,0.338855
2024-01-05,0.873151,-1.637717,-0.177729,0.375561


In [28]:
sd.sort_index(axis=0,ascending=False)

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,0.375561,-0.177729,-1.637717,0.873151
2024-01-04,0.338855,-0.832314,-0.534194,0.699966
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893
2024-01-02,-0.904615,-0.619078,1.444048,1.523514


In [29]:
sd.sort_values(by="Slot 3",ascending=False)  # Task : sort on c and d   # default

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-02,-0.904615,-0.619078,1.444048,1.523514
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893
2024-01-04,0.338855,-0.832314,-0.534194,0.699966
2024-01-05,0.375561,-0.177729,-1.637717,0.873151


In [30]:
# Task : sort on slot 3 and slot 4
sd = sd.sort_values(by=['Slot 3', 'Slot 4'], ascending=[True, True])
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,0.375561,-0.177729,-1.637717,0.873151
2024-01-04,0.338855,-0.832314,-0.534194,0.699966
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893
2024-01-02,-0.904615,-0.619078,1.444048,1.523514


In [31]:
# assigning values of column A dataframe to variable dfa
dfa=sd["Slot 1"]
dfa

2024-01-05    0.375561
2024-01-04    0.338855
2024-01-03   -1.143650
2024-01-02   -0.904615
Freq: -1D, Name: Slot 1, dtype: float64

In [32]:
# select rows 0 to 2
sd[0:3]

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,0.375561,-0.177729,-1.637717,0.873151
2024-01-04,0.338855,-0.832314,-0.534194,0.699966
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893


In [33]:
#select columns a & b
sd.loc[ : ,['Slot 1','Slot 2']]

Unnamed: 0,Slot 1,Slot 2
2024-01-05,0.375561,-0.177729
2024-01-04,0.338855,-0.832314
2024-01-03,-1.14365,0.484208
2024-01-02,-0.904615,-0.619078


In [34]:
sd.at[dates[2],"Slot 1"]   # access only single element

0.3388550188386128

In [35]:
#selecting specfic row from a column specifying given condition
filteredsd=sd[sd['Slot 1']> -1]
filteredsd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,0.375561,-0.177729,-1.637717,0.873151
2024-01-04,0.338855,-0.832314,-0.534194,0.699966
2024-01-02,-0.904615,-0.619078,1.444048,1.523514


In [36]:
sd["E"]=[1,2,3,4]
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E
2024-01-05,0.375561,-0.177729,-1.637717,0.873151,1
2024-01-04,0.338855,-0.832314,-0.534194,0.699966,2
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893,3
2024-01-02,-0.904615,-0.619078,1.444048,1.523514,4


## Missing Data

In [38]:
sd["f"]=[9,np.nan,np.nan,1]
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,0.375561,-0.177729,-1.637717,0.873151,1,9.0
2024-01-04,0.338855,-0.832314,-0.534194,0.699966,2,
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893,3,
2024-01-02,-0.904615,-0.619078,1.444048,1.523514,4,1.0


In [39]:
#drp all rows having nan
sd.dropna(how="any")

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,0.375561,-0.177729,-1.637717,0.873151,1,9.0
2024-01-02,-0.904615,-0.619078,1.444048,1.523514,4,1.0


In [86]:
# drop all nan rows with specific column having nan
sd.dropna(subset=['f'])

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,0.375561,-0.177729,-1.637717,0.873151,1,9.0
2024-01-02,-0.904615,-0.619078,1.444048,1.523514,4,1.0


In [41]:
#fill 5 in placee of na
sd.fillna(5).astype(int)

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,0,0,-1,0,1,9
2024-01-04,0,0,0,0,2,5
2024-01-03,-1,0,0,0,3,5
2024-01-02,0,0,1,1,4,1


In [42]:
# replace nan with mean value of that column
sd.fillna(sd['Slot 1'].mean())

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,0.375561,-0.177729,-1.637717,0.873151,1,9.0
2024-01-04,0.338855,-0.832314,-0.534194,0.699966,2,-0.333462
2024-01-03,-1.14365,0.484208,-0.493736,-0.147893,3,-0.333462
2024-01-02,-0.904615,-0.619078,1.444048,1.523514,4,1.0


In [43]:
#position having na value get replced by true other with false
sd.isna()

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,False,False,False,False,False,False
2024-01-04,False,False,False,False,False,True
2024-01-03,False,False,False,False,False,True
2024-01-02,False,False,False,False,False,False


In [44]:
sd.mean()

Slot 1   -0.333462
Slot 2   -0.286228
Slot 3   -0.305400
Slot 4    0.737184
E         2.500000
f         5.000000
dtype: float64

In [45]:
sd['Slot 1'].mean() # mean of a specific columns

-0.3334624435242408

In [46]:
sd.mean(axis=0)

Slot 1   -0.333462
Slot 2   -0.286228
Slot 3   -0.305400
Slot 4    0.737184
E         2.500000
f         5.000000
dtype: float64

In [47]:
# to check weather there is nan value or not
sd.isna()

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,False,False,False,False,False,False
2024-01-04,False,False,False,False,False,True
2024-01-03,False,False,False,False,False,True
2024-01-02,False,False,False,False,False,False


## Operation

In [92]:
sd.mean(axis=1)

2024-01-05    1.572211
2024-01-04    0.334463
2024-01-03    0.339786
2024-01-02    1.073978
Freq: -1D, dtype: float64

In [94]:
sd.mean(axis=0)

Slot 1   -0.333462
Slot 2   -0.286228
Slot 3   -0.305400
Slot 4    0.737184
E         2.500000
f         5.000000
dtype: float64

In [96]:
sd.mean()

Slot 1   -0.333462
Slot 2   -0.286228
Slot 3   -0.305400
Slot 4    0.737184
E         2.500000
f         5.000000
dtype: float64