# Pandas

Panda is a fast,powerful,flexible and easy to use open source data analysis and manipulation tool built on top of the Python programming language.

In [3]:
import pandas as pd 
import numpy as np

## Basic Data Structures in pandas

Series: a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc.

DataFrame: a two-dimensional data structure that holds data like a two-dimension array or a table with rows and columns.

In [6]:
df=pd.DataFrame({"Roll No":[1,2,3,],"Stu Name":['Vikas','Tarun','Abhishek']})
df

Unnamed: 0,Roll No,Stu Name
0,1,Vikas
1,2,Tarun
2,3,Abhishek


In [7]:
# Defining the type of dataframe 
type(df)

pandas.core.frame.DataFrame

## What is NaN ?
#NumPy NAN stands for not a number and is defined as a substitute for declaring value which are numerical values that are missing values in an array as NumPy is used to deal with arrays in Python and this can be initialized using numpy.

## Difference between NaN & Null
`NaN` is a special floating-point value used in numeric computations, particularly with the `numpy` and `pandas` libraries
 Null is often used in situations where the absence of a value is a valid and meaningful concept

In [10]:
#why roll number changes to float value when we replace 2 with np.nan
df=pd.DataFrame({"Roll No":[1,np.nan,3],"Stu Name":['Vikas','Tarun','Abhishek']},)
df

Unnamed: 0,Roll No,Stu Name
0,1.0,Vikas
1,,Tarun
2,3.0,Abhishek


In [11]:
# Change: data type of values in the columns changes to float 

## Why Roll No changes to float
`NaN` is a special floating-point value used in numeric computations, particularly with the `numpy` and `pandas` libraries
 Null is often used in situations where the absence of a value is a valid and meaningful concept

In [13]:
# is it posssible to convert value into int while using nan and how it is done? 
df['Roll No']=df['Roll No']
df

Unnamed: 0,Roll No,Stu Name
0,1.0,Vikas
1,,Tarun
2,3.0,Abhishek


## How to convert roll no into integer value with np.nan?
fillna(-1): Fills NaN values with -1. You can choose any placeholder value that makes sense for your context.

astype(int): Converts the column to an integer type (int64)

In [15]:
# method 1
df['Roll No']=df['Roll No'].fillna(-1).astype(int)
df

Unnamed: 0,Roll No,Stu Name
0,1,Vikas
1,-1,Tarun
2,3,Abhishek


In [16]:
# Method 2
df = pd.DataFrame({'Roll Number': [1,np.nan,3,4]})
df['Roll Number'] = df['Roll Number'].astype(pd.Int64Dtype())
df

Unnamed: 0,Roll Number
0,1.0
1,
2,3.0
3,4.0


In [17]:
# Series
ad = pd.Series([1,3,5,6])
ad

0    1
1    3
2    5
3    6
dtype: int64

In [18]:
# object creation
dates=pd.date_range("20240102",periods=4)

In [19]:
#generating dataframe with random num
sd=pd.DataFrame(np.random.randn(4,4))
sd

Unnamed: 0,0,1,2,3
0,1.553545,-0.110319,0.869069,-0.224155
1,0.262812,1.186949,0.182841,-1.289559
2,2.187989,-0.636968,-0.033078,0.397475
3,0.846511,-2.169645,-0.474995,-1.18288


In [20]:
# replacing index woth dates
sd=pd.DataFrame(np.random.randn(4,4),index=dates)
sd

Unnamed: 0,0,1,2,3
2024-01-02,-0.633258,-2.205597,0.632384,0.542068
2024-01-03,0.835006,-0.994204,-1.662839,1.136523
2024-01-04,0.077382,-0.081938,-0.726341,0.134286
2024-01-05,1.529937,-0.609812,1.211552,-1.849086


In [21]:
# replacing columns with list
sd=pd.DataFrame(np.random.randn(4,4),index=dates,columns=list("ABCD"))#columns as slot 1 ,slot2 ?
sd

Unnamed: 0,A,B,C,D
2024-01-02,-0.40411,-1.079522,-1.179314,-0.529707
2024-01-03,-0.120897,-0.839138,1.541415,-0.697941
2024-01-04,-0.657538,-0.676525,-0.570827,0.765936
2024-01-05,2.512118,-1.362085,-0.001896,0.482641


In [22]:
# Replacing column name as slot 1,slot 2,slot 3,slot 4
sd=pd.DataFrame(np.random.randn(4,4),index=dates,columns=list(["Slot 1","Slot 2","Slot 3","Slot 4"]))#columns as slot 1 ,slot2 ?
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-02,0.756337,0.799143,-0.007637,0.414511
2024-01-03,-0.149036,-1.428812,0.456903,0.617605
2024-01-04,-0.786773,-1.47539,1.498043,0.014304
2024-01-05,-0.404927,0.05074,-1.367361,0.427768


In [23]:
# converting to numpy array
sd.to_numpy()      

array([[ 0.75633691,  0.79914274, -0.00763691,  0.41451074],
       [-0.14903615, -1.42881239,  0.45690329,  0.61760519],
       [-0.7867732 , -1.47538958,  1.49804289,  0.01430449],
       [-0.40492707,  0.05074038, -1.36736135,  0.42776759]])

In [24]:
# Index of dataframe
sd.index

DatetimeIndex(['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'], dtype='datetime64[ns]', freq='D')

In [25]:
# Columns of dataframe
sd.columns

Index(['Slot 1', 'Slot 2', 'Slot 3', 'Slot 4'], dtype='object')

## Sorting by axis and value

In [27]:
sd.sort_index(axis=1,ascending=False)

Unnamed: 0,Slot 4,Slot 3,Slot 2,Slot 1
2024-01-02,0.414511,-0.007637,0.799143,0.756337
2024-01-03,0.617605,0.456903,-1.428812,-0.149036
2024-01-04,0.014304,1.498043,-1.47539,-0.786773
2024-01-05,0.427768,-1.367361,0.05074,-0.404927


In [28]:
sd.sort_index(axis=0,ascending=False)

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,-0.404927,0.05074,-1.367361,0.427768
2024-01-04,-0.786773,-1.47539,1.498043,0.014304
2024-01-03,-0.149036,-1.428812,0.456903,0.617605
2024-01-02,0.756337,0.799143,-0.007637,0.414511


In [29]:
sd.sort_values(by="Slot 3",ascending=False)  # Task : sort on c and d   # default

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-04,-0.786773,-1.47539,1.498043,0.014304
2024-01-03,-0.149036,-1.428812,0.456903,0.617605
2024-01-02,0.756337,0.799143,-0.007637,0.414511
2024-01-05,-0.404927,0.05074,-1.367361,0.427768


In [30]:
# Task : sort on slot 3 and slot 4
sd = sd.sort_values(by=['Slot 3', 'Slot 4'], ascending=[True, True])
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,-0.404927,0.05074,-1.367361,0.427768
2024-01-02,0.756337,0.799143,-0.007637,0.414511
2024-01-03,-0.149036,-1.428812,0.456903,0.617605
2024-01-04,-0.786773,-1.47539,1.498043,0.014304


In [31]:
# assigning values of column A dataframe to variable dfa
dfa=sd["Slot 1"]
dfa

2024-01-05   -0.404927
2024-01-02    0.756337
2024-01-03   -0.149036
2024-01-04   -0.786773
Name: Slot 1, dtype: float64

In [32]:
# select rows 0 to 2
sd[0:3]

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,-0.404927,0.05074,-1.367361,0.427768
2024-01-02,0.756337,0.799143,-0.007637,0.414511
2024-01-03,-0.149036,-1.428812,0.456903,0.617605


In [33]:
#select columns a & b
sd.loc[ : ,['Slot 1','Slot 2']]

Unnamed: 0,Slot 1,Slot 2
2024-01-05,-0.404927,0.05074
2024-01-02,0.756337,0.799143
2024-01-03,-0.149036,-1.428812
2024-01-04,-0.786773,-1.47539


In [34]:
sd.at[dates[2],"Slot 1"]   # access only single element

-0.786773196513896

In [35]:
#selecting specfic row from a column specifying given condition
filteredsd=sd[sd['Slot 1']> -1]
filteredsd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4
2024-01-05,-0.404927,0.05074,-1.367361,0.427768
2024-01-02,0.756337,0.799143,-0.007637,0.414511
2024-01-03,-0.149036,-1.428812,0.456903,0.617605
2024-01-04,-0.786773,-1.47539,1.498043,0.014304


In [36]:
sd["E"]=[1,2,3,4]
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E
2024-01-05,-0.404927,0.05074,-1.367361,0.427768,1
2024-01-02,0.756337,0.799143,-0.007637,0.414511,2
2024-01-03,-0.149036,-1.428812,0.456903,0.617605,3
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4


## Missing Data

In [38]:
sd["f"]=[9,np.nan,np.nan,1]
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,-0.404927,0.05074,-1.367361,0.427768,1,9.0
2024-01-02,0.756337,0.799143,-0.007637,0.414511,2,
2024-01-03,-0.149036,-1.428812,0.456903,0.617605,3,
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4,1.0


In [39]:
#drp all rows having nan
sd.dropna(how="any")

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,-0.404927,0.05074,-1.367361,0.427768,1,9.0
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4,1.0


In [40]:
# drop all nan rows with specific column having nan
sd.dropna(subset=['f'])

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,-0.404927,0.05074,-1.367361,0.427768,1,9.0
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4,1.0


In [41]:
#fill 5 in placee of na
sd.fillna(5).astype(int)

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,0,0,-1,0,1,9
2024-01-02,0,0,0,0,2,5
2024-01-03,0,-1,0,0,3,5
2024-01-04,0,-1,1,0,4,1


In [42]:
# replace nan with mean value of that column
sd.fillna(sd['Slot 1'].mean())

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,-0.404927,0.05074,-1.367361,0.427768,1,9.0
2024-01-02,0.756337,0.799143,-0.007637,0.414511,2,-0.1461
2024-01-03,-0.149036,-1.428812,0.456903,0.617605,3,-0.1461
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4,1.0


In [43]:
#position having na value get replced by true other with false
sd.isna()

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,False,False,False,False,False,False
2024-01-02,False,False,False,False,False,True
2024-01-03,False,False,False,False,False,True
2024-01-04,False,False,False,False,False,False


In [44]:
sd.mean()

Slot 1   -0.146100
Slot 2   -0.513580
Slot 3    0.144987
Slot 4    0.368547
E         2.500000
f         5.000000
dtype: float64

In [45]:
sd['Slot 1'].mean() # mean of a specific columns

-0.1460998754621612

In [46]:
sd.mean(axis=0)

Slot 1   -0.146100
Slot 2   -0.513580
Slot 3    0.144987
Slot 4    0.368547
E         2.500000
f         5.000000
dtype: float64

In [47]:
# to check weather there is nan value or not
sd.isna()

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,False,False,False,False,False,False
2024-01-02,False,False,False,False,False,True
2024-01-03,False,False,False,False,False,True
2024-01-04,False,False,False,False,False,False


## Operation

In [49]:
sd.mean(axis=1)

2024-01-05    1.451037
2024-01-02    0.792471
2024-01-03    0.499332
2024-01-04    0.708364
dtype: float64

In [50]:
sd.mean(axis=0)

Slot 1   -0.146100
Slot 2   -0.513580
Slot 3    0.144987
Slot 4    0.368547
E         2.500000
f         5.000000
dtype: float64

In [51]:
sd.mean()

Slot 1   -0.146100
Slot 2   -0.513580
Slot 3    0.144987
Slot 4    0.368547
E         2.500000
f         5.000000
dtype: float64

## Merge
Concat
pandas provides various facilities for easily combining together Series and DataFrame objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations.

In [53]:
sd

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,-0.404927,0.05074,-1.367361,0.427768,1,9.0
2024-01-02,0.756337,0.799143,-0.007637,0.414511,2,
2024-01-03,-0.149036,-1.428812,0.456903,0.617605,3,
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4,1.0


In [97]:
pieces = [sd[:3], sd[2:4], sd[3:]]
pd.concat(pieces)

Unnamed: 0,Slot 1,Slot 2,Slot 3,Slot 4,E,f
2024-01-05,-0.404927,0.05074,-1.367361,0.427768,1,9.0
2024-01-02,0.756337,0.799143,-0.007637,0.414511,2,
2024-01-03,-0.149036,-1.428812,0.456903,0.617605,3,
2024-01-03,-0.149036,-1.428812,0.456903,0.617605,3,
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4,1.0
2024-01-04,-0.786773,-1.47539,1.498043,0.014304,4,1.0
