In [None]:
!pip install pandas



In [None]:
import pandas as pd
import numpy as np

#1. Working with Pandas Series

#### a) Creating Series

A Pandas Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floats, etc.). It is similar to a column in a spreadsheet or a database table, with labels (called the index) associated with each value, allowing for intuitive data alignment and access. Series are foundational in pandas, serving as the building blocks for more complex data structures like DataFrames.

**Series Through list**

In [None]:
pd.__version__

'2.2.2'

In [None]:
lst = [1,2,3,4,5]
print(pd.Series(lst))

0    1
1    2
2    3
3    4
4    5
dtype: int64


**Series through Numpy Array**

In [None]:
arr = np.array([1,2,3,4,5])

print(pd.Series(arr))


0    1
1    2
2    3
3    4
4    5
dtype: int64


**Providing a custom index**

In [None]:
print(pd.Series(index = [1,2,3,4,5], data = ['Ainadri', 'Ana', 'Soumya', 'Subhankar', 'Ritesh']))

1      Ainadri
2          Ana
3       Soumya
4    Subhankar
5       Ritesh
dtype: object


**Series Through dictionary values**

In [None]:
steps = {'day_1': 4000, 'day_2': 4200,'day_3': 4400,'day_4': 4700}
print(pd.Series(steps))

day_1    4000
day_2    4200
day_3    4400
day_4    4700
dtype: int64


**Using `repeat()` function along with creating a series**

In [None]:
pd.Series(5).repeat(3)

Unnamed: 0,0
0,5
0,5
0,5


We can use the `reset_index()` function to renumber the index sequentially (0, 1, 2, …)

In [None]:
pd.Series(5).repeat(3).reset_index(drop = True)  #the drop=True is used to discard the old index after calling reset_index().



Unnamed: 0,0
0,5
1,5
2,5


In [None]:
s = pd.Series([10,20]).repeat([5,3]).reset_index(drop = True)
s

Unnamed: 0,0
0,10
1,10
2,10
3,10
4,10
5,20
6,20
7,20


**Accessing elements**

In [None]:
print(s[0])

10


In [None]:
print(s[-1:])

7    20
dtype: int64


#### b) Aggregate function on pandas Series

Aggregate functions on a Pandas Series perform summary computations, such as calculating statistics like sum, mean, median, min, max, count, and standard deviation.

In [None]:
sr = pd.Series([1,2,3,4,5,6,7])
print(sr.agg(['min', 'max', 'sum', 'mean', 'median','count']))

min        1.0
max        7.0
sum       28.0
mean       4.0
median     4.0
count      7.0
dtype: float64


#### c) Series absolute function

The absolute function on a Pandas Series, accessed using .abs(), returns a new Series with the absolute (non-negative) values of each element. It is commonly used to eliminate negative signs from numerical data, enabling easier comparison, analysis, or aggregation without regard to direction.



In [None]:
sr = pd.Series([1,-2,-3,-4,-5,6,7])
print(sr.abs())

0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int64


#### d) Concatenating Series

Concatenating Pandas Series is the process of combining two or more Series objects into a single Series, typically done using `pd.concat()`. It preserves the original indices unless they are reset, allowing for flexible data stacking.

Syntax: `pd.concat([series1, series2])`

In [None]:
sr1 = pd.Series([1,2,3,4,5,6,7])
sr2 = pd.Series([1,-2,-5,6,7])
print(pd.concat([sr1,sr2]).reset_index(drop = True))

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     1
8    -2
9    -5
10    6
11    7
dtype: int64


#### e) Astype function

The `.astype()` function in Pandas is used to convert the data type of a Series to a specified type, such as int, float, str, or even custom types.

Syntax: `series.astype(dtype)`

In [None]:
print(type(sr1[1]))

<class 'numpy.int64'>


In [None]:
sr2 = pd.Series([1,-2,-5,6,7])

sr2.astype('float')

Unnamed: 0,0
0,1.0
1,-2.0
2,-5.0
3,6.0
4,7.0


#### f) Between function
The `.between()` function in Pandas is used to check whether each element in a Series lies between two boundary values, inclusive by default. It returns a Boolean Series.

Syntax: `series.between(lower, upper, inclusive='both')`

* `lower`: lower bound

* `upper`: upper bound

* `inclusive`: `'both'` (default), `'left'`, `'right'`, or `'neither'`

In [None]:
sr1 = pd.Series([1,2,3,4,5,9])
sr1.between(4,11)

Unnamed: 0,0
0,False
1,False
2,False
3,True
4,True
5,True


#### g) All string functions can be used to extract or modify texts in a series

* Upper and Lower Function
* Len Function
* Strip Function
* Split Function
* Contains Function
* Replace Function
* Count Function
* Startswith and Endswith Function
* Find Function

In [None]:
ser = pd.Series(['  Eshant Das  ', '  Data Science  ', '  Hello World  ', '  Artificial Intelligence    ', '   Machine Learning  '])


Upper and Lower Function

In [None]:
print(ser.str.upper())

print('-' * 30 )

print(ser.str.lower())

0                     ESHANT DAS  
1                   DATA SCIENCE  
2                    HELLO WORLD  
3      ARTIFICIAL INTELLIGENCE    
4               MACHINE LEARNING  
dtype: object
------------------------------
0                     eshant das  
1                   data science  
2                    hello world  
3      artificial intelligence    
4               machine learning  
dtype: object


`len()`

In [None]:
for i in ser:
  print(i , len(i))


  Eshant Das   14
  Data Science   16
  Hello World   15
  Artificial Intelligence     29
   Machine Learning   21


`strip()`

In [None]:
print(ser)

0                     Eshant Das  
1                   Data Science  
2                    Hello World  
3      Artificial Intelligence    
4               Machine Learning  
dtype: object


In [None]:
print(ser.str.strip())

0                 Eshant Das
1               Data Science
2                Hello World
3    Artificial Intelligence
4           Machine Learning
dtype: object


`split()`

In [None]:
print(pd.Series(['10/3/1983', '10/4/1994', '29/10/1998']))

print('-' * 30 )

print(pd.Series(['10/3/1983', '10/4/1994', '29/10/1998']).str.split())

print('-' * 30 )

print(pd.Series(['10/3/1983', '10/4/1994', '29/10/1998']).str.split('/'))



0     10/3/1983
1     10/4/1994
2    29/10/1998
dtype: object
------------------------------
0     [10/3/1983]
1     [10/4/1994]
2    [29/10/1998]
dtype: object
------------------------------
0     [10, 3, 1983]
1     [10, 4, 1994]
2    [29, 10, 1998]
dtype: object


In [None]:
ser.str.split()

Unnamed: 0,0
0,"[Eshant, Das]"
1,"[Data, Science]"
2,"[Hello, World]"
3,"[Artificial, Intelligence]"
4,"[Machine, Learning]"


`contains()`

In [None]:
ser = pd.Series(['   Eshant Das', 'Data Science', 'Hello World', 'Artificial Intelligence', 'Machine Learning'])
ser.str.contains("i")

Unnamed: 0,0
0,False
1,True
2,False
3,True
4,True


`replace()`

In [None]:
ser.str.replace('i','*')

Unnamed: 0,0
0,Eshant Das
1,Data Sc*ence
2,Hello World
3,Art*f*c*al Intell*gence
4,Mach*ne Learn*ng


`count()`

In [None]:
ser.str.count('a')

Unnamed: 0,0
0,2
1,2
2,0
3,1
4,2


`startswith()` and `endswith()`

In [None]:
ser.str.endswith('ce')

Unnamed: 0,0
0,False
1,True
2,False
3,True
4,False


In [None]:
ser.str.startswith('A')

Unnamed: 0,0
0,False
1,False
2,False
3,True
4,False


`find()`

In [None]:
ser = pd.Series(['   Eshant Das', 'Data Science', 'Hello World', 'Artificial Intelligence', 'Machine Learning'])

ser.str.find('a')

Unnamed: 0,0
0,6
1,1
2,-1
3,8
4,1


#### h) Converting a Series to List

In [None]:
ser.to_list()

['   Eshant Das',
 'Data Science',
 'Hello World',
 'Artificial Intelligence',
 'Machine Learning']

# 2. Pandas Dataframe

#### a) Creating Data Frames

Creating a dataframe using a list

In [None]:
lst = ['Ainadri', 'Mandal', 'Subhankar', 'Saha', 'Soumya', 'Panda']

pd.DataFrame(lst)

Unnamed: 0,0
0,Ainadri
1,Mandal
2,Subhankar
3,Saha
4,Soumya
5,Panda


In [None]:
lst = [['tom', 10],['jerry', 12], ['spike', 14]]
pd.DataFrame(lst)

Unnamed: 0,0,1
0,tom,10
1,jerry,12
2,spike,14


Creating DataFrame from dict of ndarray/lists:

In [None]:
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}
pd.DataFrame(data)

Unnamed: 0,Name,Age
0,Tom,28
1,Jack,34
2,Steve,29
3,Ricky,42


Creating a DataFrame with more columns

In [None]:
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
        'Age': [28, 34, 29, 42],
        'Gender': ['M', 'M', 'M', 'F'],
        'City': ['Delhi', 'Mumbai', 'Goa', 'Kerala'],
        'Qualification': ['MSc', 'MA', 'MCA', 'Phd']}
df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,Gender,City,Qualification
0,Tom,28,M,Delhi,MSc
1,Jack,34,M,Mumbai,MA
2,Steve,29,M,Goa,MCA
3,Ricky,42,F,Kerala,Phd


In [None]:
df[['Name', 'Age', 'City']]

Unnamed: 0,Name,Age,City
0,Tom,28,Delhi
1,Jack,34,Mumbai
2,Steve,29,Goa
3,Ricky,42,Kerala


#### **b) Slicing in DataFrames Using iloc and loc**

In [None]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df.index = ['a', 'b', 'c', 'd']
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


#### **Basic `loc[]` Operations**

`.loc[]` is used to get or change data in a DataFrame using the row and column names (labels), not numbers.

 **Syntax** : `df.loc[row_label, column_label]`

In [None]:
df.loc['a':'c']

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000


In [None]:
df.loc['a':]

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


In [None]:
df.loc['a': 'c', 'one']

Unnamed: 0,one
a,1
b,2
c,3


In [None]:
df.loc['a': , ['one','three']]

Unnamed: 0,one,three
a,1,100
b,2,200
c,3,300
d,4,400


In [None]:
df.loc['a': , 'one': 'three']

Unnamed: 0,one,two,three
a,1,10,100
b,2,20,200
c,3,30,300
d,4,40,400


#### **Basic `iloc[]` Operations**

`.iloc[]` is used to get or change data in a DataFrame using integer positions (0-based row and column numbers), not labels.

**Syntax**: `DataFrame.iloc[row_position, column_position]`

**Note** : the end position of a slice is exclusive in `iloc[]`, unlike `loc[]`, where the end label is inclusive

In [None]:
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


In [None]:
df.iloc[1:2]

Unnamed: 0,one,two,three,four
b,2,20,200,2000


In [None]:
df.iloc[1: , 2: ]

Unnamed: 0,three,four
b,200,2000
c,300,3000
d,400,4000


In [None]:
df.iloc[ : , 2:3]

Unnamed: 0,three
a,100
b,200
c,300
d,400


In [None]:
df.iloc[[0,2]]

Unnamed: 0,one,two,three,four
a,1,10,100,1000
c,3,30,300,3000


In [None]:
df.iloc[[0,2],[0,2]]

Unnamed: 0,one,three
a,1,100
c,3,300


#### **c) Slicing Using conditions**



In [None]:
df['two'] > 20

Unnamed: 0,two
a,False
b,False
c,True
d,True


In [None]:
df[df['two']>20]

Unnamed: 0,one,two,three,four
c,3,30,300,3000
d,4,40,400,4000


In [None]:
df.loc[df['two']> 20, ['three', 'four']]

Unnamed: 0,three,four
c,300,3000
d,400,4000


In [None]:
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


In [None]:
df.loc[df['three'] > 200, ['one', 'three', 'four']]

Unnamed: 0,one,three,four
c,3,300,3000
d,4,400,4000


In [None]:
df.loc[(df['three'] > 200) & (df['two']>30)]

Unnamed: 0,one,two,three,four
d,4,40,400,4000


#### **c) Adding new Column in DataFrame**

In [None]:
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


**Using a list**

In [None]:
# As we need four values for the new column

l = [12, 34, 56, 78]
df['five'] = l

df

Unnamed: 0,one,two,three,four,five
a,1,10,100,1000,12
b,2,20,200,2000,34
c,3,30,300,3000,56
d,4,40,400,4000,78


**Using a series**

In [None]:
sr = pd.Series([111,222,333,444])

# sr carries the default integer index (0..3) while df is indexed by 'a'..'d';
# assigning sr directly would align on labels and fill the column with NaN,
# so the raw values are passed to assign by position instead.
df['six'] = sr.values

df

Unnamed: 0,one,two,three,four,five,six
a,1,10,100,1000,12,111
b,2,20,200,2000,34,222
c,3,30,300,3000,56,333
d,4,40,400,4000,78,444


**Using an existing Column**

Like incrementing values of column by 10

In [None]:
df

Unnamed: 0,one,two,three,four,five,six
a,1,10,100,1000,12,111
b,2,20,200,2000,34,222
c,3,30,300,3000,56,333
d,4,40,400,4000,78,444


In [None]:
df['seven'] = df['two'] + 10

df

Unnamed: 0,one,two,three,four,five,six,seven
a,1,10,100,1000,12,111,20
b,2,20,200,2000,34,222,30
c,3,30,300,3000,56,333,40
d,4,40,400,4000,78,444,50


#### **d) Column Deletion**

Two ways:


*   Using del





In [None]:
del df['six']

In [None]:
df

Unnamed: 0,one,two,three,four,five,seven
a,1,10,100,1000,12,20
b,2,20,200,2000,34,30
c,3,30,300,3000,56,40
d,4,40,400,4000,78,50


*   Using pop

In [None]:
df.pop('seven')

Unnamed: 0,seven
a,20
b,30
c,40
d,50


In [None]:
df

Unnamed: 0,one,two,three,four,five
a,1,10,100,1000,12
b,2,20,200,2000,34
c,3,30,300,3000,56
d,4,40,400,4000,78


#### **e) Addition of rows**

We can add rows in DataFrame using concat method.

In [None]:
df1 = pd.DataFrame([[1,2],[3,4]], columns = ['a','b'])
df2 = pd.DataFrame([[5,6],[7,8]], columns = ['a','b'])

df = pd.concat([df1, df2]).reset_index(drop = True)
df

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6
3,7,8


#### **f) Pandas Drop Function**

In [None]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


* **axis = 0 => Rows (row- wise)**

In [None]:
# inplace=True modifies df itself; without it, drop() returns a new DataFrame
# and df is left unchanged.
df.drop([0,1], axis = 0, inplace = True)
df

Unnamed: 0,one,two,three,four
2,3,30,300,3000
3,4,40,400,4000


* **axis = 1 => Columns (column- wise)**

In [None]:
df.drop(['one','three'], axis = 1, inplace = True)
df

Unnamed: 0,two,four
2,30,3000
3,40,4000


#### **g) Transposing a data frame**

In [None]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [None]:
df.T

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400
four,1000,2000,3000,4000


#### **h) A set of additional DataFrame functionalities**

In [None]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


#### 1. `axes` attribute

The `.axes` attribute in a Pandas DataFrame **returns** a list with the **row** and **column labels** of the DataFrame. The first element of the list is the row labels (index), and the second element is the column labels.

In [85]:
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['one', 'two', 'three', 'four'], dtype='object')]

#### 2. `ndim` attribute

The `.ndim` attribute in a Pandas DataFrame returns the number of dimensions of the dataframe, which is always 2 for DataFrame (row-column-format).

In [86]:
df.ndim

2

#### 3. `dtypes`

In [87]:
df.dtypes

Unnamed: 0,0
one,int64
two,int64
three,int64
four,int64


#### 4. `shape` attribute

In [88]:
df.shape

(4, 4)

In [91]:
d = {'Name'  :pd.Series(['Tom','Jerry','Spike', 'Popeye', 'Olive', 'Bluto', 'Mickey']),
     'Age'   :pd.Series([10,12,14,30,28,33,15]),
     'Height':pd.Series([3.25,1.11,4.12,5.47,6.15,6.67,2.61])}
df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Height
0,Tom,10,3.25
1,Jerry,12,1.11
2,Spike,14,4.12
3,Popeye,30,5.47
4,Olive,28,6.15
5,Bluto,33,6.67
6,Mickey,15,2.61


#### 5. head() function

In [92]:
df.head()

Unnamed: 0,Name,Age,Height
0,Tom,10,3.25
1,Jerry,12,1.11
2,Spike,14,4.12
3,Popeye,30,5.47
4,Olive,28,6.15


#### 6. tail() function

In [94]:
df.tail(3)

Unnamed: 0,Name,Age,Height
4,Olive,28,6.15
5,Bluto,33,6.67
6,Mickey,15,2.61


#### 7. `empty` attribute

In [97]:
df.empty

False

# i) Statistical or Mathematical Functions

Sum, Mean, Median, Mode, Variance, Min, Max, Standard Deviation.

In [98]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


1. Sum

In [102]:
df.sum()

Unnamed: 0,0
one,10
two,100
three,1000
four,10000


In [101]:
df.sum(axis = 1)

Unnamed: 0,0
0,1111
1,2222
2,3333
3,4444


2. Mean

In [103]:
df.mean()

Unnamed: 0,0
one,2.5
two,25.0
three,250.0
four,2500.0


In [104]:
df.mean(axis = 1)

Unnamed: 0,0
0,277.75
1,555.5
2,833.25
3,1111.0


3. Median

In [105]:
df.median()

Unnamed: 0,0
one,2.5
two,25.0
three,250.0
four,2500.0


In [107]:
df.median(axis = 1)

Unnamed: 0,0
0,55.0
1,110.0
2,165.0
3,220.0


4. Mode

In [108]:
de = pd.DataFrame({'A': [1,2,3,4,4,4,4,5], 'B': [10,20,30,20,40,40,50,60]})
de

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30
3,4,20
4,4,40
5,4,40
6,4,50
7,5,60


In [110]:
de['A'].mode()

Unnamed: 0,A
0,4


In [111]:
de['B'].mode()

Unnamed: 0,B
0,20
1,40


5. Variance

In [112]:
df.var()

Unnamed: 0,0
one,1.666667
two,166.6667
three,16666.67
four,1666667.0


6. Min

In [113]:
df.min()

Unnamed: 0,0
one,1
two,10
three,100
four,1000


7. Max

In [114]:
df.max()

Unnamed: 0,0
one,4
two,40
three,400
four,4000


8. Standard Deviation

In [115]:
df.std()

Unnamed: 0,0
one,1.290994
two,12.909944
three,129.099445
four,1290.994449


# j) Describe Function

In [116]:
data = {'one'  : pd.Series([1,2,3,4]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four' : pd.Series([1000,2000,3000,4000]),
        'five' : pd.Series(['A','B','C','D']) }
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four,five
0,1,10,100,1000,A
1,2,20,200,2000,B
2,3,30,300,3000,C
3,4,40,400,4000,D


In [117]:
df.describe()

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,2.5,25.0,250.0,2500.0
std,1.290994,12.909944,129.099445,1290.994449
min,1.0,10.0,100.0,1000.0
25%,1.75,17.5,175.0,1750.0
50%,2.5,25.0,250.0,2500.0
75%,3.25,32.5,325.0,3250.0
max,4.0,40.0,400.0,4000.0


# k) Pipe Functions

The `pipe()` method in Pandas DataFrame allows you to apply a function to the DataFrame, similar to the way the `apply()` method works. The difference is that `pipe()` passes the whole DataFrame to the function and lets you chain multiple operations together by passing the output of one function to the input of the next function.

In [119]:
data = {'one'  : pd.Series([1,2,3,4]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four' : pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [120]:
def add_(i, j):
    """Return ``i + j``.

    Written as a two-argument helper so it can be handed to
    ``DataFrame.pipe`` together with the value to add.
    """
    total = i + j
    return total

def sub_(i, j):
    """Return ``i - j``."""
    difference = i - j
    return difference

`Example 1`

In [121]:
df.pipe(add_, 10)

Unnamed: 0,one,two,three,four
0,11,20,110,1010
1,12,30,210,2010
2,13,40,310,3010
3,14,50,410,4010


`Example 2`

In [122]:
def mean_(col):
    """Return ``col.mean()``.

    Despite the parameter name, ``DataFrame.pipe`` passes the whole
    DataFrame in, so the result there is the mean of every column.
    """
    average = col.mean()
    return average

def square_(i):
    """Return ``i`` raised to the power of 2."""
    squared = i ** 2
    return squared

df.pipe(mean_)

Unnamed: 0,0
one,2.5
two,25.0
three,250.0
four,2500.0


In [125]:
df.pipe(mean_).pipe(square_)   # square of each column's mean (not the mean of the squares)

Unnamed: 0,0
one,6.25
two,625.0
three,62500.0
four,6250000.0


# l) `apply()` function

The `apply()` method in Pandas DataFrame allows you to apply a function to the DataFrame, either a built-in Python function or a user-defined function.