In [333]:
!pip install pandas



In [334]:
import pandas as pd
import numpy as np

# 1. Working with Pandas Series

#### a) Creating Series

A Pandas Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floats, etc.). It is similar to a column in a spreadsheet or a database table, with labels (called the index) associated with each value, allowing for intuitive data alignment and access. Series are foundational in pandas, serving as the building blocks for more complex data structures like DataFrames.

**Series Through list**

In [335]:
pd.__version__

'2.2.2'

In [336]:
lst = [1,2,3,4,5]
print(pd.Series(lst))

0    1
1    2
2    3
3    4
4    5
dtype: int64


**Series through Numpy Array**

In [337]:
arr = np.array([1,2,3,4,5])

print(pd.Series(arr))


0    1
1    2
2    3
3    4
4    5
dtype: int64


**Giving Index from my end**

In [338]:
print(pd.Series(index = [1,2,3,4,5], data = ['Ainadri', 'Ana', 'Soumya', 'Subhankar', 'Ritesh']))

1      Ainadri
2          Ana
3       Soumya
4    Subhankar
5       Ritesh
dtype: object


**Series Through dictionary values**

In [339]:
steps = {'day_1': 4000, 'day_2': 4200,'day_3': 4400,'day_4': 4700}
print(pd.Series(steps))

day_1    4000
day_2    4200
day_3    4400
day_4    4700
dtype: int64


**Using `repeat()` function along with creating a series**

In [340]:
pd.Series(5).repeat(3)

Unnamed: 0,0
0,5
0,5
0,5


We can use the `reset_index()` method to renumber the index sequentially (0, 1, 2, …).

In [341]:
pd.Series(5).repeat(3).reset_index(drop = True)  #the drop=True is used to discard the old index after calling reset_index().



Unnamed: 0,0
0,5
1,5
2,5


In [342]:
# A list passed to repeat() gives one count per element: 10 is repeated
# 5 times and 20 is repeated 3 times; reset_index(drop=True) then renumbers 0..7.
s = pd.Series([10,20]).repeat([5,3]).reset_index(drop = True)
s

Unnamed: 0,0
0,10
1,10
2,10
3,10
4,10
5,20
6,20
7,20


**Accessing elements**

In [343]:
print(s[0])

10


In [344]:
print(s[-1:])

7    20
dtype: int64


#### b) Aggregate function on pandas Series

Aggregate functions on a Pandas Series perform summary computations, such as calculating statistics like sum, mean, median, min, max, count, and standard deviation.

In [345]:
sr = pd.Series([1,2,3,4,5,6,7])
print(sr.agg(['min', 'max', 'sum', 'mean', 'median','count']))

min        1.0
max        7.0
sum       28.0
mean       4.0
median     4.0
count      7.0
dtype: float64


#### c) Series absolute function

The absolute function on a Pandas Series, accessed using .abs(), returns a new Series with the absolute (non-negative) values of each element. It is commonly used to eliminate negative signs from numerical data, enabling easier comparison, analysis, or aggregation without regard to direction.



In [346]:
sr = pd.Series([1,-2,-3,-4,-5,6,7])
print(sr.abs())

0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int64


#### d) Concatenating Series

Concatenating Pandas Series is the process of combining two or more Series objects into a single Series, typically done with `pd.concat()`. The original indices are preserved unless they are reset (for example with `.reset_index(drop=True)`), allowing for flexible data stacking.

Syntax: `pd.concat([series1, series2])`

In [347]:
sr1 = pd.Series([1,2,3,4,5,6,7])
sr2 = pd.Series([1,-2,-5,6,7])
print(pd.concat([sr1,sr2]).reset_index(drop = True))

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     1
8    -2
9    -5
10    6
11    7
dtype: int64


#### e) Astype function

The `.astype()` function in Pandas is used to convert the data type of a Series to a specified type, such as int, float, str, or even custom types.

Syntax: `series.astype(dtype)`

In [348]:
print(type(sr1[1]))

<class 'numpy.int64'>


In [349]:
sr2 = pd.Series([1,-2,-5,6,7])

sr2.astype('float')

Unnamed: 0,0
0,1.0
1,-2.0
2,-5.0
3,6.0
4,7.0


#### f) Between function
The `.between()` function in Pandas is used to check whether each element in a Series lies between two boundary values, inclusive by default. It returns a Boolean Series.

Syntax: `series.between(lower, upper, inclusive='both')`

* `lower`: lower bound

* `upper`: upper bound

* `inclusive`: `'both'` (default), `'left'`, `'right'`, or `'neither'`

In [350]:
sr1 = pd.Series([1,2,3,4,5,9])
sr1.between(4,11)

Unnamed: 0,0
0,False
1,False
2,False
3,True
4,True
5,True


#### g) All string functions can be used to extract or modify texts in a series

* Upper and Lower Function
* Len Function
* Strip Function
* Split Function
* Contains Function
* Replace Function
* Count Function
* Startswith and Endswith Function
* Find Function

In [351]:
ser = pd.Series(['  Eshant Das  ', '  Data Science  ', '  Hello World  ', '  Artificial Intelligence    ', '   Machine Learning  '])


Upper and Lower Function

In [352]:
print(ser.str.upper())

print('-' * 30 )

print(ser.str.lower())

0                     ESHANT DAS  
1                   DATA SCIENCE  
2                    HELLO WORLD  
3      ARTIFICIAL INTELLIGENCE    
4               MACHINE LEARNING  
dtype: object
------------------------------
0                     eshant das  
1                   data science  
2                    hello world  
3      artificial intelligence    
4               machine learning  
dtype: object


`len()`

In [353]:
# Element-wise lengths come from the vectorised .str accessor (leading and
# trailing spaces count towards the length); the loop only pairs each string
# with its length for printing.
for text, length in zip(ser, ser.str.len()):
  print(text , length)


  Eshant Das   14
  Data Science   16
  Hello World   15
  Artificial Intelligence     29
   Machine Learning   21


`strip()`

In [354]:
print(ser)

0                     Eshant Das  
1                   Data Science  
2                    Hello World  
3      Artificial Intelligence    
4               Machine Learning  
dtype: object


In [355]:
print(ser.str.strip())

0                 Eshant Das
1               Data Science
2                Hello World
3    Artificial Intelligence
4           Machine Learning
dtype: object


`split()`

In [356]:
# Build the sample once and reuse it for all three views.
date_strings = pd.Series(['10/3/1983', '10/4/1994', '29/10/1998'])
separator = '-' * 30

# 1) the untouched strings
print(date_strings)
print(separator)

# 2) split() with no argument splits on whitespace, so each date stays whole
print(date_strings.str.split())
print(separator)

# 3) split('/') breaks every date into its '/'-separated pieces
print(date_strings.str.split('/'))



0     10/3/1983
1     10/4/1994
2    29/10/1998
dtype: object
------------------------------
0     [10/3/1983]
1     [10/4/1994]
2    [29/10/1998]
dtype: object
------------------------------
0     [10, 3, 1983]
1     [10, 4, 1994]
2    [29, 10, 1998]
dtype: object


In [357]:
ser.str.split()

Unnamed: 0,0
0,"[Eshant, Das]"
1,"[Data, Science]"
2,"[Hello, World]"
3,"[Artificial, Intelligence]"
4,"[Machine, Learning]"


`contains()`

In [358]:
ser = pd.Series(['   Eshant Das', 'Data Science', 'Hello World', 'Artificial Intelligence', 'Machine Learning'])
ser.str.contains("i")

Unnamed: 0,0
0,False
1,True
2,False
3,True
4,True


`replace()`

In [359]:
ser.str.replace('i','*')

Unnamed: 0,0
0,Eshant Das
1,Data Sc*ence
2,Hello World
3,Art*f*c*al Intell*gence
4,Mach*ne Learn*ng


`count()`

In [360]:
ser.str.count('a')

Unnamed: 0,0
0,2
1,2
2,0
3,1
4,2


`startswith()` and `endswith()`

In [361]:
ser.str.endswith('ce')

Unnamed: 0,0
0,False
1,True
2,False
3,True
4,False


In [362]:
ser.str.startswith('A')

Unnamed: 0,0
0,False
1,False
2,False
3,True
4,False


`find()`

In [363]:
ser = pd.Series(['   Eshant Das', 'Data Science', 'Hello World', 'Artificial Intelligence', 'Machine Learning'])

ser.str.find('a')

Unnamed: 0,0
0,6
1,1
2,-1
3,8
4,1


#### h) Converting a Series to List

In [364]:
ser.to_list()

['   Eshant Das',
 'Data Science',
 'Hello World',
 'Artificial Intelligence',
 'Machine Learning']

# 2. Pandas Dataframe

#### a) Creating Data Frames

Creating a dataframe using a list

In [365]:
lst = ['Ainadri', 'Mandal', 'Subhankar', 'Saha', 'Soumya', 'Panda']

pd.DataFrame(lst)

Unnamed: 0,0
0,Ainadri
1,Mandal
2,Subhankar
3,Saha
4,Soumya
5,Panda


In [366]:
lst = [['tom', 10],['jerry', 12], ['spike', 14]]
pd.DataFrame(lst)

Unnamed: 0,0,1
0,tom,10
1,jerry,12
2,spike,14


Creating DataFrame from dict of ndarray/lists:

In [367]:
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}
pd.DataFrame(data)

Unnamed: 0,Name,Age
0,Tom,28
1,Jack,34
2,Steve,29
3,Ricky,42


How to make it multidimensional

In [368]:
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
        'Age': [28, 34, 29, 42],
        'Gender': ['M', 'M', 'M', 'F'],
        'City': ['Delhi', 'Mumbai', 'Goa', 'Kerala'],
        'Qualification': ['MSc', 'MA', 'MCA', 'Phd']}
df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,Gender,City,Qualification
0,Tom,28,M,Delhi,MSc
1,Jack,34,M,Mumbai,MA
2,Steve,29,M,Goa,MCA
3,Ricky,42,F,Kerala,Phd


In [369]:
df[['Name', 'Age', 'City']]

Unnamed: 0,Name,Age,City
0,Tom,28,Delhi
1,Jack,34,Mumbai
2,Steve,29,Goa
3,Ricky,42,Kerala


#### **b) Slicing in DataFrames Using iloc and loc**

In [370]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df.index = ['a', 'b', 'c', 'd']
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


#### **Basic `loc[]` Operations**

`.loc[]` is used to get or change data in a DataFrame using the row and column names (labels), not numbers.

 **Syntax** : `df.loc[row_label, column_label]`

In [371]:
df.loc['a':'c']

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000


In [372]:
df.loc['a':]

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


In [373]:
df.loc['a': 'c', 'one']

Unnamed: 0,one
a,1
b,2
c,3


In [374]:
df.loc['a': , ['one','three']]

Unnamed: 0,one,three
a,1,100
b,2,200
c,3,300
d,4,400


In [375]:
df.loc['a': , 'one': 'three']

Unnamed: 0,one,two,three
a,1,10,100
b,2,20,200
c,3,30,300
d,4,40,400


#### **Basic `iloc[]` Operations**

`.iloc[]` is used to get or change data in a DataFrame using integer positions (0-based row and column numbers), not labels.

**Syntax**: `DataFrame.iloc[row_position, column_position]`

**Note** : with `iloc[]` the end position of a slice is exclusive (like regular Python slicing), unlike `loc[]`, where the end label is inclusive.

In [376]:
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


In [377]:
df.iloc[1:2]

Unnamed: 0,one,two,three,four
b,2,20,200,2000


In [378]:
df.iloc[1: , 2: ]

Unnamed: 0,three,four
b,200,2000
c,300,3000
d,400,4000


In [379]:
df.iloc[ : , 2:3]

Unnamed: 0,three
a,100
b,200
c,300
d,400


In [380]:
df.iloc[[0,2]]

Unnamed: 0,one,two,three,four
a,1,10,100,1000
c,3,30,300,3000


In [381]:
df.iloc[[0,2],[0,2]]

Unnamed: 0,one,three
a,1,100
c,3,300


#### **c) Slicing Using conditions**



In [382]:
df['two'] > 20

Unnamed: 0,two
a,False
b,False
c,True
d,True


In [383]:
df[df['two']>20]

Unnamed: 0,one,two,three,four
c,3,30,300,3000
d,4,40,400,4000


In [384]:
df.loc[df['two']> 20, ['three', 'four']]

Unnamed: 0,three,four
c,300,3000
d,400,4000


In [385]:
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


In [386]:
df.loc[df['three'] > 200, ['one', 'three', 'four']]

Unnamed: 0,one,three,four
c,3,300,3000
d,4,400,4000


In [387]:
df.loc[(df['three'] > 200) & (df['two']>30)]

Unnamed: 0,one,two,three,four
d,4,40,400,4000


#### **c) Adding new Column in DataFrame**

In [388]:
df

Unnamed: 0,one,two,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000


**Using a list**

In [389]:
# As we need four values for the new column

l = [12, 34, 56, 78]
df['five'] = l

df

Unnamed: 0,one,two,three,four,five
a,1,10,100,1000,12
b,2,20,200,2000,34
c,3,30,300,3000,56
d,4,40,400,4000,78


**Using a series**

In [390]:
sr = pd.Series([111,222,333,444])

# sr is indexed 0..3 while df is indexed 'a'..'d'. Assigning the Series itself
# would align on those labels; `.values` hands over the raw array so the
# values are placed positionally instead.
df['six'] = sr.values

df

Unnamed: 0,one,two,three,four,five,six
a,1,10,100,1000,12,111
b,2,20,200,2000,34,222
c,3,30,300,3000,56,333
d,4,40,400,4000,78,444


**Using an existing Column**

Like incrementing values of column by 10

In [391]:
df

Unnamed: 0,one,two,three,four,five,six
a,1,10,100,1000,12,111
b,2,20,200,2000,34,222
c,3,30,300,3000,56,333
d,4,40,400,4000,78,444


In [392]:
df['seven'] = df['two'] + 10

df

Unnamed: 0,one,two,three,four,five,six,seven
a,1,10,100,1000,12,111,20
b,2,20,200,2000,34,222,30
c,3,30,300,3000,56,333,40
d,4,40,400,4000,78,444,50


#### **d) Column Deletion**

Two ways:


*   Using del





In [393]:
del df['six']

In [394]:
df

Unnamed: 0,one,two,three,four,five,seven
a,1,10,100,1000,12,20
b,2,20,200,2000,34,30
c,3,30,300,3000,56,40
d,4,40,400,4000,78,50


*   Using pop

In [395]:
df.pop('seven')

Unnamed: 0,seven
a,20
b,30
c,40
d,50


In [396]:
df

Unnamed: 0,one,two,three,four,five
a,1,10,100,1000,12
b,2,20,200,2000,34
c,3,30,300,3000,56
d,4,40,400,4000,78


#### **e) Addition of rows**

We can add rows to a DataFrame using the `pd.concat()` function.

In [397]:
df1 = pd.DataFrame([[1,2],[3,4]], columns = ['a','b'])
df2 = pd.DataFrame([[5,6],[7,8]], columns = ['a','b'])

df = pd.concat([df1, df2]).reset_index(drop = True)
df

Unnamed: 0,a,b
0,1,2
1,3,4
2,5,6
3,7,8


#### **f) Pandas Drop Function**

In [398]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


* **axis = 0 => Rows (row- wise)**

In [399]:
# drop() returns a new DataFrame by default; inplace=True instead mutates df
# itself, so the change is visible when df is displayed below.
# Re-running this cell fails with a KeyError because labels 0 and 1 are
# already gone from df.
df.drop([0,1], axis = 0, inplace = True)
df

Unnamed: 0,one,two,three,four
2,3,30,300,3000
3,4,40,400,4000


* **axis = 1 => Columns (column- wise)**

In [400]:
df.drop(['one','three'], axis = 1, inplace = True)
df

Unnamed: 0,two,four
2,30,3000
3,40,4000


#### **g) Transposing a data frame**

In [401]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [402]:
df.T

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400
four,1000,2000,3000,4000


#### **h) A set for more DataFrame functionalities.**

In [403]:
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


##### 1. axes function

The `.axes` attribute in a Pandas DataFrame **returns** a list with the **row** and **column labels** of the DataFrame. The first element of the list is the row labels (index), and the second element is the column labels.

In [404]:
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['one', 'two', 'three', 'four'], dtype='object')]

##### 2. ndim Function

The `.ndim` attribute in a Pandas DataFrame returns the number of dimensions of the dataframe, which is always 2 for DataFrame (row-column-format).

In [405]:
df.ndim

2

##### 3. `dtypes`

In [406]:
df.dtypes

Unnamed: 0,0
one,int64
two,int64
three,int64
four,int64


##### 4. shape function

In [407]:
df.shape

(4, 4)

In [408]:
d = {'Name'  :pd.Series(['Tom','Jerry','Spike', 'Popeye', 'Olive', 'Bluto', 'Mickey']),
     'Age'   :pd.Series([10,12,14,30,28,33,15]),
     'Height':pd.Series([3.25,1.11,4.12,5.47,6.15,6.67,2.61])}
df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Height
0,Tom,10,3.25
1,Jerry,12,1.11
2,Spike,14,4.12
3,Popeye,30,5.47
4,Olive,28,6.15
5,Bluto,33,6.67
6,Mickey,15,2.61


##### 5. head() function

In [409]:
df.head()

Unnamed: 0,Name,Age,Height
0,Tom,10,3.25
1,Jerry,12,1.11
2,Spike,14,4.12
3,Popeye,30,5.47
4,Olive,28,6.15


##### 6. tail() function

In [410]:
df.tail(3)

Unnamed: 0,Name,Age,Height
4,Olive,28,6.15
5,Bluto,33,6.67
6,Mickey,15,2.61


##### 7. `empty` attribute

In [411]:
df.empty

False

#### i) Statistical or Mathematical Functions

Sum, Mean, Median, Mode, Variance, Min, Max, Standard Deviation.

In [412]:
data = {'one': pd.Series([1,2,3,4]),
        'two': pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four': pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


1. Sum

In [413]:
df.sum()

Unnamed: 0,0
one,10
two,100
three,1000
four,10000


In [414]:
df.sum(axis = 1)

Unnamed: 0,0
0,1111
1,2222
2,3333
3,4444


2. Mean

In [415]:
df.mean()

Unnamed: 0,0
one,2.5
two,25.0
three,250.0
four,2500.0


In [416]:
df.mean(axis = 1)

Unnamed: 0,0
0,277.75
1,555.5
2,833.25
3,1111.0


3. Median

In [417]:
df.median()

Unnamed: 0,0
one,2.5
two,25.0
three,250.0
four,2500.0


In [418]:
df.median(axis = 1)

Unnamed: 0,0
0,55.0
1,110.0
2,165.0
3,220.0


4. Mode

In [419]:
de = pd.DataFrame({'A': [1,2,3,4,4,4,4,5], 'B': [10,20,30,20,40,40,50,60]})
de

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30
3,4,20
4,4,40
5,4,40
6,4,50
7,5,60


In [420]:
de['A'].mode()

Unnamed: 0,A
0,4


In [421]:
de['B'].mode()

Unnamed: 0,B
0,20
1,40


5. Variance

In [422]:
df.var()

Unnamed: 0,0
one,1.666667
two,166.6667
three,16666.67
four,1666667.0


6. Min

In [423]:
df.min()

Unnamed: 0,0
one,1
two,10
three,100
four,1000


7. Max

In [424]:
df.max()

Unnamed: 0,0
one,4
two,40
three,400
four,4000


8. Standard Deviation

In [425]:
df.std()

Unnamed: 0,0
one,1.290994
two,12.909944
three,129.099445
four,1290.994449


#### j) Describe Function

In [426]:
data = {'one'  : pd.Series([1,2,3,4]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four' : pd.Series([1000,2000,3000,4000]),
        'five' : pd.Series(['A','B','C','D']) }
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four,five
0,1,10,100,1000,A
1,2,20,200,2000,B
2,3,30,300,3000,C
3,4,40,400,4000,D


In [427]:
df.describe()

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,2.5,25.0,250.0,2500.0
std,1.290994,12.909944,129.099445,1290.994449
min,1.0,10.0,100.0,1000.0
25%,1.75,17.5,175.0,1750.0
50%,2.5,25.0,250.0,2500.0
75%,3.25,32.5,325.0,3250.0
max,4.0,40.0,400.0,4000.0


#### k) Pipe Functions

The `pipe()` method in Pandas DataFrame allows you to apply a function to the DataFrame, similar to the way the `apply()` method works. The difference is that `pipe()` passes the whole DataFrame as the first argument of the function, which lets you chain multiple operations together by passing the output of one function to the input of the next function.

In [428]:
data = {'one'  : pd.Series([1,2,3,4]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four' : pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [429]:
def add_(i, j):
  """Return ``i + j``.

  Intended for use with ``pipe``: ``df.pipe(add_, 10)`` calls
  ``add_(df, 10)``, adding 10 to every value of ``df``.
  """
  total = i + j
  return total

def sub_(i, j):
  """Return ``i - j`` (the counterpart of ``add_``)."""
  difference = i - j
  return difference

`Example 1`

In [430]:
df.pipe(add_, 10)

Unnamed: 0,one,two,three,four
0,11,20,110,1010
1,12,30,210,2010
2,13,40,310,3010
3,14,50,410,4010


`Example 2`

In [431]:
def mean_(col):
  """Return ``col.mean()``.

  ``col`` can be any object with a ``mean()`` method. When called through
  ``df.pipe(mean_)`` it receives the whole DataFrame and the result is the
  per-column mean.
  """
  average = col.mean()
  return average

def square_(i):
  """Return ``i`` raised to the power of two (element-wise for pandas objects)."""
  squared = i ** 2
  return squared

df.pipe(mean_)

Unnamed: 0,0
one,2.5
two,25.0
three,250.0
four,2500.0


In [432]:
df.pipe(mean_).pipe(square_)   #finding the mean square

Unnamed: 0,0
one,6.25
two,625.0
three,62500.0
four,6250000.0


#### j) `apply()` function

The `apply()` method in Pandas DataFrame allows you to apply a function along an axis of the DataFrame (column by column by default). The function can be a built-in Python function, a NumPy function, or a user-defined function.

In [433]:
data = {'one'  : pd.Series([1,2,3,4]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four' : pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [434]:
print(df.apply(np.mean))

print('-'* 50)

print(df.apply(np.max))

one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64
--------------------------------------------------
one         4
two        40
three     400
four     4000
dtype: int64


In [435]:
df.apply(lambda x: x.max() - x.min())

Unnamed: 0,0
one,3
two,30
three,300
four,3000


#### 3. `map()` Function

It is used on a Series to transform each value one by one.

**Key points:**

* Works element-wise (one value at a time)

* Can take a function, dict, or Series as the mapping

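* `Series.map()` works on a single Series; to map element-wise over a whole DataFrame use `DataFrame.map()` (the replacement for `applymap()`, shown in the next section)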

In [436]:
s = pd.Series([1, 2, 3])
s.map(lambda x: x**2)

Unnamed: 0,0
0,1
1,4
2,9


#### 4. `applymap()` function

***`applymap()` is deprecated; use the `DataFrame.map()` method instead.***

In [437]:
# Deliberately calls the deprecated API to show that it still works; pandas
# emits a deprecation warning for this call (echoed in the output below).
df.applymap(lambda x : x*100)

  df.applymap(lambda x : x*100)


Unnamed: 0,one,two,three,four
0,100,1000,10000,100000
1,200,2000,20000,200000
2,300,3000,30000,300000
3,400,4000,40000,400000


In [438]:
df.map(lambda x : x*100)

Unnamed: 0,one,two,three,four
0,100,1000,10000,100000
1,200,2000,20000,200000
2,300,3000,30000,300000
3,400,4000,40000,400000


In [439]:
# np.float128 is only defined on platforms whose C long double is 128 bits
# wide, so referencing it raises AttributeError elsewhere. np.longdouble is
# the portable name for the same extended-precision type.
df.map(np.longdouble)

Unnamed: 0,one,two,three,four
0,1.0,10.0,100.0,1000.0
1,2.0,20.0,200.0,2000.0
2,3.0,30.0,300.0,3000.0
3,4.0,40.0,400.0,4000.0


#### l) Reindex Function

`reindex()` changes the row labels (index) and/or column labels of a DataFrame or Series to a new set of labels.

Key points:

* New labels not in old → filled with NaN

* Old labels not in new → dropped

* Useful for aligning data to a specific structure

In [440]:
data = {'one'  : pd.Series([1,2,3,4]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four' : pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [441]:
df.reindex([0,1,2,3,4])

Unnamed: 0,one,two,three,four
0,1.0,10.0,100.0,1000.0
1,2.0,20.0,200.0,2000.0
2,3.0,30.0,300.0,3000.0
3,4.0,40.0,400.0,4000.0
4,,,,


In [442]:
data = {'Name': ['John', 'Jane', 'Jin', 'Joan'],
        'Age' : [25,30,35,40],
        'City': ['New York', 'Paris', 'Berlin', 'London']}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Jane,30,Paris
2,Jin,35,Berlin
3,Joan,40,London


In [443]:
df.reindex(columns = ['City', 'Name', 'Age'])

Unnamed: 0,City,Name,Age
0,New York,John,25
1,Paris,Jane,30
2,Berlin,Jin,35
3,London,Joan,40


In [444]:
df.reindex([0,1,2,3,4],columns = ['City', 'Name', 'Age'])

Unnamed: 0,City,Name,Age
0,New York,John,25.0
1,Paris,Jane,30.0
2,Berlin,Jin,35.0
3,London,Joan,40.0
4,,,


#### m) Renaming the columns in Pandas DataFrame

In [445]:
data = {'one'  : pd.Series([1,2,3,4]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,300,400]),
        'four' : pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


In [446]:
df.rename(columns = {'one': 'A', 'two': 'B', 'three': 'C', 'four': 'D'})

Unnamed: 0,A,B,C,D
0,1,10,100,1000
1,2,20,200,2000
2,3,30,300,3000
3,4,40,400,4000


#### n) Sorting in Pandas

In [447]:
data = {'one'  : pd.Series([41,51,31,11]),
        'two'  : pd.Series([10,20,30,40]),
        'three': pd.Series([100,200,500,400]),
        'four' : pd.Series([1000,2000,3000,4000])}
df = pd.DataFrame(data)
df

Unnamed: 0,one,two,three,four
0,41,10,100,1000
1,51,20,200,2000
2,31,30,500,3000
3,11,40,400,4000


Sorting by a specific column


In [448]:
df.sort_values(by = 'one')

Unnamed: 0,one,two,three,four
3,11,40,400,4000
2,31,30,500,3000
0,41,10,100,1000
1,51,20,200,2000


Sorting in a specific order

In [449]:
df.sort_values(by = 'one', ascending = False)

Unnamed: 0,one,two,three,four
1,51,20,200,2000
0,41,10,100,1000
2,31,30,500,3000
3,11,40,400,4000


Sorting in a specific order based on multiple columns



In [450]:
df.sort_values(by = ['one', 'three'])

Unnamed: 0,one,two,three,four
3,11,40,400,4000
2,31,30,500,3000
0,41,10,100,1000
1,51,20,200,2000


Sort with specific Sorting Algorithm:

* quicksort
* mergesort
* heapsort

In [451]:
df.sort_values(by = ['one'], kind = 'heapsort')

Unnamed: 0,one,two,three,four
3,11,40,400,4000
2,31,30,500,3000
0,41,10,100,1000
1,51,20,200,2000


#### o) Groupby Functions

In Pandas, `groupby()` groups your data by one or more columns (or index levels) so you can run calculations within each group separately.

In [452]:
cricket = {'Team':   ['India', 'India', 'Australia', 'Australia', 'SA', 'SA', 'SA', 'SA', 'NZ', 'NZ', 'NZ', 'India'],
           'Rank':   [2,3,1,2,3,4,1,1,2,4,1,2],
           'Year':   [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
           'Points': [876,801,891,815,776,784,834,824,758,691,883,782]}
df = pd.DataFrame(cricket)
df

Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
2,Australia,1,2014,891
3,Australia,2,2015,815
4,SA,3,2014,776
5,SA,4,2015,784
6,SA,1,2016,834
7,SA,1,2017,824
8,NZ,2,2016,758
9,NZ,4,2014,691


In [453]:
df.groupby('Team').groups

{'Australia': [2, 3], 'India': [0, 1, 11], 'NZ': [8, 9, 10], 'SA': [4, 5, 6, 7]}

To search for a specific country in a specific year

In [454]:
df.groupby(['Team', 'Year']).get_group(('Australia',2014))

Unnamed: 0,Team,Rank,Year,Points
2,Australia,1,2014,891


Adding statistical computation on top of groupby

In [455]:
df.groupby('Team')['Points'].sum()

Unnamed: 0_level_0,Points
Team,Unnamed: 1_level_1
Australia,1706
India,2459
NZ,2332
SA,3218


In [456]:
# Same result as the previous cell, but here sum() aggregates every column
# of each group before 'Points' is selected.
df.groupby('Team').sum()['Points']

Unnamed: 0_level_0,Points
Team,Unnamed: 1_level_1
Australia,1706
India,2459
NZ,2332
SA,3218


In [457]:
# Select 'Points' before aggregating so that only this column is summed, then
# rank the teams from the highest to the lowest total.
df.groupby('Team')['Points'].sum().sort_values(ascending = False)

Unnamed: 0_level_0,Points
Team,Unnamed: 1_level_1
SA,3218
India,2459
NZ,2332
Australia,1706


Checking multiple stats for points team wise

In [458]:
groups = df.groupby('Team')

groups['Points'].agg(['sum', 'mean', 'max', 'min', 'std', 'median'])

Unnamed: 0_level_0,sum,mean,max,min,std,median
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Australia,1706,853.0,891,815,53.740115,853.0
India,2459,819.666667,876,782,49.702448,801.0
NZ,2332,777.333333,883,691,97.449132,758.0
SA,3218,804.5,834,776,28.769196,804.0


filter function along with groupby

In [459]:
# filter() keeps or drops whole groups: only teams with exactly three rows
# survive, and their rows come back with their original index labels.
df.groupby('Team').filter(lambda x: len(x) == 3)

Unnamed: 0,Team,Rank,Year,Points
0,India,2,2014,876
1,India,3,2015,801
8,NZ,2,2016,758
9,NZ,4,2014,691
10,NZ,1,2015,883
11,India,2,2017,782


# 3. Working with csv Files and Basic data Analysis Using Pandas

#### a) Reading csv

In [460]:
# NOTE(review): absolute Google Drive path as seen from Colab — presumably
# Drive must be mounted before this cell runs; confirm, and consider a
# configurable data directory so the notebook also runs elsewhere.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Pandas Practice/data/Football.csv')
df

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.40,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,Netherlands,Eredivisie,(UTR),Gyrano Kerk,24,0,2155,10,7.49,0.33,50,18,2.20,0.79,2020
656,Netherlands,Eredivisie,(AJA),Quincy Promes,18,2,1573,12,9.77,0.59,56,30,3.38,1.81,2020
657,Netherlands,Eredivisie,(PSV),Denzel Dumfries,25,0,2363,7,5.72,0.23,45,14,1.81,0.56,2020
658,Netherlands,Eredivisie,,Cyriel Dessers,26,0,2461,15,14.51,0.56,84,43,3.24,1.66,2020


In [461]:
df.head()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016
3,Spain,La Liga,(CAR),Ruben Castro,32,3,2842,13,14.06,0.47,117,42,3.91,1.4,2016
4,Spain,La Liga,(VAL),Kevin Gameiro,21,10,1745,13,10.65,0.58,50,23,2.72,1.25,2016


#### b) Pandas info function

In [462]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Country                  660 non-null    object 
 1   League                   660 non-null    object 
 2   Club                     626 non-null    object 
 3   Player Names             660 non-null    object 
 4   Matches_Played           660 non-null    int64  
 5   Substitution             660 non-null    int64  
 6   Mins                     660 non-null    int64  
 7   Goals                    660 non-null    int64  
 8   xG                       660 non-null    float64
 9   xG Per Avg Match         660 non-null    float64
 10  Shots                    660 non-null    int64  
 11  OnTarget                 660 non-null    int64  
 12  Shots Per Avg Match      660 non-null    float64
 13  On Target Per Avg Match  660 non-null    float64
 14  Year                     6

In [463]:
df.isnull()

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
656,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
657,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
658,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


In [464]:
df.isnull().sum()

Unnamed: 0,0
Country,0
League,0
Club,34
Player Names,0
Matches_Played,0
Substitution,0
Mins,0
Goals,0
xG,0
xG Per Avg Match,0


#### d) Quantile function to get the specific percentile value

In [465]:
df.describe()

Unnamed: 0,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,22.371212,3.224242,2071.416667,11.810606,10.089606,0.476167,64.177273,28.365152,2.948015,1.315652,2018.363636
std,9.754658,3.839498,900.595049,6.075315,5.724844,0.192831,34.941622,16.363149,0.914906,0.474239,1.3677
min,2.0,0.0,264.0,2.0,0.71,0.07,5.0,2.0,0.8,0.24,2016.0
25%,14.0,0.0,1363.5,8.0,6.1,0.34,37.75,17.0,2.335,0.98,2017.0
50%,24.0,2.0,2245.5,11.0,9.285,0.435,62.0,26.0,2.845,1.25,2019.0
75%,31.0,5.0,2822.0,14.0,13.2525,0.57,86.0,37.0,3.3825,1.54,2019.0
max,38.0,26.0,4177.0,42.0,32.54,1.35,208.0,102.0,7.2,3.63,2020.0


In [466]:
# `percentiles` replaces the default 25%/50%/75% set; the 50% row (median) is
# still reported alongside the requested 80%.
df.describe(percentiles = [.80])

Unnamed: 0,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
count,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0,660.0
mean,22.371212,3.224242,2071.416667,11.810606,10.089606,0.476167,64.177273,28.365152,2.948015,1.315652,2018.363636
std,9.754658,3.839498,900.595049,6.075315,5.724844,0.192831,34.941622,16.363149,0.914906,0.474239,1.3677
min,2.0,0.0,264.0,2.0,0.71,0.07,5.0,2.0,0.8,0.24,2016.0
50%,24.0,2.0,2245.5,11.0,9.285,0.435,62.0,26.0,2.845,1.25,2019.0
80%,32.0,6.0,2915.8,15.0,14.076,0.61,90.0,39.0,3.6,1.63,2020.0
max,38.0,26.0,4177.0,42.0,32.54,1.35,208.0,102.0,7.2,3.63,2020.0


**Lets use the quantile function to get the exact value**

In [467]:
print(df['Mins'].quantile(.80))

2915.8


#### **e) Copy Function**

In [468]:
# copy() returns a separate DataFrame, so changes made to `de` afterwards
# (such as adding a column) are not reflected in `df`.
de = df.copy()
df.head(3)

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016


In [469]:
# Add a derived column to the copy only; the new column is Year shifted by 100.
de['Year + 100'] = de['Year'].add(100)
de.head(n = 3)

Unnamed: 0,Country,League,Club,Player Names,Matches_Played,Substitution,Mins,Goals,xG,xG Per Avg Match,Shots,OnTarget,Shots Per Avg Match,On Target Per Avg Match,Year,Year + 100
0,Spain,La Liga,(BET),Juanmi Callejon,19,16,1849,11,6.62,0.34,48,20,2.47,1.03,2016,2116
1,Spain,La Liga,(BAR),Antoine Griezmann,36,0,3129,16,11.86,0.36,88,41,2.67,1.24,2016,2116
2,Spain,La Liga,(ATL),Luis Suarez,34,1,2940,28,23.21,0.75,120,57,3.88,1.84,2016,2116


#### f) Value Counts function

In [470]:
# Number of rows per player name, most frequent first.
df['Player Names'].value_counts()

Unnamed: 0_level_0,count
Player Names,Unnamed: 1_level_1
Lionel Messi,5
Luis Suarez,5
Fabio Quagliarella,5
Andrea Belotti,5
Robert Lewandowski,5
...,...
Robson,1
Renato Kayzer,1
Donny van de Beek,1
Teun Koopmeiners,1


#### g) Unique and Nunique Function

In [471]:
# Distinct player names as a NumPy array, in order of first appearance.
# NOTE(review): some names carry trailing whitespace (e.g. 'Neymar ', 'Aduriz ');
# consider stripping them before counting distinct players.
df['Player Names'].unique()

array(['Juanmi Callejon', 'Antoine Griezmann', 'Luis Suarez',
       'Ruben Castro', 'Kevin Gameiro', 'Cristiano Ronaldo',
       'Karim Benzema', 'Neymar ', 'Iago Aspas', 'Sergi Enrich',
       'Aduriz ', 'Sandro Ramlrez', 'Lionel Messi', 'Gerard Moreno',
       'Morata', 'Wissam Ben Yedder', 'Willian Jose', 'Andone ',
       'Cedric Bakambu', 'Isco', 'Mohamed Salah', 'Gregoire Defrel',
       'Ciro Immobile', 'Nikola Kalinic', 'Dries Mertens',
       'Alejandro Gomez', 'Jose CallejOn', 'Iago Falque',
       'Giovanni Simeone', 'Mauro Icardi', 'Diego Falcinelli',
       'Cyril Thereau', 'Edin Dzeko', 'Lorenzo Insigne',
       'Fabio Quagliarella', 'Borriello ', 'Carlos Bacca',
       'Gonzalo Higuain', 'Keita Balde', 'Andrea Belotti', 'Fin Bartels',
       'Lars Stindl', 'Serge Gnabry', 'Wagner ', 'Andrej Kramaric',
       'Florian Niederlechner', 'Robert Lewandowski', 'Emil Forsberg',
       'Timo Werner', 'Nils Petersen', 'Vedad Ibisevic', 'Mario Gomez',
       'Maximilian Philipp',

In [472]:
# Number of distinct player names.
df['Player Names'].nunique()

444

#### h) `dropna()` function

The `dropna()` function drops the rows (`axis = 0`) or the columns (`axis = 1`) of a Pandas DataFrame that contain NaN values. Use it after analysing where the null values are.

**Syntax :** `DataFrameName.dropna(axis = 0, inplace = False)`

In [473]:
# Google Play Store apps dataset (CSV hosted on GitHub).
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/3.%20Data%20Preprocessing%20-%20Removing%20Null%20Value%20Rows/googleplaystore.csv'

# NOTE: this rebinds `df`; from here on it holds the Play Store data.
df = pd.read_csv(filepath_or_buffer = link)

df.head(n = 5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [474]:
# Count missing values per column to see where the NaNs are.
df.isnull().sum()

Unnamed: 0,0
App,0
Category,0
Rating,1474
Reviews,0
Size,0
Installs,0
Type,1
Price,0
Content Rating,1
Genres,0


In [475]:
# Drop every row that has at least one NaN. dropna() returns a new DataFrame,
# so the result gets its own name instead of mutating `df` in place
# (inplace=True returns None and makes the cell non-repeatable).
df_rows_dropped = df.dropna(axis = 0)
df_rows_dropped.shape

In [476]:
# Drop every column that has at least one NaN. The result is bound to a new
# name rather than using inplace=True, so `df` keeps all of its columns.
df_cols_dropped = df.dropna(axis = 1)
df_cols_dropped.shape

In [477]:
# Reload the raw Play Store data so the null-replacement examples below start
# from an unmodified frame. `link` is the URL defined in the initial load cell;
# it is reused here instead of repeating the literal.
df = pd.read_csv(link)

df.isnull().sum()

Unnamed: 0,0
App,0
Category,0
Rating,1474
Reviews,0
Size,0
Installs,0
Type,1
Price,0
Content Rating,1
Genres,0


**Replacing the null values**

##### **`Numeric Columns`**

To replace the null values in a numeric column we:
* first find the mean of that particular column,
* then round the value off to two decimals,
* and finally put this value in place of the nulls.

In [478]:
# Mean rating rounded to two decimals (missing ratings are skipped by mean()).
rating_mean = df['Rating'].mean()
print(round(rating_mean, 2))

4.19


In [479]:
# Impute missing ratings with the column mean, rounded to two decimals.
mean_rating = df['Rating'].mean()
miss = round(mean_rating, 2)
df['Rating'] = df['Rating'].fillna(value = miss)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


##### **`Categorical Columns`**



In [480]:
# Re-check missing values per column now that 'Rating' has been imputed.
df.isnull().sum()

Unnamed: 0,0
App,0
Category,0
Rating,0
Reviews,0
Size,0
Installs,0
Type,1
Price,0
Content Rating,1
Genres,0


In [486]:
# Fill missing app versions with the placeholder the dataset already uses
# ('Varies with device'), so the fill does not create a separate one-off category.
df['Current Ver'] = df['Current Ver'].fillna('Varies with device')

In [487]:
# Per-column count of missing values after filling 'Current Ver'.
# NOTE(review): 'Type' and 'Content Rating' still show one missing value each
# in the output below; they are not handled in this section.
df.isnull().sum()

Unnamed: 0,0
App,0
Category,0
Rating,0
Reviews,0
Size,0
Installs,0
Type,1
Price,0
Content Rating,1
Genres,0


#### j) sample function

The `sample()` function is used to generate a random sample of rows or columns from the DataFrame.

**Syntax:** `DataFrame.sample(n = None, frac = None, replace = False, weights = None, random_state = None, axis = None, ignore_index = False)`