In [2]:
#pandas piplelines 

# ------------------------------------------------------------------------------
# Pandas Series Overview
# ------------------------------------------------------------------------------
# A Pandas Series is a one-dimensional labeled array capable of holding any data type.
# It is similar to a column in a spreadsheet or database table.
#
# Example:
# import pandas as pd
# s = pd.Series([100, 200, 300], index=['a', 'b', 'c'])
#
# Features:
# - Labeled Indexing:
#     You can access elements by custom labels (e.g., s['b'] gives 200).
# - Vectorized Operations:
#     Supports operations like s + 10 or s * 2 applied to each element.
# - Data Types:
#     Can hold integers, floats, strings, or even Python objects.
# - Dictionary Input:
#     Series can be created from a dictionary, where keys become index labels.
#     Example: pd.Series({'a': 10, 'b': 20})
#
# Use Cases:
# - When you need a single labeled column of data.
# - Acts as a building block for DataFrames (which are 2D structures).
# ------------------------------------------------------------------------------



In [3]:
#Creating Series using list

import pandas as pd

pd.Series(['Male','Female','Male','Male','Female','Female','Female'])


0      Male
1    Female
2      Male
3      Male
4    Female
5    Female
6    Female
dtype: object

In [4]:
#creating list using array
import numpy as np
ser=pd.Series(np.array(['Male','Female','Male','Male','Female','Female','Female']))
print(type(ser))
print(ser)

<class 'pandas.core.series.Series'>
0      Male
1    Female
2      Male
3      Male
4    Female
5    Female
6    Female
dtype: object


In [5]:
#creating series using dictionary

#key is used as the index and value=value
dict_data={'a':1,'b':2,'c':3}
ser1=pd.Series(dict_data)
print(ser1)
print()
print(ser1.index)
print(ser1.values)

a    1
b    2
c    3
dtype: int64

Index(['a', 'b', 'c'], dtype='object')
[1 2 3]


<h1>Dataframe

In [6]:
# ------------------------------------------------------------------------------
# Pandas DataFrame Overview
# ------------------------------------------------------------------------------
# A Pandas DataFrame is a 2-dimensional, labeled data structure similar to a table,
# spreadsheet, or SQL result set.
#
# Structure:
# - It consists of rows and columns.
# - Each column is essentially a Pandas Series.
# - Columns can hold different data types (int, float, string, etc.).
#
# Example:
# import pandas as pd
# data = {
#     'Name': ['Alice', 'Bob', 'Charlie'],
#     'Age': [25, 30, 22],
#     'City': ['New York', 'London', 'Paris']
# }
# df = pd.DataFrame(data)
#
# Common Operations:
# - df['Age']           -> Access a column (returns a Series)
# - df.loc[1]           -> Access row by label
# - df.iloc[1]          -> Access row by position
# - df['Age'].mean()    -> Calculate average of Age column
#
# Use Cases:
# - Storing and analyzing structured data
# - Importing/exporting data from CSV, Excel, SQL, etc.
# - Performing data cleaning, filtering, aggregation, and visualization
# ------------------------------------------------------------------------------


In [7]:
df=pd.DataFrame({'Name':["alyster","alvin","alex","tanay"],
              'Age':[23,24,25,26],
              'Sex': ["MALE","MALE","MALE","MALE"]})
print(df)

      Name  Age   Sex
0  alyster   23  MALE
1    alvin   24  MALE
2     alex   25  MALE
3    tanay   26  MALE


In [8]:
#extracting single column from the dataframe(i.e printing series)
print(df['Name'])
print()
print(df['Name'][2])

0    alyster
1      alvin
2       alex
3      tanay
Name: Name, dtype: object

alex


In [9]:
#dataframe when column name is not specified
#since column name is not specified , it apprears as Range Index object
pd.DataFrame([[123,456,789],[213,565,852],['akl','bsg','cfg']])

Unnamed: 0,0,1,2
0,123,456,789
1,213,565,852
2,akl,bsg,cfg


#series operations

In [10]:
eyes=pd.Series({'hazel':200,'blue':45,'green':789})
print(eyes)


hazel    200
blue      45
green    789
dtype: int64


In [11]:
#attributes of Series object
print(eyes.index)
eyes.name='eye_color'
print(eyes.name)


print(eyes.values)

print()
print(eyes.sort_values)
print(eyes.sort_index)

print(eyes.unique())
print(eyes.nunique())


Index(['hazel', 'blue', 'green'], dtype='object')
eye_color
[200  45 789]

<bound method Series.sort_values of hazel    200
blue      45
green    789
Name: eye_color, dtype: int64>
<bound method Series.sort_index of hazel    200
blue      45
green    789
Name: eye_color, dtype: int64>
[200  45 789]
3


In [12]:
#duplicate values

my_data2=[220,215,93,64,64]
eye2=pd.Series(data=my_data2,index=['brown','blue','blue','hazel','green'])
print(eye2)

brown    220
blue     215
blue      93
hazel     64
green     64
dtype: int64


In [13]:
eye2.unique()

array([220, 215,  93,  64], dtype=int64)

In [14]:
eye2.value_counts()

64     2
220    1
215    1
93     1
Name: count, dtype: int64

In [15]:
eye2.index

Index(['brown', 'blue', 'blue', 'hazel', 'green'], dtype='object')

In [16]:
eye2.nunique()

4

In [17]:
#Series can be indexed and sliced like numpy arrays
ser3=pd.Series([1,5,9,8,7,4,6,3,5,7,8],index=['a','b','c','d','e','f','g','h','i','j','k'])


print(ser3['d']) #value at index d
print()
print(ser3[1:4]) #values from index 1 to 4

8

b    5
c    9
d    8
dtype: int64


In [18]:
#Series Operations:
# - Arithmetic operations (addition, subtraction, etc.) are vectorized.
# - You can perform operations on Series with different lengths.
import pandas as pd

# Create a Series with custom index
s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])

# Display original series
print("Original Series:")
print(s)

# Accessing elements
print("\nElement at index 'c':", s['c'])

# Vectorized arithmetic operation (add 5 to each element)
print("\nSeries after adding 5:")
print(s + 5)

# Filtering elements greater than 25
print("\nElements greater than 25:")
print(s[s > 25])




Original Series:
a    10
b    20
c    30
d    40
e    50
dtype: int64

Element at index 'c': 30

Series after adding 5:
a    15
b    25
c    35
d    45
e    55
dtype: int64

Elements greater than 25:
c    30
d    40
e    50
dtype: int64


In [19]:
import pandas as pd

# ------------------------------------------------------------------------------
# 🧮 Series Arithmetic - Matching Indexes
# ------------------------------------------------------------------------------

# Two Series with exactly matching indexes
s1_match = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2_match = pd.Series([1, 2, 3], index=['a', 'b', 'c'])



# Arithmetic operations
print("\nAddition:")
print(s1_match + s2_match)

print("\nSubtraction:")
print(s1_match - s2_match)

print("\nMultiplication:")
print(s1_match * s2_match)

print("\nDivision:")
print(s1_match / s2_match)


# ------------------------------------------------------------------------------
# ⚠️ Series Arithmetic - Non-Matching Indexes
# ------------------------------------------------------------------------------

# Two Series with different indexes (partial overlap)
s1_mismatch = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2_mismatch = pd.Series([1, 2, 3], index=['b', 'c', 'd'])



# Arithmetic operations result in NaN for unmatched indexes
print("\nAddition:")
print(s1_mismatch + s2_mismatch)

print("\nSubtraction:")
print(s1_mismatch - s2_mismatch)

print("\nMultiplication:")
print(s1_mismatch * s2_mismatch)

print("\nDivision:")
print(s1_mismatch / s2_mismatch)

# Using fill_value to handle NaNs by treating missing as 0
print("\nAddition with fill_value=0:")
print(s1_mismatch.add(s2_mismatch, fill_value=0))



Addition:
a    11
b    22
c    33
dtype: int64

Subtraction:
a     9
b    18
c    27
dtype: int64

Multiplication:
a    10
b    40
c    90
dtype: int64

Division:
a    10.0
b    10.0
c    10.0
dtype: float64

Addition:
a     NaN
b    21.0
c    32.0
d     NaN
dtype: float64

Subtraction:
a     NaN
b    19.0
c    28.0
d     NaN
dtype: float64

Multiplication:
a     NaN
b    20.0
c    60.0
d     NaN
dtype: float64

Division:
a     NaN
b    20.0
c    15.0
d     NaN
dtype: float64

Addition with fill_value=0:
a    10.0
b    21.0
c    32.0
d     3.0
dtype: float64


In [20]:
# ------------------------------------------------------------------------------
# Explanation of Statistical Functions in Pandas Series
# ------------------------------------------------------------------------------

# sum(): 
#   Returns the sum of all elements in the Series.

# mean(): 
#   Returns the average (mean) of the elements in the Series.

# median(): 
#   Returns the median (middle value when sorted) of the elements in the Series.

# std(): 
#   Returns the standard deviation, which shows how spread out the values are in the Series.

# var(): 
#   Returns the variance, which is the square of the standard deviation of the Series.

# min(): 
#   Returns the smallest element in the Series.

# max(): 
#   Returns the largest element in the Series.

# count(): 
#   Returns the number of non-null elements in the Series.

# describe(): 
#   Provides a summary of the Series with key statistics:
#   - count: Number of non-null elements
#   - mean: Average of the elements
#   - std: Standard deviation
#   - min: Minimum value
#   - 25%: 25th percentile
#   - 50%: Median (50th percentile)
#   - 75%: 75th percentile
#   - max: Maximum value
# ------------------------------------------------------------------------------



In [21]:
import pandas as pd

# ------------------------------------------------------------------------------
# Creating a Series for Statistical Operations
# ------------------------------------------------------------------------------

# Create a simple Series
s = pd.Series([10, 20, 30, 40, 50])

print("Series:")
print(s)

# ------------------------------------------------------------------------------
# Statistical Operations on Series
# ------------------------------------------------------------------------------

# Sum of the elements
sum_val = s.sum()
print("\nSum of Series:", sum_val)

# Mean (average) of the elements
mean_val = s.mean()
print("Mean of Series:", mean_val)

# Median of the elements
median_val = s.median()
print("Median of Series:", median_val)

# Standard deviation of the elements
std_dev = s.std()
print("Standard Deviation of Series:", std_dev)

# Variance of the elements
variance_val = s.var()
print("Variance of Series:", variance_val)

# Minimum value in the Series
min_val = s.min()
print("Minimum value of Series:", min_val)

# Maximum value in the Series
max_val = s.max()
print("Maximum value of Series:", max_val)

# Count of non-null elements
count_val = s.count()
print("Count of non-null elements in Series:", count_val)

# Description of basic statistical properties
print("\nDescription of Series:")
print(s.describe())


Series:
0    10
1    20
2    30
3    40
4    50
dtype: int64

Sum of Series: 150
Mean of Series: 30.0
Median of Series: 30.0
Standard Deviation of Series: 15.811388300841896
Variance of Series: 250.0
Minimum value of Series: 10
Maximum value of Series: 50
Count of non-null elements in Series: 5

Description of Series:
count     5.000000
mean     30.000000
std      15.811388
min      10.000000
25%      20.000000
50%      30.000000
75%      40.000000
max      50.000000
dtype: float64


<h1>Lambda Function in Series

In [22]:
import pandas as pd

# Create a Series
s = pd.Series([10, 20, 30, 40, 50])

# Apply a lambda function to each element: multiply each value by 2
result = s.apply(lambda x: x * 2)

print("Original Series:")
print(s)

print("\nSeries after applying lambda function (x * 2):")
print(result)


Original Series:
0    10
1    20
2    30
3    40
4    50
dtype: int64

Series after applying lambda function (x * 2):
0     20
1     40
2     60
3     80
4    100
dtype: int64


In [23]:
# Create a Series
s = pd.Series([10, 20, 30, 40, 50])

# Apply a lambda function to categorize the numbers
result = s.apply(lambda x: 'High' if x > 30 else 'Low')

print("\nSeries with conditional categories (High/Low):")
print(result)



Series with conditional categories (High/Low):
0     Low
1     Low
2     Low
3    High
4    High
dtype: object


In [24]:
# Create a Series
s = pd.Series([10, 20, 30, 40, 50])

# Map function with a lambda that adds 5 to each element
result = s.map(lambda x: x + 5)

print("\nSeries after using map with lambda (x + 5):")
print(result)



Series after using map with lambda (x + 5):
0    15
1    25
2    35
3    45
4    55
dtype: int64


Reading CSV

In [25]:
iris = pd.read_csv('C:/Users/bened/Documents/Alyster Coding/CODING/AIML/Datasets/iris/Iris.csv')

In [26]:
type(iris)

pandas.core.frame.DataFrame

In [27]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [28]:
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [29]:
iris.tail()
#last 5 values of the dataset

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [30]:
iris.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [31]:
iris.index

RangeIndex(start=0, stop=150, step=1)

In [32]:
iris.columns = ['ID','Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

In [33]:
iris.columns

Index(['ID', 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width',
       'Species'],
      dtype='object')

In [34]:
iris.Sepal_Length

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: Sepal_Length, Length: 150, dtype: float64

In [35]:
iris[['Petal_Length', 'Petal_Width']]

Unnamed: 0,Petal_Length,Petal_Width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


In [36]:
import pandas as pd

# Original Series with default index
s = pd.Series([100, 200, 300], index=['a', 'b', 'c'])
print("Original Series:")
print(s)

# Change index names
s.index = ['x', 'y', 'z']
print("\nSeries with Changed Index Names:")
print(s)


Original Series:
a    100
b    200
c    300
dtype: int64

Series with Changed Index Names:
x    100
y    200
z    300
dtype: int64


In [37]:
#inplace is used to change the original series else it will create a new series

s.rename(index={'x': 'first', 'y': 'second', 'z': 'third'}, inplace=True)
print("\nSeries with Renamed Index:")
print(s)


Series with Renamed Index:
first     100
second    200
third     300
dtype: int64


In [38]:
import pandas as pd

# Create a simple DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})

print("Original DataFrame:")
print(df)

# Rename columns
df.rename(columns={'A': 'Column1', 'B': 'Column2'}, inplace=True)
print("\nDataFrame with Renamed Columns:")
print(df)


Original DataFrame:
   A  B
0  1  4
1  2  5
2  3  6

DataFrame with Renamed Columns:
   Column1  Column2
0        1        4
1        2        5
2        3        6


In [39]:
exam_data={'math':[90,80,70],'eng':[60,70,80],'science':[50,60,70],'music':[40,50,60]}
exam_data

{'math': [90, 80, 70],
 'eng': [60, 70, 80],
 'science': [50, 60, 70],
 'music': [40, 50, 60]}

In [40]:
df=pd.DataFrame(exam_data,index=['student1','student2','student3'])
df

Unnamed: 0,math,eng,science,music
student1,90,60,50,40
student2,80,70,60,50
student3,70,80,70,60


In [41]:
df2=df[:] #copying the dataframe to another variable

df2.drop('student2',inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop('student2',inplace=True)


In [42]:
df2

Unnamed: 0,math,eng,science,music
student1,90,60,50,40
student3,70,80,70,60


In [43]:
df #original reamins the same

Unnamed: 0,math,eng,science,music
student1,90,60,50,40
student2,80,70,60,50
student3,70,80,70,60


In [44]:
df3=df[:]
df3.drop('math',axis=1,inplace=True) #axis=1 means column
df3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop('math',axis=1,inplace=True) #axis=1 means column


Unnamed: 0,eng,science,music
student1,60,50,40
student2,70,60,50
student3,80,70,60


In [45]:
df #original remains the same

Unnamed: 0,math,eng,science,music
student1,90,60,50,40
student2,80,70,60,50
student3,70,80,70,60


In [46]:
#use the loc method to access a row by label
#use the iloc method to access a row by position
#loc is used to access a group of rows and columns by labels or a boolean array
#iloc is used to access a group of rows and columns by integer position(s)
label1=df.loc['student1']
position1=df.iloc[0]

In [47]:
label1

math       90
eng        60
science    50
music      40
Name: student1, dtype: int64

In [48]:
position1

math       90
eng        60
science    50
music      40
Name: student1, dtype: int64

In [49]:
label2=df.loc[['student1','student2']] #accessing multiple rows

In [50]:
label2

Unnamed: 0,math,eng,science,music
student1,90,60,50,40
student2,80,70,60,50


In [51]:
position2=df.iloc[[0,1]]

In [52]:
position2

Unnamed: 0,math,eng,science,music
student1,90,60,50,40
student2,80,70,60,50


Selecting COlumns

In [53]:
english=df.eng #accessing a column
english

student1    60
student2    70
student3    80
Name: eng, dtype: int64

In [54]:
music_sci=df[['music','science']]#accessing multiple columns
music_sci

Unnamed: 0,music,science
student1,40,50
student2,50,60
student3,60,70


Adding Columns

In [55]:
df['kor']=[80,90,100]
df

Unnamed: 0,math,eng,science,music,kor
student1,90,60,50,40,80
student2,80,70,60,50,90
student3,70,80,70,60,100


Adding Rows

In [56]:
df.loc[3]=[38 , 98,56,45,78]
df


Unnamed: 0,math,eng,science,music,kor
student1,90,60,50,40,80
student2,80,70,60,50,90
student3,70,80,70,60,100
3,38,98,56,45,78


In [57]:
df.index=['student1','student2','student3','student4']
df

Unnamed: 0,math,eng,science,music,kor
student1,90,60,50,40,80
student2,80,70,60,50,90
student3,70,80,70,60,100
student4,38,98,56,45,78


Setting a column as an index

In [58]:
df['name']=['alvin','alex','tanay','alyster']
df.set_index('name',inplace=True)
df

Unnamed: 0_level_0,math,eng,science,music,kor
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alvin,90,60,50,40,80
alex,80,70,60,50,90
tanay,70,80,70,60,100
alyster,38,98,56,45,78


Changing the dataframe Element

In [59]:
df.iloc[0,1]=100
df

Unnamed: 0_level_0,math,eng,science,music,kor
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alvin,90,100,50,40,80
alex,80,70,60,50,90
tanay,70,80,70,60,100
alyster,38,98,56,45,78


In [60]:
df.loc['alvin','math']=100
df

Unnamed: 0_level_0,math,eng,science,music,kor
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alvin,100,100,50,40,80
alex,80,70,60,50,90
tanay,70,80,70,60,100
alyster,38,98,56,45,78


In [61]:
df.reset_index(inplace=True)
df

Unnamed: 0,name,math,eng,science,music,kor
0,alvin,100,100,50,40,80
1,alex,80,70,60,50,90
2,tanay,70,80,70,60,100
3,alyster,38,98,56,45,78


Transpose

In [62]:
dff=df.transpose()
dff

Unnamed: 0,0,1,2,3
name,alvin,alex,tanay,alyster
math,100,80,70,38
eng,100,70,80,98
science,50,60,70,56
music,40,50,60,45
kor,80,90,100,78


In [63]:
dfff=dff.T
dfff

Unnamed: 0,name,math,eng,science,music,kor
0,alvin,100,100,50,40,80
1,alex,80,70,60,50,90
2,tanay,70,80,70,60,100
3,alyster,38,98,56,45,78


Multiple Index

In [64]:
import pandas as pd

# Original DataFrame
dfm = pd.DataFrame({
    'name': ['alvin', 'alex', 'tanay', 'alyster'],
    'math': [100, 80, 70, 38],
    'eng': [100, 70, 80, 98],
    'science': [50, 60, 70, 56],
    'music': [40, 50, 60, 45],
    'kor': [80, 90, 100, 78],
    'age': [15, 16, 15, 17]
})

# Create a new DataFrame with MultiIndex
df_multi = dfm.set_index(['name', 'age'])

# Show the new DataFrame
print(df_multi)


             math  eng  science  music  kor
name    age                                
alvin   15    100  100       50     40   80
alex    16     80   70       60     50   90
tanay   15     70   80       70     60  100
alyster 17     38   98       56     45   78


Sort Index

In [65]:
df_multi.sort_index(inplace=True) #sort the index in ascending order
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,math,eng,science,music,kor
name,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
alex,16,80,70,60,50,90
alvin,15,100,100,50,40,80
alyster,17,38,98,56,45,78
tanay,15,70,80,70,60,100


Sorting wr to a particular column

In [66]:
df_multi.sort_index(level='age',ascending=True,inplace=True)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,math,eng,science,music,kor
name,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
alvin,15,100,100,50,40,80
tanay,15,70,80,70,60,100
alex,16,80,70,60,50,90
alyster,17,38,98,56,45,78


Descending order

In [67]:
df_multi.sort_index(level='age',ascending=False,inplace=True)
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,math,eng,science,music,kor
name,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
alyster,17,38,98,56,45,78
alex,16,80,70,60,50,90
tanay,15,70,80,70,60,100
alvin,15,100,100,50,40,80


<h1>Series Operations

series operations applies to every element of the series and returns the resultant

In [68]:
import numpy as np

In [69]:
student1=pd.Series({'kor':100,'eng':80,'math':90,'science':70,'music':60})
student1

kor        100
eng         80
math        90
science     70
music       60
dtype: int64

In [70]:
percentage=student1/200
percentage

kor        0.50
eng        0.40
math       0.45
science    0.35
music      0.30
dtype: float64

In [71]:
student2=pd.Series({'eng':80,'math':90,'science':70,'kor':100,'music':60})

addition=student1+student2
sub=student1-student2
mul=student1*student2
div=student1/student2

In [72]:
print(f'add {addition} \n sub {sub} \n mul {mul} \n div {div}')

add eng        160
kor        200
math       180
music      120
science    140
dtype: int64 
 sub eng        0
kor        0
math       0
music      0
science    0
dtype: int64 
 mul eng         6400
kor        10000
math        8100
music       3600
science     4900
dtype: int64 
 div eng        1.0
kor        1.0
math       1.0
music      1.0
science    1.0
dtype: float64


In [73]:
result=pd.DataFrame([addition,sub,mul,div],index=['addition','subtraction','multiplication','division'])
result
#combining the results of series into a dataframe

Unnamed: 0,eng,kor,math,music,science
addition,160.0,200.0,180.0,120.0,140.0
subtraction,0.0,0.0,0.0,0.0,0.0
multiplication,6400.0,10000.0,8100.0,3600.0,4900.0
division,1.0,1.0,1.0,1.0,1.0


with missing values

In [74]:
student3=pd.Series({'eng':80,'math':90,'science':70,'music':60})
addition=student1+student3
sub=student1-student3
mul=student1*student3
div=student1/student3
result3=pd.DataFrame([addition,sub,mul,div],index=['addition','subtraction','multiplication','division'])
result3

Unnamed: 0,eng,kor,math,music,science
addition,160.0,,180.0,120.0,140.0
subtraction,0.0,,0.0,0.0,0.0
multiplication,6400.0,,8100.0,3600.0,4900.0
division,1.0,,1.0,1.0,1.0


to remove the Nan , use the Operation Method

In [75]:
sr_add=student1.add(student3,fill_value=0) #adding the two series


In [76]:
sr_sub=student1.sub(student3,fill_value=0) #subing the two series

In [77]:
sr_div=student1.div(student3,fill_value=0) #diving the two series

In [78]:
sr_mul=student1.mul(student3,fill_value=0) #muling the two series

In [79]:
result4=pd.DataFrame([sr_add,sr_sub,sr_mul,sr_div],index=['addition','subtraction','multiplication','division'])
result4

Unnamed: 0,eng,kor,math,music,science
addition,160.0,100.0,180.0,120.0,140.0
subtraction,0.0,100.0,0.0,0.0,0.0
multiplication,6400.0,0.0,8100.0,3600.0,4900.0
division,1.0,inf,1.0,1.0,1.0


<h1>DataFrame Operations

In [80]:
import seaborn as sns
iris=sns.load_dataset('iris')

In [81]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [82]:
#selecting specific columns from the irirs dataset
df=iris.loc[:,['sepal_length','sepal_width']]  #all rows and specific columns
df.head()

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


while maintaining the form of the existing dataframe , only the element value is replaced with a new
calculated value and returned as a new DataFrame object

In [83]:
addition=df+10
addition.head()

Unnamed: 0,sepal_length,sepal_width
0,15.1,13.5
1,14.9,13.0
2,14.7,13.2
3,14.6,13.1
4,15.0,13.6


In [84]:
subtraction=df-10
subtraction.iloc[50:100]



Unnamed: 0,sepal_length,sepal_width
50,-3.0,-6.8
51,-3.6,-6.8
52,-3.1,-6.9
53,-4.5,-7.7
54,-3.5,-7.2
55,-4.3,-7.2
56,-3.7,-6.7
57,-5.1,-7.6
58,-3.4,-7.1
59,-4.8,-7.3


If a value is missing ,Nan is used inplace of the result of the arithmetic operation

<h1>Merging and Binding dataframes

In [85]:
import pandas as pd


# Personal Information DataFrame
df_personal = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Grace', 'Hannah'],
    'gender': ['Female', 'Male', 'Male', 'Male', 'Female', 'Female'],
    'age': [25, 30, 28, 35, 32, 29]
})

# Job Information DataFrame
df_job = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Eve', 'Frank', 'Grace', 'Ivan'],
    'position': ['Engineer', 'Manager', 'Analyst', 'Designer', 'HR', 'Technician'],
    'wage': [75000, 85000, 62000, 70000, 68000, 72000]
})


In [86]:
inner=df_personal.merge(df_job, on='name', how='inner')
outer=df_personal.merge(df_job, on='name', how='outer')
right=df_personal.merge(df_job, on='name', how='right')
left=df_personal.merge(df_job, on='name', how='left')

print("Inner join:\n",inner,"\n")
print("Outer join:\n",outer,"\n")
print("left join:\n",left,"\n")
print("right join:\n",right,"\n")

Inner join:
     name  gender  age  position   wage
0  Alice  Female   25  Engineer  75000
1    Bob    Male   30   Manager  85000
2  Grace  Female   32        HR  68000 

Outer join:
       name  gender   age    position     wage
0    Alice  Female  25.0    Engineer  75000.0
1      Bob    Male  30.0     Manager  85000.0
2  Charlie    Male  28.0         NaN      NaN
3    David    Male  35.0         NaN      NaN
4      Eve     NaN   NaN     Analyst  62000.0
5    Frank     NaN   NaN    Designer  70000.0
6    Grace  Female  32.0          HR  68000.0
7   Hannah  Female  29.0         NaN      NaN
8     Ivan     NaN   NaN  Technician  72000.0 

left join:
       name  gender  age  position     wage
0    Alice  Female   25  Engineer  75000.0
1      Bob    Male   30   Manager  85000.0
2  Charlie    Male   28       NaN      NaN
3    David    Male   35       NaN      NaN
4    Grace  Female   32        HR  68000.0
5   Hannah  Female   29       NaN      NaN 

right join:
     name  gender   age    

Concat

In [87]:
#bind vertically by matching the column names

pd.concat([df_personal,df_job],axis=0) #concatenating the two dataframes


Unnamed: 0,name,gender,age,position,wage
0,Alice,Female,25.0,,
1,Bob,Male,30.0,,
2,Charlie,Male,28.0,,
3,David,Male,35.0,,
4,Grace,Female,32.0,,
5,Hannah,Female,29.0,,
0,Alice,,,Engineer,75000.0
1,Bob,,,Manager,85000.0
2,Eve,,,Analyst,62000.0
3,Frank,,,Designer,70000.0


In [88]:
#bind horizontally by matching the indices
pd.concat([df_personal,df_job],axis=1) #concatenating the two dataframes

Unnamed: 0,name,gender,age,name.1,position,wage
0,Alice,Female,25,Alice,Engineer,75000
1,Bob,Male,30,Bob,Manager,85000
2,Charlie,Male,28,Eve,Analyst,62000
3,David,Male,35,Frank,Designer,70000
4,Grace,Female,32,Grace,HR,68000
5,Hannah,Female,29,Ivan,Technician,72000


In [89]:
#if the axis is not specified , it will be concatenated vertically by default
#Without ignore_index: keeps the original row indexes (can lead to duplicate index values).

#With ignore_index=True: resets the index to 0, 1, 2, … for the new DataFrame.


pd.concat([df_personal,df_job],axis=0,ignore_index=True) #concatenating the two dataframes
#using the join parameter to specify how to join the dataframes


Unnamed: 0,name,gender,age,position,wage
0,Alice,Female,25.0,,
1,Bob,Male,30.0,,
2,Charlie,Male,28.0,,
3,David,Male,35.0,,
4,Grace,Female,32.0,,
5,Hannah,Female,29.0,,
6,Alice,,,Engineer,75000.0
7,Bob,,,Manager,85000.0
8,Eve,,,Analyst,62000.0
9,Frank,,,Designer,70000.0


<h1>Dataframe Manipulation


In [None]:
#it is possible to sort the rows of a dataframe using one or more columns
import pandas as pd

# Sample data
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [25, 30, 35, 28],
    'bloodtype': ['B', 'O', 'A', 'AB']
}

# Create DataFrame
df = pd.DataFrame(data)

# Sort by bloodtype
sorted_df = df.sort_values(by='bloodtype')

# Display the sorted DataFrame
print(sorted_df)



      name  age bloodtype
2  Charlie   35         A
3    David   28        AB
0    Alice   25         B
1      Bob   30         O


Unnamed: 0,name,age,bloodtype
2,Charlie,35,A
3,David,28,AB
0,Alice,25,B
1,Bob,30,O


In [96]:
#descending order values
sorted_df_desc=df.sort_values(by='bloodtype', ascending=False)
print(sorted_df_desc)

      name  age bloodtype
1      Bob   30         O
0    Alice   25         B
3    David   28        AB
2  Charlie   35         A


In [103]:
#sort using two columns

#The age column is used only when there are ties in the bloodtype values — that is, when multiple rows have the same blood type, it will then sort them by age.

sorted_df_multi=df.sort_values(by=['bloodtype','age'],ascending=True)
print(sorted_df_multi)

      name  age bloodtype
2  Charlie   35         A
3    David   28        AB
0    Alice   25         B
1      Bob   30         O


In [115]:
#hiearrchical Indexing

# Define column headers for the DataFrame
# Create outer index labels: ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
# Create inner index labels: [1, 2, 3, 1, 2, 3]
# Combine outer and inner index into tuple pairs using zip
# Create a MultiIndex from the tuple pairs for hierarchical indexing
# Create a DataFrame with random data, hierarchical index, and column headers
# Resulting DataFrame has two levels of row indexes (MultiIndex)
# Level 0: Outer group (e.g., G1, G2)
# Level 1: Inner subgroup (e.g., 1, 2, 3)
# This allows advanced data organization and multi-level access


my_header=['a','b','c']
my_index_out=['G1']*3 + ['G2']*3
my_index_in=[1,2,3]*2
my_index_zipped = list(zip(my_index_out, my_index_in))
my_index = pd.MultiIndex.from_tuples(my_index_zipped)
dff=pd.DataFrame(data=np.random.randn(6,3),index=my_index,columns=my_header)  ##random from noraml distribution with 6 rows and 3 columns

dff

Unnamed: 0,Unnamed: 1,a,b,c
G1,1,1.301323,1.360109,-0.388103
G1,2,1.45042,-1.259616,-2.43933
G1,3,-0.722891,-0.543939,-0.132258
G2,1,0.590993,0.772657,-0.400129
G2,2,-1.90889,1.08986,-0.341498
G2,3,-0.602121,1.077986,1.094075


<h1>Dataframe Summarization

In [4]:
import pandas as pd
df=pd.read_csv('C:/Users/bened/Documents/Alyster Coding/CODING/AIML/Datasets/iris/Iris.csv')

In [11]:
df.head(10)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
6,7,4.6,3.4,1.4,0.3,Iris-setosa
7,8,5.0,3.4,1.5,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
9,10,4.9,3.1,1.5,0.1,Iris-setosa


grouping and summarizing

In [None]:
df.groupby(['Species','PetalWidthCm'])['PetalLengthCm'].mean()

#this means that among all the Iris-setosa flowers that have a PetalWidth of 0.2 cm,The average PetalLength is 1.442857 cm.

Species          PetalWidthCm
Iris-setosa      0.1             1.416667
                 0.2             1.442857
                 0.3             1.428571
                 0.4             1.571429
                 0.5             1.700000
                 0.6             1.600000
Iris-versicolor  1.0             3.628571
                 1.1             3.566667
                 1.2             4.240000
                 1.3             4.176923
                 1.4             4.500000
                 1.5             4.580000
                 1.6             4.766667
                 1.7             5.000000
                 1.8             4.800000
Iris-virginica   1.4             5.600000
                 1.5             5.050000
                 1.6             5.800000
                 1.7             4.500000
                 1.8             5.381818
                 1.9             5.320000
                 2.0             5.550000
                 2.1             5.783333
    

<h1>pivoting

Manipulate the indices and the columns and then summarise

In [16]:
my_dict={'Size': ['L','L','M','M','M','S','S','S','S'],
         'Type': ['A','A','A','B','B','A','A','B','B'],
         'Location': ['L1','L1','L1','L2','L2','L1','L2','L2','L1'],
         'A': [1,2,2,3,3,4,5,6,7],
         'B': [2,4,5,5,6,6,8,9,9]}
df=pd.DataFrame(my_dict)
df

Unnamed: 0,Size,Type,Location,A,B
0,L,A,L1,1,2
1,L,A,L1,2,4
2,M,A,L1,2,5
3,M,B,L2,3,5
4,M,B,L2,3,6
5,S,A,L1,4,6
6,S,A,L2,5,8
7,S,B,L2,6,9
8,S,B,L1,7,9


![image.png](attachment:image.png)

In [17]:
dfr=pd.pivot_table(df,index=['Size','Type'],columns='Location',values='B')
dfr

Unnamed: 0_level_0,Location,L1,L2
Size,Type,Unnamed: 2_level_1,Unnamed: 3_level_1
L,A,3.0,
M,A,5.0,
M,B,,5.5
S,A,6.0,8.0
S,B,9.0,9.0


In [19]:
dfr.columns

Index(['L1', 'L2'], dtype='object', name='Location')

In [21]:
dfr.index

MultiIndex([('L', 'A'),
            ('M', 'A'),
            ('M', 'B'),
            ('S', 'A'),
            ('S', 'B')],
           names=['Size', 'Type'])

fill value argument

In [24]:
pd.pivot_table(df,index=['Size','Type'],columns='Location',values='B',fill_value=0)

Unnamed: 0_level_0,Location,L1,L2
Size,Type,Unnamed: 2_level_1,Unnamed: 3_level_1
L,A,3.0,0.0
M,A,5.0,0.0
M,B,0.0,5.5
S,A,6.0,8.0
S,B,9.0,9.0
