In [2]:
#pandas piplelines 

# ------------------------------------------------------------------------------
# Pandas Series Overview
# ------------------------------------------------------------------------------
# A Pandas Series is a one-dimensional labeled array capable of holding any data type.
# It is similar to a column in a spreadsheet or database table.
#
# Example:
# import pandas as pd
# s = pd.Series([100, 200, 300], index=['a', 'b', 'c'])
#
# Features:
# - Labeled Indexing:
#     You can access elements by custom labels (e.g., s['b'] gives 200).
# - Vectorized Operations:
#     Supports operations like s + 10 or s * 2 applied to each element.
# - Data Types:
#     Can hold integers, floats, strings, or even Python objects.
# - Dictionary Input:
#     Series can be created from a dictionary, where keys become index labels.
#     Example: pd.Series({'a': 10, 'b': 20})
#
# Use Cases:
# - When you need a single labeled column of data.
# - Acts as a building block for DataFrames (which are 2D structures).
# ------------------------------------------------------------------------------



In [3]:
#Creating Series using list

import pandas as pd

pd.Series(['Male','Female','Male','Male','Female','Female','Female'])


0      Male
1    Female
2      Male
3      Male
4    Female
5    Female
6    Female
dtype: object

In [4]:
#creating list using array
import numpy as np
ser=pd.Series(np.array(['Male','Female','Male','Male','Female','Female','Female']))
print(type(ser))
print(ser)

<class 'pandas.core.series.Series'>
0      Male
1    Female
2      Male
3      Male
4    Female
5    Female
6    Female
dtype: object


In [5]:
#creating series using dictionary

#key is used as the index and value=value
dict_data={'a':1,'b':2,'c':3}
ser1=pd.Series(dict_data)
print(ser1)
print()
print(ser1.index)
print(ser1.values)

a    1
b    2
c    3
dtype: int64

Index(['a', 'b', 'c'], dtype='object')
[1 2 3]


<h1>Dataframe

In [6]:
# ------------------------------------------------------------------------------
# Pandas DataFrame Overview
# ------------------------------------------------------------------------------
# A Pandas DataFrame is a 2-dimensional, labeled data structure similar to a table,
# spreadsheet, or SQL result set.
#
# Structure:
# - It consists of rows and columns.
# - Each column is essentially a Pandas Series.
# - Columns can hold different data types (int, float, string, etc.).
#
# Example:
# import pandas as pd
# data = {
#     'Name': ['Alice', 'Bob', 'Charlie'],
#     'Age': [25, 30, 22],
#     'City': ['New York', 'London', 'Paris']
# }
# df = pd.DataFrame(data)
#
# Common Operations:
# - df['Age']           -> Access a column (returns a Series)
# - df.loc[1]           -> Access row by label
# - df.iloc[1]          -> Access row by position
# - df['Age'].mean()    -> Calculate average of Age column
#
# Use Cases:
# - Storing and analyzing structured data
# - Importing/exporting data from CSV, Excel, SQL, etc.
# - Performing data cleaning, filtering, aggregation, and visualization
# ------------------------------------------------------------------------------


In [7]:
df=pd.DataFrame({'Name':["alyster","alvin","alex","tanay"],
              'Age':[23,24,25,26],
              'Sex': ["MALE","MALE","MALE","MALE"]})
print(df)

      Name  Age   Sex
0  alyster   23  MALE
1    alvin   24  MALE
2     alex   25  MALE
3    tanay   26  MALE


In [8]:
#extracting single column from the dataframe(i.e printing series)
print(df['Name'])
print()
print(df['Name'][2])

0    alyster
1      alvin
2       alex
3      tanay
Name: Name, dtype: object

alex


In [9]:
#dataframe when column name is not specified
#since column name is not specified , it apprears as Range Index object
pd.DataFrame([[123,456,789],[213,565,852],['akl','bsg','cfg']])

Unnamed: 0,0,1,2
0,123,456,789
1,213,565,852
2,akl,bsg,cfg


#series operations

In [10]:
eyes=pd.Series({'hazel':200,'blue':45,'green':789})
print(eyes)


hazel    200
blue      45
green    789
dtype: int64


In [11]:
#attributes of Series object
print(eyes.index)
eyes.name='eye_color'
print(eyes.name)


print(eyes.values)

print()
print(eyes.sort_values)
print(eyes.sort_index)

print(eyes.unique())
print(eyes.nunique())


Index(['hazel', 'blue', 'green'], dtype='object')
eye_color
[200  45 789]

<bound method Series.sort_values of hazel    200
blue      45
green    789
Name: eye_color, dtype: int64>
<bound method Series.sort_index of hazel    200
blue      45
green    789
Name: eye_color, dtype: int64>
[200  45 789]
3


In [12]:
#duplicate values

my_data2=[220,215,93,64,64]
eye2=pd.Series(data=my_data2,index=['brown','blue','blue','hazel','green'])
print(eye2)

brown    220
blue     215
blue      93
hazel     64
green     64
dtype: int64


In [13]:
eye2.unique()

array([220, 215,  93,  64], dtype=int64)

In [14]:
eye2.value_counts()

64     2
220    1
215    1
93     1
Name: count, dtype: int64

In [15]:
eye2.index

Index(['brown', 'blue', 'blue', 'hazel', 'green'], dtype='object')

In [16]:
eye2.nunique()

4

In [17]:
#Series can be indexed and sliced like numpy arrays
ser3=pd.Series([1,5,9,8,7,4,6,3,5,7,8],index=['a','b','c','d','e','f','g','h','i','j','k'])


print(ser3['d']) #value at index d
print()
print(ser3[1:4]) #values from index 1 to 4

8

b    5
c    9
d    8
dtype: int64


In [18]:
#Series Operations:
# - Arithmetic operations (addition, subtraction, etc.) are vectorized.
# - You can perform operations on Series with different lengths.
import pandas as pd

# Create a Series with custom index
s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])

# Display original series
print("Original Series:")
print(s)

# Accessing elements
print("\nElement at index 'c':", s['c'])

# Vectorized arithmetic operation (add 5 to each element)
print("\nSeries after adding 5:")
print(s + 5)

# Filtering elements greater than 25
print("\nElements greater than 25:")
print(s[s > 25])




Original Series:
a    10
b    20
c    30
d    40
e    50
dtype: int64

Element at index 'c': 30

Series after adding 5:
a    15
b    25
c    35
d    45
e    55
dtype: int64

Elements greater than 25:
c    30
d    40
e    50
dtype: int64


In [19]:
import pandas as pd

# ------------------------------------------------------------------------------
# 🧮 Series Arithmetic - Matching Indexes
# ------------------------------------------------------------------------------

# Two Series with exactly matching indexes
s1_match = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2_match = pd.Series([1, 2, 3], index=['a', 'b', 'c'])



# Arithmetic operations
print("\nAddition:")
print(s1_match + s2_match)

print("\nSubtraction:")
print(s1_match - s2_match)

print("\nMultiplication:")
print(s1_match * s2_match)

print("\nDivision:")
print(s1_match / s2_match)


# ------------------------------------------------------------------------------
# ⚠️ Series Arithmetic - Non-Matching Indexes
# ------------------------------------------------------------------------------

# Two Series with different indexes (partial overlap)
s1_mismatch = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2_mismatch = pd.Series([1, 2, 3], index=['b', 'c', 'd'])



# Arithmetic operations result in NaN for unmatched indexes
print("\nAddition:")
print(s1_mismatch + s2_mismatch)

print("\nSubtraction:")
print(s1_mismatch - s2_mismatch)

print("\nMultiplication:")
print(s1_mismatch * s2_mismatch)

print("\nDivision:")
print(s1_mismatch / s2_mismatch)

# Using fill_value to handle NaNs by treating missing as 0
print("\nAddition with fill_value=0:")
print(s1_mismatch.add(s2_mismatch, fill_value=0))



Addition:
a    11
b    22
c    33
dtype: int64

Subtraction:
a     9
b    18
c    27
dtype: int64

Multiplication:
a    10
b    40
c    90
dtype: int64

Division:
a    10.0
b    10.0
c    10.0
dtype: float64

Addition:
a     NaN
b    21.0
c    32.0
d     NaN
dtype: float64

Subtraction:
a     NaN
b    19.0
c    28.0
d     NaN
dtype: float64

Multiplication:
a     NaN
b    20.0
c    60.0
d     NaN
dtype: float64

Division:
a     NaN
b    20.0
c    15.0
d     NaN
dtype: float64

Addition with fill_value=0:
a    10.0
b    21.0
c    32.0
d     3.0
dtype: float64


In [20]:
# ------------------------------------------------------------------------------
# Explanation of Statistical Functions in Pandas Series
# ------------------------------------------------------------------------------

# sum(): 
#   Returns the sum of all elements in the Series.

# mean(): 
#   Returns the average (mean) of the elements in the Series.

# median(): 
#   Returns the median (middle value when sorted) of the elements in the Series.

# std(): 
#   Returns the standard deviation, which shows how spread out the values are in the Series.

# var(): 
#   Returns the variance, which is the square of the standard deviation of the Series.

# min(): 
#   Returns the smallest element in the Series.

# max(): 
#   Returns the largest element in the Series.

# count(): 
#   Returns the number of non-null elements in the Series.

# describe(): 
#   Provides a summary of the Series with key statistics:
#   - count: Number of non-null elements
#   - mean: Average of the elements
#   - std: Standard deviation
#   - min: Minimum value
#   - 25%: 25th percentile
#   - 50%: Median (50th percentile)
#   - 75%: 75th percentile
#   - max: Maximum value
# ------------------------------------------------------------------------------



In [21]:
import pandas as pd

# ------------------------------------------------------------------------------
# Creating a Series for Statistical Operations
# ------------------------------------------------------------------------------

# Create a simple Series
s = pd.Series([10, 20, 30, 40, 50])

print("Series:")
print(s)

# ------------------------------------------------------------------------------
# Statistical Operations on Series
# ------------------------------------------------------------------------------

# Sum of the elements
sum_val = s.sum()
print("\nSum of Series:", sum_val)

# Mean (average) of the elements
mean_val = s.mean()
print("Mean of Series:", mean_val)

# Median of the elements
median_val = s.median()
print("Median of Series:", median_val)

# Standard deviation of the elements
std_dev = s.std()
print("Standard Deviation of Series:", std_dev)

# Variance of the elements
variance_val = s.var()
print("Variance of Series:", variance_val)

# Minimum value in the Series
min_val = s.min()
print("Minimum value of Series:", min_val)

# Maximum value in the Series
max_val = s.max()
print("Maximum value of Series:", max_val)

# Count of non-null elements
count_val = s.count()
print("Count of non-null elements in Series:", count_val)

# Description of basic statistical properties
print("\nDescription of Series:")
print(s.describe())


Series:
0    10
1    20
2    30
3    40
4    50
dtype: int64

Sum of Series: 150
Mean of Series: 30.0
Median of Series: 30.0
Standard Deviation of Series: 15.811388300841896
Variance of Series: 250.0
Minimum value of Series: 10
Maximum value of Series: 50
Count of non-null elements in Series: 5

Description of Series:
count     5.000000
mean     30.000000
std      15.811388
min      10.000000
25%      20.000000
50%      30.000000
75%      40.000000
max      50.000000
dtype: float64


<h1>Lambda Function in Series

In [22]:
import pandas as pd

# Create a Series
s = pd.Series([10, 20, 30, 40, 50])

# Apply a lambda function to each element: multiply each value by 2
result = s.apply(lambda x: x * 2)

print("Original Series:")
print(s)

print("\nSeries after applying lambda function (x * 2):")
print(result)


Original Series:
0    10
1    20
2    30
3    40
4    50
dtype: int64

Series after applying lambda function (x * 2):
0     20
1     40
2     60
3     80
4    100
dtype: int64


In [23]:
# Create a Series
s = pd.Series([10, 20, 30, 40, 50])

# Apply a lambda function to categorize the numbers
result = s.apply(lambda x: 'High' if x > 30 else 'Low')

print("\nSeries with conditional categories (High/Low):")
print(result)



Series with conditional categories (High/Low):
0     Low
1     Low
2     Low
3    High
4    High
dtype: object


In [24]:
# Create a Series
s = pd.Series([10, 20, 30, 40, 50])

# Map function with a lambda that adds 5 to each element
result = s.map(lambda x: x + 5)

print("\nSeries after using map with lambda (x + 5):")
print(result)



Series after using map with lambda (x + 5):
0    15
1    25
2    35
3    45
4    55
dtype: int64


Reading CSV

In [26]:
iris = pd.read_csv('C:/Users/bened/Documents/Alyster Coding/CODING/AIML/Datasets/iris/Iris.csv')

In [27]:
type(iris)

pandas.core.frame.DataFrame

In [29]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [31]:
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [32]:
iris.tail()
#last 5 values of the dataset

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [35]:
iris.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [39]:
iris.index

RangeIndex(start=0, stop=150, step=1)

In [41]:
iris.columns = ['ID','Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

In [42]:
iris.columns

Index(['ID', 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width',
       'Species'],
      dtype='object')

In [43]:
iris.Sepal_Length

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: Sepal_Length, Length: 150, dtype: float64

In [46]:
iris[['Petal_Length', 'Petal_Width']]

Unnamed: 0,Petal_Length,Petal_Width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


In [48]:
import pandas as pd

# Original Series with default index
s = pd.Series([100, 200, 300], index=['a', 'b', 'c'])
print("Original Series:")
print(s)

# Change index names
s.index = ['x', 'y', 'z']
print("\nSeries with Changed Index Names:")
print(s)


Original Series:
a    100
b    200
c    300
dtype: int64

Series with Changed Index Names:
x    100
y    200
z    300
dtype: int64


In [None]:
#inplace is used to change the original series else it will create a new series

s.rename(index={'x': 'first', 'y': 'second', 'z': 'third'}, inplace=True)
print("\nSeries with Renamed Index:")
print(s)


Series with Renamed Index:
first     100
second    200
third     300
dtype: int64


In [58]:
import pandas as pd

# Create a simple DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})

print("Original DataFrame:")
print(df)

# Rename columns
df.rename(columns={'A': 'Column1', 'B': 'Column2'}, inplace=True)
print("\nDataFrame with Renamed Columns:")
print(df)


Original DataFrame:
   A  B
0  1  4
1  2  5
2  3  6

DataFrame with Renamed Columns:
   Column1  Column2
0        1        4
1        2        5
2        3        6
