# **IDS Lab 03 (Pandas)**

 **Ahmad Faisal**
 
  **BSDSF21A034**

* **Task 01:** Data Inspection and Missing Value Handling

In [None]:
import pandas as pd

# Load the dataset (Iris.csv is read from the notebook's working directory)
iris_data = pd.read_csv("Iris.csv")

# Inspect the dataset
print("Dataset Preview:")
print(iris_data.head())
print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
iris_data.info()

# Check for missing values
missing_values = iris_data.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# Handle missing values in numeric columns by substituting the column mean.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on iris_data[col] operates on an intermediate object,
# which triggers a chained-assignment warning on recent pandas and leaves
# iris_data unchanged when Copy-on-Write is enabled.
numeric_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
for col in numeric_columns:
    iris_data[col] = iris_data[col].fillna(iris_data[col].mean())

# Handle missing values in the species column with the most frequent label.
# mode() returns an empty Series when every value is missing, so guard the
# [0] lookup instead of letting it raise IndexError.
species_mode = iris_data['Species'].mode()
if not species_mode.empty:
    iris_data['Species'] = iris_data['Species'].fillna(species_mode[0])
print("\nMissing values handled.")


Dataset Preview:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
mem

* **Task 02:** Data Cleaning and Transformation

In [8]:
# De-duplicate the frame and report how many rows were discarded
before_dedup = len(iris_data)
iris_data = iris_data.drop_duplicates()
after_dedup = len(iris_data)
print(f"\nDuplicates Removed: {before_dedup - after_dedup}")

# Derive the petal area as the element-wise product of petal length and width
iris_data['PetalArea'] = iris_data['PetalLengthCm'].mul(iris_data['PetalWidthCm'])
print("\nNew Column 'PetalArea' Added.")
print("\n", iris_data.head())

# Discard every row that still contains at least one missing value
iris_data = iris_data.dropna(axis=0, how='any')
print("\nRemaining Missing Values Dropped.")



Duplicates Removed: 0

New Column 'PetalArea' Added.

    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
0   1            5.1           3.5            1.4           0.2  Iris-setosa   
1   2            4.9           3.0            1.4           0.2  Iris-setosa   
2   3            4.7           3.2            1.3           0.2  Iris-setosa   
3   4            4.6           3.1            1.5           0.2  Iris-setosa   
4   5            5.0           3.6            1.4           0.2  Iris-setosa   

   PetalArea  
0       0.28  
1       0.28  
2       0.26  
3       0.30  
4       0.28  

Remaining Missing Values Dropped.


* **Task 03:** Aggregation and Transformation

In [15]:
# Encode each species label as its integer category code in a new column
species_as_category = iris_data['Species'].astype('category')
iris_data['SpeciesNumeric'] = species_as_category.cat.codes
print("\nSpecies Column Converted to Numeric.")
print("\n", iris_data.head())

# Per-species mean of every measurement column (including the derived area)
numeric_columns = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
    'PetalArea',
]
grouped_means = iris_data.groupby('Species')[numeric_columns].mean()
print("\nMean Values Grouped by Species:")
print(grouped_means)



Species Column Converted to Numeric.

    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
0   1            5.1           3.5            1.4           0.2  Iris-setosa   
1   2            4.9           3.0            1.4           0.2  Iris-setosa   
2   3            4.7           3.2            1.3           0.2  Iris-setosa   
3   4            4.6           3.1            1.5           0.2  Iris-setosa   
4   5            5.0           3.6            1.4           0.2  Iris-setosa   

   PetalArea  SpeciesNumeric  
0       0.28               0  
1       0.28               0  
2       0.26               0  
3       0.30               0  
4       0.28               0  

Mean Values Grouped by Species:
                 SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
Species                                                                     
Iris-setosa              5.006         3.418          1.464         0.244   
Iris-versicolor          5.93

* **Task 04:** Advanced Reshaping

In [17]:
# Unpivot the four measurement columns into (MeasurementType, MeasurementValue)
# pairs, keeping the species label and its numeric code on every row.
iris_long = pd.melt(
    iris_data,
    id_vars=['Species', 'SpeciesNumeric'],
    value_vars=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
    value_name='MeasurementValue',
    var_name='MeasurementType',
)

print("\nReshaped Dataset (Long Format):")
print(iris_long)



Reshaped Dataset (Long Format):
            Species  SpeciesNumeric MeasurementType  MeasurementValue
0       Iris-setosa               0   SepalLengthCm               5.1
1       Iris-setosa               0   SepalLengthCm               4.9
2       Iris-setosa               0   SepalLengthCm               4.7
3       Iris-setosa               0   SepalLengthCm               4.6
4       Iris-setosa               0   SepalLengthCm               5.0
..              ...             ...             ...               ...
595  Iris-virginica               2    PetalWidthCm               2.3
596  Iris-virginica               2    PetalWidthCm               1.9
597  Iris-virginica               2    PetalWidthCm               2.0
598  Iris-virginica               2    PetalWidthCm               2.3
599  Iris-virginica               2    PetalWidthCm               1.8

[600 rows x 4 columns]
