<a href="https://colab.research.google.com/github/Eshwar-Naidus/task-1/blob/main/Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Banner so the captured output shows which task produced it.
print("--- Task 2: Exploratory Data Analysis (EDA) ---")

# --- 1. Import Dataset ---
# Iris is requested from Seaborn by name, so this script reads no CSV itself.
# NOTE(review): load_dataset is documented as fetching from seaborn's online
# example-data repository and caching the result, so a first run may need
# network access -- confirm before relying on this offline.
df = sns.load_dataset('iris')
print("Successfully loaded the Iris dataset.")

# --- 2. Basic Info & Descriptive Statistics ---
# DataFrame.info() writes its own report (column dtypes, non-null counts,
# memory usage), so it is called bare rather than wrapped in print().
print("\n--- Basic Dataset Info (Nulls, Data Types) ---")
df.info()

# describe() returns a summary table, so it has to be printed explicitly.
print("\n--- Descriptive Statistics ---")
summary_stats = df.describe()
print(summary_stats)

# --- 3. Visualize Distributions (Univariate) & 4. Relationships (Bivariate) ---
# A single pairplot covers both steps:
#   * diagonal panels     -> per-feature distribution (univariate view)
#   * off-diagonal panels -> feature-vs-feature scatter plots (bivariate view)
# NOTE(review): because `hue` is set, seaborn's default diagonal is expected to
# be per-species density curves rather than histograms -- confirm against the
# saved figure, and pass diag_kind='hist' if histograms are required.
print("\nGenerating Pairplot (Univariate & Bivariate analysis)...")
pair_plot = sns.pairplot(
    data=df,
    hue='species',            # colour every point by its species
    markers=["o", "s", "D"],  # one marker shape per species
)
pair_plot.savefig('iris_pairplot.png')
print(f"Saved 'iris_pairplot.png' to {os.path.abspath('iris_pairplot.png')}")

# --- 4. Explore Relationships (Correlation Heatmap) ---
# Correlation is only defined for numerical features, so the numeric columns
# are selected by dtype instead of dropping 'species' by name. For Iris this
# yields the same four measurement columns, and it keeps working if further
# non-numeric columns are ever added to df.
print("\nGenerating Correlation Heatmap...")
numerical_df = df.select_dtypes(include='number')
corr_matrix = numerical_df.corr()

# Start a fresh figure so the heatmap is not drawn onto an earlier plot.
plt.figure(figsize=(8, 6))
# annot/fmt write each coefficient into its cell, rounded to two decimals.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix of Iris Features')
plt.savefig('iris_correlation_heatmap.png')
print(f"Saved 'iris_correlation_heatmap.png' to {os.path.abspath('iris_correlation_heatmap.png')}")

# --- 5. Analyze Categorical Data (Count Plot) ---
# One bar per species, showing how many rows belong to each class.
print("\nGenerating Categorical Count Plot...")
plt.figure(figsize=(6, 5))
# Passing `palette` without `hue` is deprecated (seaborn warns it will be
# removed in v0.14.0). Mapping the x variable to `hue` and suppressing the
# redundant legend is the replacement the warning itself recommends and gives
# the same per-bar colouring.
sns.countplot(data=df, x='species', hue='species', palette='viridis', legend=False)
plt.title('Distribution of Iris Species')
plt.savefig('iris_species_countplot.png')
print(f"Saved 'iris_species_countplot.png' to {os.path.abspath('iris_species_countplot.png')}")

# --- 6. Identify Outliers (Boxplots) ---
# One boxplot per numerical feature, split by species, so unusual values and
# between-species differences can be read from the same figure.
print("\nGenerating Boxplots (for Outlier Detection)...")

# (column to plot, panel title), listed in the order the panels fill the
# 2x2 grid: left-to-right, then top-to-bottom.
boxplot_panels = (
    ('sepal_length', 'Sepal Length by Species'),
    ('sepal_width', 'Sepal Width by Species'),
    ('petal_length', 'Petal Length by Species'),
    ('petal_width', 'Petal Width by Species'),
)

plt.figure(figsize=(12, 10))
# plt.subplot positions are 1-based, hence start=1.
for panel_no, (feature, panel_title) in enumerate(boxplot_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.boxplot(data=df, x='species', y=feature)
    plt.title(panel_title)

# Re-space the panels so titles and axis labels do not overlap.
plt.tight_layout()
plt.savefig('iris_boxplots.png')
print(f"Saved 'iris_boxplots.png' to {os.path.abspath('iris_boxplots.png')}")

print("\n--- EDA Task 2 Complete. All 4 plots have been saved. ---")

--- Task 2: Exploratory Data Analysis (EDA) ---
Successfully loaded the Iris dataset.

--- Basic Dataset Info (Nulls, Data Types) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

--- Descriptive Statistics ---
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.8000


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='species', palette='viridis')
