# Data Loading, Analysis, and Visualization using Pandas and Matplotlib

This notebook fulfills the assignment requirements:

- **Task 1**: Load and explore dataset  
- **Task 2**: Basic data analysis  
- **Task 3**: Data visualization (Line chart, Bar chart, Histogram, Scatter plot)  

Dataset: **Iris dataset** (loaded with `sklearn.datasets.load_iris`)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# ===============================
# Task 1: Load and Explore Dataset
# ===============================

# Load the Iris dataset through scikit-learn. With `as_frame=True` the returned
# object exposes `.frame`, a DataFrame that holds the measurement columns and
# the numeric "target" column used by the later cells.
#
# A loading failure is reported and then re-raised: swallowing it would leave
# `df` and `iris` undefined, and the notebook would only fail further down
# with an unrelated-looking NameError.
try:
    iris = load_iris(as_frame=True)  # load as dataframe
except FileNotFoundError:
    print("Error: Dataset file not found.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

df = iris.frame
print("Dataset loaded successfully!\n")

# Display first few rows
print("First 5 rows of dataset:")
print(df.head())

# Check structure and missing values
print("\nDataset Info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

print("\nMissing Values:")
print(df.isnull().sum())

# Handle missing values (if any). dropna() returns a new frame, so columns
# added to `df` later do not modify `iris.frame`.
df = df.dropna()


In [None]:
# ===============================
# Task 2: Basic Data Analysis
# ===============================

# Basic statistics (describe() summarises the numeric columns only)
print("\nDescriptive Statistics:")
print(df.describe())

# Replace target numbers with species names for clarity.
# This is done BEFORE grouping so that the grouped table is labelled by
# species name, and so that re-running this cell is safe: assigning the
# column again simply overwrites it with the same values.
df["species"] = df["target"].map(dict(enumerate(iris.target_names)))

# Grouping: mean of numerical columns per species.
# The numeric "target" code is dropped first, which leaves only the
# measurement columns to average; the string "species" column is the group key,
# so mean() never has to aggregate a non-numeric column.
grouped = df.drop(columns="target").groupby("species").mean()
print("\nMean values grouped by species:")
print(grouped)

# Pattern observations, backed by the table of per-feature maxima printed below
print("\nObservations:")
print("Setosa flowers generally have smaller petal length/width compared to Versicolor and Virginica.")
print("Virginica has the largest average petal length, petal width and sepal length; "
      "Setosa has the largest average sepal width.")

# Species holding the largest mean for each feature, computed from the data
# so the statements above can be checked against the actual numbers.
print("\nSpecies with the largest mean per feature:")
print(grouped.idxmax())


In [None]:
# ===============================
# Task 3: Data Visualization
# ===============================

plt.style.use("seaborn-v0_8")

# 1. Line Chart – sepal length plotted against the row index
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df.index, df["sepal length (cm)"], label="Sepal Length", color="blue")
ax.set_title("Line Chart: Sepal Length Trend")
ax.set_xlabel("Index")
ax.set_ylabel("Sepal Length (cm)")
ax.legend()
plt.show()

# 2. Bar Chart – mean petal length of each species
mean_petal_length = df.groupby("species")["petal length (cm)"].mean()
fig, ax = plt.subplots(figsize=(6, 4))
mean_petal_length.plot(kind="bar", color=["blue", "green", "orange"], ax=ax)
ax.set_title("Bar Chart: Average Petal Length per Species")
ax.set_xlabel("Species")
ax.set_ylabel("Average Petal Length (cm)")
plt.show()

# 3. Histogram – how sepal width is distributed over 15 bins
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df["sepal width (cm)"], bins=15, color="purple", edgecolor="black")
ax.set_title("Histogram: Sepal Width Distribution")
ax.set_xlabel("Sepal Width (cm)")
ax.set_ylabel("Frequency")
plt.show()

# 4. Scatter Plot – sepal length against petal length, coloured by species
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df,
    x="sepal length (cm)",
    y="petal length (cm)",
    hue="species",
    palette="Set1",
    ax=ax,
)
ax.set_title("Scatter Plot: Sepal Length vs. Petal Length")
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Petal Length (cm)")
ax.legend(title="Species")
plt.show()
