In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-v0_8")  # For better visuals

# -----------------------------
# Task 1: Load & Explore Dataset
# -----------------------------
# Load the user's CSV; if the file does not exist, fall back to the Iris
# dataset from scikit-learn so the rest of the script still has data.
try:
    df = pd.read_csv("dataset.csv")  # Replace with your own CSV file name
except FileNotFoundError:
    # Imported lazily: scikit-learn is only needed on the fallback path.
    from sklearn.datasets import load_iris

    iris = load_iris(as_frame=True)
    df = iris.frame
    print("⚠️ dataset.csv not found. Loaded Iris dataset instead.")
else:
    # Reported only after read_csv succeeded; kept out of the try body so the
    # except clause guards nothing but the file read itself.
    print("✅ Dataset successfully loaded!")

# Display first few rows
print("\n🔎 First 5 rows of the dataset:")
print(df.head())

# Explore dataset structure
print("\n📋 Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() (that would append a stray "None" line).
df.info()

# Count missing values once and reuse the result for the report and the cleanup.
missing_per_column = df.isnull().sum()
print("\n📊 Missing Values:")
print(missing_per_column)

# Handle missing values (drop every row that contains at least one)
if missing_per_column.sum() > 0:
    df = df.dropna()
    print("✅ Missing values dropped.")
else:
    print("✅ No missing values found.")

# -----------------------------
# Task 2: Basic Data Analysis
# -----------------------------
# Summary statistics of the cleaned DataFrame, as produced by describe().
print("\n📈 Descriptive Statistics:")
print(df.describe())

# Choose the column to group by: "target" takes precedence over "species".
# None means neither column exists, in which case grouped analysis is skipped.
group_col = next(
    (candidate for candidate in ("target", "species") if candidate in df.columns),
    None,
)

if group_col is not None:
    group_label = group_col.capitalize()
    print(f"\n📊 Mean of Numerical Columns by {group_label}:")
    # numeric_only=True restricts the per-group mean to numeric columns.
    group_means = df.groupby(group_col).mean(numeric_only=True)
    print(group_means)

# -----------------------------
# Task 3: Data Visualization
# -----------------------------

# 1️⃣ Line Chart
# Plot the first *numeric* column against the row index. Taking df.columns[0]
# directly would contradict the title, and fail to plot, whenever the first
# column of a user-supplied CSV holds text (e.g. an ID or label column).
line_candidates = df.select_dtypes("number").columns
if len(line_candidates) > 0:
    line_col = line_candidates[0]
    plt.figure(figsize=(8, 4))
    df[line_col].plot(kind="line")
    plt.title("Line Chart of First Numerical Column")
    plt.xlabel("Index")
    plt.ylabel(line_col)
    plt.show()
else:
    # Nothing numeric to draw; skip the chart instead of raising.
    print("⚠️ No numerical columns found. Skipping line chart.")

# 2️⃣ Bar Chart
# Average of one numeric column per category of group_col.
if group_col:
    # Candidate value columns: every numeric column except the grouping column
    # itself. Averaging the grouping key within its own groups is meaningless.
    # errors="ignore" covers the case where group_col is not numeric and
    # therefore not among the candidates at all.
    bar_candidates = df.select_dtypes("number").columns.drop(
        group_col, errors="ignore"
    )
    if len(bar_candidates) > 0:
        bar_col = bar_candidates[0]
        plt.figure(figsize=(6, 4))
        df.groupby(group_col)[bar_col].mean().plot(kind="bar")
        plt.title(f"Average {bar_col} by {group_col.capitalize()}")
        plt.xlabel(group_col.capitalize())
        plt.ylabel(f"Average {bar_col}")
        plt.show()
    else:
        # No numeric column left to average; skip the chart instead of raising.
        print("⚠️ No numerical columns found. Skipping bar chart.")

# 3️⃣ Histogram
# Distribution of the first numeric column, split into 20 bins.
plt.figure(figsize=(6, 4))
hist_col = df.select_dtypes("number").columns[0]
hist_values = df[hist_col]
plt.hist(hist_values, bins=20)
plt.title(f"Histogram of {hist_col}")
plt.xlabel(hist_col)
plt.ylabel("Frequency")
plt.show()

# 4️⃣ Scatter Plot
# Requires two numeric columns: the first goes on the x-axis, the second on
# the y-axis. With fewer than two, the plot is skipped and x_col / y_col are
# left unassigned.
scatter_candidates = df.select_dtypes("number").columns
if len(scatter_candidates) >= 2:
    plt.figure(figsize=(6, 4))
    x_col, y_col = scatter_candidates[0], scatter_candidates[1]
    plt.scatter(df[x_col], df[y_col])
    plt.title(f"Scatter Plot of {x_col} vs {y_col}")
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

# -----------------------------
# Findings / Observations
# -----------------------------
# Assemble the summary from what the data supports. The numeric columns are
# re-derived here rather than read from x_col / y_col, because those names are
# only assigned when at least two numeric columns exist.
summary_cols = df.select_dtypes("number").columns

findings = [
    f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns after cleaning."
]

# Only mention the grouped analysis when a grouping column was actually found.
if group_col:
    findings.append(
        "Grouped analysis shows differences in numerical means across categories."
    )

if len(summary_cols) >= 1:
    hist_name = summary_cols[0]
    # Compare the magnitude of the skew: a strong left (negative) skew is just
    # as far from a normal shape as a strong right (positive) skew.
    distribution = "roughly normal" if abs(df[hist_name].skew()) < 1 else "skewed"
    findings.append(f"Histogram shows that {hist_name} is {distribution} distributed.")

if len(summary_cols) >= 2:
    first_name, second_name = summary_cols[0], summary_cols[1]
    # Correlate exactly the two plotted columns. Calling corr() on the whole
    # frame raises in pandas 2.x when a non-numeric column is present, and its
    # [0, 1] cell is not guaranteed to belong to these two columns.
    correlation = df[first_name].corr(df[second_name])
    if correlation > 0:
        findings.append(
            f"Scatter plot shows a positive relationship between {first_name} and {second_name}."
        )
    elif correlation < 0:
        findings.append(
            f"Scatter plot shows a negative relationship between {first_name} and {second_name}."
        )
    else:
        # Zero or NaN correlation (e.g. a constant column): claim no direction.
        findings.append(
            f"Scatter plot shows no clear linear relationship between {first_name} and {second_name}."
        )

print("\n📝 Findings & Observations:")
for number, finding in enumerate(findings, start=1):
    print(f"{number}. {finding}")
print()

