In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Task 1: Load and Explore the Dataset
# Read the Iris CSV from the seaborn-data repository into a DataFrame.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
df = pd.read_csv(url)

# Preview the leading rows to confirm the columns parsed as expected.
print("First 5 rows of the dataset:")
print(df.head())

# Report the dtype and the null count of every column; both reports share
# the same "heading, then table" layout, so they are emitted from one loop.
for heading, table in (
    ("\nData Types:", df.dtypes),
    ("\nMissing Values:", df.isnull().sum()),
):
    print(heading)
    print(table)

# No missing-value handling follows: the null counts printed above are
# expected to be zero for every column of this dataset.
# Finish the overview with the (rows, columns) shape.
print("\nDataset Shape:", df.shape)

# Task 2: Basic Data Analysis
# Summary statistics for the numeric columns.
print("\nBasic Statistics:")
print(df.describe())

# Mean of every numeric measurement within each species.
# numeric_only=True keeps the aggregation valid even if a non-numeric column
# is present in df besides the grouping key.
species_means = df.groupby('species').mean(numeric_only=True)
print("\nMean values by Species:")
print(species_means)

# Observations are derived from species_means instead of being hard-coded,
# so the printed statements always agree with the table above. A blanket
# claim such as "one species is largest on every measurement" is not
# guaranteed to hold for each column, so every measurement is reported
# individually.
lowest_species = species_means.idxmin()    # species with the smallest mean, per column
highest_species = species_means.idxmax()   # species with the largest mean, per column
print("\nObservations:")
for number, measurement in enumerate(species_means.columns, start=1):
    print(
        f"{number}. {measurement}: lowest mean for {lowest_species[measurement]}, "
        f"highest mean for {highest_species[measurement]}."
    )

# Task 3: Data Visualization
# Set style for better visuals.
# Matplotlib 3.6 deprecated the bundled 'seaborn' style sheet and renamed it
# to 'seaborn-v0_8'; the old name was later removed. Use the new name when
# this installation provides it and fall back to the old one otherwise, so
# the cell runs without a deprecation warning or an error on either side.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(style_name)

# 1. Line chart: Mean measurements per species
# One line per measurement, with the species names along the x-axis.
fig, ax = plt.subplots(figsize=(10, 6))
# Iterate over the columns of species_means itself rather than a positional
# slice of df.columns: the slice only works while the grouping column is the
# last column of df, whereas these are exactly the columns being plotted.
for column in species_means.columns:
    ax.plot(species_means.index, species_means[column], marker='o', label=column)
ax.set_title('Mean Measurements by Iris Species')
ax.set_xlabel('Species')
ax.set_ylabel('Measurement (cm)')
ax.legend()
ax.grid(True)
fig.savefig('line_chart.png')
# The figure is only needed on disk; close it so it is not kept in memory.
plt.close(fig)

# 2. Bar chart: Average petal length per species
# seaborn aggregates the rows of each species into a single bar (the mean is
# barplot's default estimator); the chart is drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='species', y='petal_length', ax=ax)
ax.set_title('Average Petal Length by Species')
ax.set_xlabel('Species')
ax.set_ylabel('Petal Length (cm)')
# Write the image to disk, then release the figure.
fig.savefig('bar_chart.png')
plt.close(fig)

# 3. Histogram: Sepal length distribution
# All rows are pooled (no split by species) and binned into 20 bars.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['sepal_length'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Sepal Length')
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Frequency')
# Write the image to disk, then release the figure.
fig.savefig('histogram.png')
plt.close(fig)

# 4. Scatter plot: Sepal length vs Petal length
# Each point is one row of df; species is mapped to both marker colour (hue)
# and marker size.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='sepal_length',
    y='petal_length',
    hue='species',
    size='species',
    ax=ax,
)
ax.set_title('Sepal Length vs Petal Length')
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Petal Length (cm)')
# Rebuild the legend so that it carries an explicit title.
ax.legend(title='Species')
fig.savefig('scatter_plot.png')
plt.close(fig)

# Tell the reader which image files the plotting steps wrote.
print("\nVisualizations saved as: line_chart.png, bar_chart.png, histogram.png, scatter_plot.png")

Matplotlib is building the font cache; this may take a moment.


First 5 rows of the dataset:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Data Types:
sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

Missing Values:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Dataset Shape: (150, 5)

Basic Statistics:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000    

  plt.style.use('seaborn')



Visualizations saved as: line_chart.png, bar_chart.png, histogram.png, scatter_plot.png
