# Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Initialize notebook mode
init_notebook_mode(connected=True)


# Read the Iris.csv file
df = pd.read_csv('C:/Users/Win10/Desktop/Iris.csv')

# Print the first few lines of the file
print(df.head())


   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [2]:
pip install plotly




In [3]:
import warnings
warnings.filterwarnings('ignore')


In [4]:
df.rename(columns={
    'SepalLengthCm': 'Sepal_Length',
    'SepalWidthCm': 'Sepal_Width',
    'PetalLengthCm': 'Petal_Length',
    'PetalWidthCm': 'Petal_Width',
    'Species': 'Species'
}, inplace=True)

In [5]:
df

Unnamed: 0,Id,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
df.shape

(150, 6)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            150 non-null    int64  
 1   Sepal_Length  150 non-null    float64
 2   Sepal_Width   150 non-null    float64
 3   Petal_Length  150 non-null    float64
 4   Petal_Width   150 non-null    float64
 5   Species       150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


# CLEANING THE DATA

In [8]:
print("\nMissing values in each column:")
print(df.isnull().sum())



Missing values in each column:
Id              0
Sepal_Length    0
Sepal_Width     0
Petal_Length    0
Petal_Width     0
Species         0
dtype: int64


In [9]:
print("\nNumber of duplicate rows:")
print(df.duplicated().sum())


Number of duplicate rows:
0


In [10]:
print("\nSummary statistics:")
print(df.describe())


Summary statistics:
               Id  Sepal_Length  Sepal_Width  Petal_Length  Petal_Width
count  150.000000    150.000000   150.000000    150.000000   150.000000
mean    75.500000      5.843333     3.054000      3.758667     1.198667
std     43.445368      0.828066     0.433594      1.764420     0.763161
min      1.000000      4.300000     2.000000      1.000000     0.100000
25%     38.250000      5.100000     2.800000      1.600000     0.300000
50%     75.500000      5.800000     3.000000      4.350000     1.300000
75%    112.750000      6.400000     3.300000      5.100000     1.800000
max    150.000000      7.900000     4.400000      6.900000     2.500000


# Create Histograms

In [16]:
trace1 = go.Histogram(x=df['Sepal_Length'], nbinsx=20, name='Sepal Length', marker=dict(color='skyblue'))
trace2 = go.Histogram(x=df['Sepal_Width'], nbinsx=20, name='Sepal Width', marker=dict(color='lightgreen'))
trace3 = go.Histogram(x=df['Petal_Length'], nbinsx=20, name='Petal Length', marker=dict(color='lightcoral'))
trace4 = go.Histogram(x=df['Petal_Width'], nbinsx=20, name='Petal Width', marker=dict(color='lightseagreen'))

fig_histograms = go.Figure(data=[trace1, trace2, trace3, trace4])

fig_histograms.update_layout(
    title='Histograms of Iris Dataset Features',
    barmode='overlay',
    xaxis_title='Value',
    yaxis_title='Frequency',
    bargap=0.2,
    bargroupgap=0.1
)

# Display Histograms using iplot
iplot(fig_histograms)

# Histogram of Sepal Length

In [18]:
# Calculate frequency of each species
species_counts = df['d'].value_counts()

trace_bar = go.Bar(
    x=species_counts.index,
    y=species_counts.values,
    marker=dict(color='skyblue'),
    text=species_counts.values,
    textposition='auto'
)

layout_bar = go.Layout(
    title='Frequency of Each Species',
    xaxis=dict(title='Species'),
    yaxis=dict(title='Frequency')
)

fig_bar = go.Figure(data=[trace_bar], layout=layout_bar)

# Display Bar Graph using iplot
iplot(fig_bar)

KeyError: 'd'

# Histogram of Sepal Width

In [None]:

trace = go.Histogram(
    x=df['SepalWidthCm'],
    nbinsx=20,
    name='Sepal Width',
    marker=dict(color='lightgreen')
)

layout = go.Layout(
    title='Histogram of Sepal Width',
    xaxis=dict(title='Sepal Width (cm)'),
    yaxis=dict(title='Frequency')
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# Bar Graph of Species Count

In [17]:
species_counts = df['Species'].value_counts()

trace_bar = go.Bar(
    x=species_counts.index,
    y=species_counts.values,
    marker=dict(color=['skyblue']),
    text=species_counts.values,
    textposition='auto'
)

layout_bar = go.Layout(
    title='Count of Each Species',
    xaxis=dict(title='Species'),
    yaxis=dict(title='Count')
)

fig_bar = go.Figure(data=[trace_bar], layout=layout_bar)

# Display Bar Graph using iplot
iplot(fig_bar)

# Calculate mean Sepal Length by Species

In [None]:

mean_sepal_length = df.groupby('Species')['SepalLengthCm'].mean()

trace = go.Bar(
    x=mean_sepal_length.index,
    y=mean_sepal_length.values,
    marker=dict(color='lightcoral'),
    text=mean_sepal_length.values.round(2),
    textposition='auto'
)

layout = go.Layout(
    title='Mean Sepal Length by Species',
    xaxis=dict(title='Species'),
    yaxis=dict(title='Mean Sepal Length (cm)')
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# Calculate mean Sepal Width by Species

In [None]:

mean_sepal_width = df.groupby('Species')['SepalWidthCm'].mean()

trace = go.Bar(
    x=mean_sepal_width.index,
    y=mean_sepal_width.values,
    marker=dict(color='lightseagreen'),
    text=mean_sepal_width.values.round(2),
    textposition='auto'
)

layout = go.Layout(
    title='Mean Sepal Width by Species',
    xaxis=dict(title='Species'),
    yaxis=dict(title='Mean Sepal Width (cm)')
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## Observations Based on Visualizations

### Histogram Observations

1. **Sepal Length Histogram**:
   - The sepal length of the samples in the dataset generally falls between 4.5 cm and 5.1 cm.
   - Most samples have a sepal length close to 5.0 cm, indicating a slight skew towards the higher end.

2. **Sepal Width Histogram**:
   - The sepal width of the samples ranges from approximately 3.0 cm to 3.6 cm.
   - There is a peak around 3.5 cm, suggesting that most samples have a relatively wider sepal width.

### Bar Graph Observations

1. **Species Count**:
   - All samples in the provided dataset are of the species "Iris-setosa."
   - This observation suggests that the provided subset of the dataset is not diverse in terms of species variety.

2. **Mean Sepal Length by Species**:
   - For "Iris-setosa," the mean sepal length is around 4.86 cm.
   - This indicates a consistency in the sepal length measurements for this species.

3. **Mean Sepal Width by Species**:
   - For "Iris-setosa," the mean sepal width is around 3.28 cm.
   - This suggests that the sepal width is also quite consistent for this species.

### Summary

The provided dataset subset mainly consists of "Iris-setosa" species. The sepal length and width measurements are fairly consistent, with a slight skew towards higher sepal lengths. The histograms provide a good visual indication of the distribution of these measurements, while the bar graphs summarize the mean values, highlighting the uniformity within this subset of the dataset. For a more comprehensive analysis, it would be beneficial to include samples from other species as well.
