In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the Dataset
**Loading the Iris Dataset**

In [None]:
df=pd.read_csv("/kaggle/input/iris-flower-dataset/IRIS.csv")

In [None]:
df.head()

# EDA - Exploratory Data Analysis

## Let's check the basic summary of the dataset

In [None]:
def describe_data(df):
    """
    Summarises the DataFrame using pandas' built-in descriptive statistics.

    Parameters:
    df (DataFrame): The DataFrame to summarise.

    Returns:
    DataFrame: The result of ``df.describe()``; the same table is also printed.
    """
    summary_stats = df.describe()
    print("Descriptive statistics of the DataFrame:\n", summary_stats)
    return summary_stats

In [None]:
description=describe_data(df)

## Let's check how many missing values there are in the dataset

In [None]:
def missing_value_percentage(df):
    """
    Computes, for every column, the share of entries that are missing.

    Parameters:
    df (DataFrame): The DataFrame to inspect.

    Returns:
    Series: Percentage (0-100) of missing values, indexed by column name.
    """
    null_mask = df.isna()
    missing_percentage = null_mask.mean().mul(100)
    print(f"Missing value percentage per column:\n{missing_percentage}")
    return missing_percentage

In [None]:
missing_values_percentage=missing_value_percentage(df)

## Let's check the datatypes of each column

In [None]:
def check_data_types(df):
    """
    Reports the dtype of every column in the DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to inspect.

    Returns:
    Series: Column dtypes indexed by column name (also printed).
    """
    column_types = df.dtypes
    print("Data types of the columns:\n", column_types)
    return column_types

In [None]:
dtypes=check_data_types(df)

## Let's check the distribution of the categories in the target_feature `species`

In [None]:
def class_distribution(df, target_column):
    """
    Reports the relative frequency of each class in the target column.

    Parameters:
    df (DataFrame): The DataFrame holding the target column.
    target_column (str): Name of the column whose classes are counted.

    Returns:
    Series: Fraction of rows per class (normalized value counts), also printed.
    """
    target_values = df[target_column]
    class_shares = target_values.value_counts(normalize=True)
    print(f"Class distribution for {target_column}:\n", class_shares)
    return class_shares

In [None]:
distribution=class_distribution(df, "species")

## Let's see if the dataset has any outliers or not

In [None]:
def find_outliers_with_iqr(df, column):
    """
    Finds the rows whose value in ``column`` lies outside the IQR fences.

    A value is treated as an outlier when it is below Q1 - 1.5 * IQR or
    above Q3 + 1.5 * IQR.

    Parameters:
    df (DataFrame): The DataFrame to scan.
    column (str): The column to check for outliers.

    Returns:
    DataFrame: The rows of ``df`` flagged as outliers (also printed).
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    # Explicit comparisons (not `between`) so NaN values are never flagged.
    outlier_mask = (values < lower_fence) | (values > upper_fence)
    flagged_rows = df[outlier_mask]
    print(f"Outliers in {column} identified using IQR method:\n", flagged_rows)
    return flagged_rows

In [None]:
columns=["sepal_length", "sepal_width",  "petal_length",  "petal_width"]

In [None]:
for col in columns:
    find_outliers_with_iqr(df, col)

### Delete the rows with outliers

In [None]:
def handle_outliers(df, column, method='IQR'):
    """
    Removes the rows whose value in ``column`` is an outlier.

    Parameters:
    df (DataFrame): The DataFrame. It is not modified; a filtered frame is returned.
    column (str): The column to check for outliers.
    method (str): The method to handle outliers ('IQR' or 'Z-score').
        'IQR' drops values outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR];
        'Z-score' keeps values whose absolute z-score is below 3.

    Returns:
    DataFrame: DataFrame with the outlier rows removed.

    Raises:
    ValueError: If ``method`` is neither 'IQR' nor 'Z-score'.
    """
    # Validate up front: previously an unknown method fell through both
    # branches and failed with an UnboundLocalError on the return statement.
    if method not in ('IQR', 'Z-score'):
        raise ValueError(
            f"Unsupported outlier method {method!r}; expected 'IQR' or 'Z-score'."
        )

    if method == 'IQR':
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        is_outlier = (df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR))
        df_cleaned = df[~is_outlier]
        print(f"Outliers in {column} handled using IQR method.")
    else:
        # Imported lazily so scipy is only required when this method is used.
        from scipy import stats
        df_cleaned = df[(abs(stats.zscore(df[column])) < 3)]
        print(f"Outliers in {column} handled using Z-score method.")
    return df_cleaned

In [None]:
df_new=handle_outliers(df, "sepal_width", method='IQR')

In [None]:
# Class balance after removing the sepal_width outliers.
distribution_new = class_distribution(df_new, "species")

## Let's check if there are any duplicate rows in the dataset

In [None]:
def detect_duplicates(df):
    """
    Finds rows that repeat an earlier row of the DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to scan.

    Returns:
    DataFrame: The repeated rows (the first occurrence of each is not included).
    """
    is_repeat = df.duplicated()
    duplicate_rows = df.loc[is_repeat]
    print(f"Detected {len(duplicate_rows)} duplicate rows.")
    return duplicate_rows

In [None]:
duplicates=detect_duplicates(df)

In [None]:
duplicates.index

### Let's delete the duplicate rows

In [None]:
# Drop rows whose *values* repeat an earlier row. The previous expression
# de-duplicated index labels, which are already unique, so it removed nothing.
df_new = df_new.drop_duplicates(keep='first')

In [None]:
distribution3=class_distribution(df_new, "species")

## Let's delete the rows with missing values if any

In [None]:
def drop_na(df):
    """
    Returns a copy of the DataFrame without the rows that contain NaN values.

    Parameters:
    df (DataFrame): The source DataFrame (left untouched).

    Returns:
    DataFrame: The rows of ``df`` that have no missing values.
    """
    rows_before = df.shape[0]
    complete_rows = df.dropna()
    rows_after = complete_rows.shape[0]
    print(f"Dropped rows with missing values. Rows before: {rows_before}, Rows after: {rows_after}.")
    return complete_rows

In [None]:
dropped=drop_na(df_new)

## Let's see how different features are correlated with each other

In [None]:
def correlation_matrix(df):
    """
    Computes and prints the correlation matrix of the DataFrame's numeric columns.

    Parameters:
    df (DataFrame): The input DataFrame. Non-numeric columns (e.g. string
        labels) are ignored.

    Returns:
    DataFrame: Correlation matrix of the numeric/boolean columns.
    """
    # pandas >= 2.0 no longer drops non-numeric columns inside corr() and
    # raises on string data, so select the numeric columns explicitly.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    corr_matrix = numeric_df.corr()
    print("Correlation Matrix:\n", corr_matrix)
    return corr_matrix

In [None]:
# Take an explicit copy: a column is assigned to `x` later on, and writing to
# a slice of `df_new` would trigger pandas' SettingWithCopyWarning.
x=df_new[columns].copy()
y=df_new["species"]

In [None]:
matrix=correlation_matrix(x)

In [None]:
def heatmap(df):
    """
    Draws an annotated heatmap of the correlations between the numeric
    columns of the DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to visualize. Non-numeric columns are ignored.

    Returns:
    None
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # pandas >= 2.0 raises in corr() when string columns are present, so
    # restrict the frame to numeric/boolean columns before correlating.
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm')
    plt.title("Heatmap of Correlations")
    plt.show()
    print("Heatmap created.")

In [None]:
heatmap(x)

## Let's explore every feature individually

In [None]:
def univariate_analysis(df, column):
    """
    Prints summary statistics for one column and draws its histogram.

    Parameters:
    df (DataFrame): The DataFrame holding the column.
    column (str): The column to analyse.

    Returns:
    None
    """
    feature = df[column]
    print(f"Univariate Analysis of {column}:")
    print(feature.describe())
    feature.hist()

In [None]:
univariate_analysis(x,columns[0])

In [None]:
univariate_analysis(x,columns[1])

In [None]:
univariate_analysis(x,columns[2])

In [None]:
univariate_analysis(x,columns[3])

## Let's explore the target_variable `species` individually

In [None]:
def plot_categorical(df, column):
    """
    Draws a bar chart of how often each category occurs in a column.

    Parameters:
    df (DataFrame): The DataFrame holding the column.
    column (str): The categorical column to plot.

    Returns:
    None
    """
    category_counts = df[column].value_counts()
    category_counts.plot.bar()
    print(f"Value counts plotted for {column}.")

In [None]:
y=pd.DataFrame(data=y,columns=["species"])

In [None]:
plot_categorical(y, "species")

## Let's check all the features in x

In [None]:
def pairplot(df):
    """
    Draws seaborn's grid of pairwise plots for the DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to plot.

    Returns:
    None
    """
    from seaborn import pairplot as seaborn_pairplot

    seaborn_pairplot(df)
    print("Pair plots for numerical columns created.")

In [None]:
pairplot(x)

# Feature Engineering

## Let's encode the target_variable `species`

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encoding(df, column):
    """
    Replaces a categorical column with integer codes from a LabelEncoder.

    Note that the column is overwritten on the DataFrame that is passed in;
    the returned object is that same DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to modify in place.
    column (str): Name of the categorical column to encode.

    Returns:
    DataFrame: The input DataFrame with ``column`` label-encoded.
    """
    encoded_values = LabelEncoder().fit_transform(df[column])
    df[column] = encoded_values
    print(f"Label encoding applied to column: {column}.")
    return df

In [None]:
encoded_y=label_encoding(y, "species")

In [None]:
y

# Model Training

## Let's train the models on the datasets

In [None]:
def train_classification_models(X_train, y_train, X_test, y_test, random_state=None):
    """
    Trains several classification models and evaluates their accuracy on the test set.

    Parameters:
    X_train (array): Training features.
    y_train (array): Training target.
    X_test (array): Test features.
    y_test (array): Test target.
    random_state (int, optional): Seed forwarded to the randomized ensemble
        models (Random Forest and Gradient Boosting) so that repeated runs
        produce the same scores. Defaults to None, which leaves them unseeded
        exactly as before.

    Returns:
    dict: Dictionary with model names and their accuracy scores.
    """
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    # Only the tree ensembles receive the seed; the other two models are
    # constructed with their defaults, as in the original comparison.
    models = {
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'Gradient Boosting': GradientBoostingClassifier(random_state=random_state),
        'Support Vector Machine': SVC()
    }

    scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        scores[name] = accuracy
        print(f"{name}: Accuracy = {accuracy:.2f}")

    return scores

In [None]:
# NOTE(review): scratch arithmetic (evaluates to 36.5); the result is not used
# by any other cell shown here — presumably a leftover row-count check, confirm
# and remove.
146/4

In [None]:
def split_dataframe(df, test_size=0.2, random_state=None):
    """
    Splits a DataFrame into training and testing sets.

    The last column of ``df`` is treated as the target; every column before
    it is treated as a feature.

    Parameters:
    - df: The DataFrame to split.
    - test_size: Proportion of the dataset to include in the test split (default is 0.2).
    - random_state: Seed for the random number generator (default is None).

    Returns:
    - X_train: Training features.
    - X_test: Testing features.
    - y_train: Training target.
    - y_test: Testing target.
    """
    from sklearn.model_selection import train_test_split

    # Positional slicing: everything but the final column vs. the final column.
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [None]:
x["species"]=y
x.head()

In [None]:
xtrain, xtest, ytrain, ytest=split_dataframe(x,0.3,22)

In [None]:
train_classification_models(xtrain, ytrain, xtest, ytest)

# Results: The Best Model
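The best model is **Random Forest** with accuracy = 0.98 on the 30% hold-out split. Note that the tree-based models are trained without a fixed `random_state`, so this figure can vary slightly between runs.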