<a href="https://colab.research.google.com/github/Arunprasad05/Arunprasad05/blob/main/Data_Analysis_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install pandas openpyxl matplotlib seaborn scikit-learn gradio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import gradio as gr
import io



Now, let's create a function to handle file uploads and load the data into a pandas DataFrame. We'll include basic support for CSV and Excel files.

In [3]:
def load_data(file):
    """
    Loads an uploaded CSV or Excel file into a pandas DataFrame.

    Args:
        file: The upload handed over by Gradio. Either an object exposing the
            on-disk path as ``.name`` or the path itself as a string.

    Returns:
        A pandas DataFrame containing the data, or None if nothing was uploaded,
        the file format is not supported, or reading the file fails.
    """
    if file is None:
        return None

    # Accept both a file-like wrapper (path in `.name`) and a bare path string.
    file_path = file if isinstance(file, str) else getattr(file, "name", None)
    if not isinstance(file_path, str):
        return None

    # Compare case-insensitively so "REPORT.CSV" / "Book.XLSX" are recognised.
    extension_probe = file_path.lower()
    if extension_probe.endswith('.csv'):
        reader = pd.read_csv
    elif extension_probe.endswith(('.xls', '.xlsx')):
        reader = pd.read_excel
    else:
        return None  # Unsupported file format

    try:
        return reader(file_path)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None
        # so the caller can show a friendly status instead of crashing the UI.
        print(f"Error loading file: {e}")
        return None

Let's create a function for basic data cleaning, including handling missing values and duplicates.

In [4]:
def clean_data(df):
    """
    Performs basic data cleaning: fills missing values (median for floating-point
    columns, mode for everything else) and removes duplicate rows.

    The DataFrame is modified in place and also returned; pass a copy if the
    original must be preserved.

    Args:
        df: The input pandas DataFrame, or None.

    Returns:
        The cleaned pandas DataFrame (the same object that was passed in), or
        None if df is None.
    """
    if df is None:
        return None

    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue

        # NumPy integer columns cannot hold NaN, so a numeric column with gaps is
        # a float column. Checking the dtype family (not just 'float64') also
        # covers float32 and nullable Float64. Nullable integer columns stay on
        # the mode path because a fractional median could not be stored in them.
        if pd.api.types.is_float_dtype(column):
            fill_value = column.median()
            if pd.isna(fill_value):
                continue  # Column is entirely missing; nothing to fill with.
        else:
            modes = column.mode()
            if modes.empty:
                continue  # No non-missing value to use as the mode.
            fill_value = modes.iloc[0]

        # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
        # the chained in-place form operates on a temporary under pandas
        # Copy-on-Write and would leave `df` unchanged.
        df[col] = column.fillna(fill_value)

    # Remove duplicate rows
    df.drop_duplicates(inplace=True)

    return df

Next, let's create a function to calculate descriptive statistics.

In [5]:
def get_descriptive_stats(df):
    """
    Summarises every column of the DataFrame with ``describe(include='all')``.

    Args:
        df: The input pandas DataFrame, or None.

    Returns:
        A pandas DataFrame of descriptive statistics covering all columns, or
        None when df is None.
    """
    # `include='all'` asks pandas to summarise every column rather than only
    # the numeric ones it would pick by default.
    return None if df is None else df.describe(include='all')

Now, let's create a function for generating basic plots (histograms, count plots, scatter plots, and box plots).

In [6]:
def generate_plots(df):
    """
    Generates basic plots: histograms for numeric columns, count plots for
    object-dtype columns, scatter plots for every pair of numeric columns, and
    box plots for numeric columns, in that order.

    Each figure is closed in pyplot's figure registry once it has been drawn,
    so repeated calls do not accumulate open figures. The returned Figure
    objects themselves are kept and handed back to the caller.

    Args:
        df: The input pandas DataFrame, or None.

    Returns:
        A list of matplotlib figures (empty if df is None).
    """
    if df is None:
        return []

    plots = []
    numeric_cols = df.select_dtypes(include=np.number).columns
    categorical_cols = df.select_dtypes(include='object').columns

    def _finish(fig, ax, title):
        # Title the axes, release the figure from pyplot's global registry
        # (otherwise every call would leave it open), and collect it.
        ax.set_title(title)
        plt.close(fig)
        plots.append(fig)

    # Histograms for numeric columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        sns.histplot(df[col], kde=True, ax=ax)
        _finish(fig, ax, f'Histogram of {col}')

    # Count plots for categorical columns, most frequent value first
    for col in categorical_cols:
        fig, ax = plt.subplots()
        sns.countplot(y=df[col], order=df[col].value_counts().index, ax=ax)
        _finish(fig, ax, f'Count Plot of {col}')

    # Scatter plots for each unordered pair of numeric columns
    for i, col1 in enumerate(numeric_cols):
        for col2 in numeric_cols[i + 1:]:
            fig, ax = plt.subplots()
            sns.scatterplot(x=df[col1], y=df[col2], ax=ax)
            _finish(fig, ax, f'Scatter Plot of {col1} vs {col2}')

    # Box plots for numeric columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        sns.boxplot(y=df[col], ax=ax)
        _finish(fig, ax, f'Box Plot of {col}')

    return plots

Finally, let's create a function for simple correlation analysis.

In [7]:
def analyze_correlation(df):
    """
    Calculates and visualizes the correlation matrix for numeric columns.

    Args:
        df: The input pandas DataFrame, or None.

    Returns:
        A tuple ``(correlation_matrix, heatmap_figure)``:
          * ``(None, None)`` when df is None;
          * ``(message, None)`` where message is a string when the numeric
            selection is empty -- callers must not treat the first element as
            a DataFrame in that case;
          * otherwise the correlation matrix (pandas DataFrame) and the
            heatmap figure.
    """
    if df is None:
        return None, None

    numeric_df = df.select_dtypes(include=np.number)
    if numeric_df.empty:
        return "No numeric columns for correlation analysis.", None

    correlation_matrix = numeric_df.corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title('Correlation Matrix')
    # Release the figure from pyplot's global registry so repeated analyses do
    # not accumulate open figures; the Figure object is still returned.
    plt.close(fig)

    return correlation_matrix, fig

Let's create the Gradio interface. We'll include a file-upload component, a single button that runs every analysis task, and tabs that display each set of results.

In [15]:
_MAX_PLOTS = 10  # Number of plot slots in the tuple returned by analyze_data.


def _pack_outputs(status, cleaned_df=None, cleaned_data_info=None,
                  descriptive_stats=None, plots=(), correlation_matrix=None,
                  correlation_heatmap=None):
    """Builds the fixed-length result tuple, padding/truncating plots to _MAX_PLOTS."""
    shown = list(plots)[:_MAX_PLOTS]
    shown.extend([None] * (_MAX_PLOTS - len(shown)))
    return (status, cleaned_df, cleaned_data_info, descriptive_stats,
            *shown, correlation_matrix, correlation_heatmap)


def analyze_data(file):
    """
    Loads, cleans, and performs all analysis tasks on the uploaded file.

    Every return path yields a tuple of the same length (4 + _MAX_PLOTS + 2
    values), so success and failure both supply one value per output slot.
    When a later stage fails, the results of the earlier stages are still
    returned alongside the error status.

    Args:
        file: The uploaded file object from Gradio.

    Returns:
        A tuple containing, in order: a status message, the cleaned DataFrame,
        a one-line description of its shape, the descriptive statistics,
        exactly _MAX_PLOTS plot figures (padded with None, extra plots are
        dropped), the correlation matrix, and the correlation heatmap.
    """
    df = load_data(file)
    if df is None:
        return _pack_outputs("Error loading data or unsupported file format.")

    try:
        cleaned_df = clean_data(df.copy())  # Copy so the loaded frame is left untouched
        cleaned_data_info = f"Shape of cleaned data: {cleaned_df.shape[0]} rows, {cleaned_df.shape[1]} columns"
    except Exception as e:
        return _pack_outputs(f"Error during data cleaning: {e}")

    try:
        descriptive_stats = get_descriptive_stats(cleaned_df)
        # Convert index to a column for clearer display in Gradio
        if isinstance(descriptive_stats, pd.DataFrame):
            descriptive_stats = descriptive_stats.reset_index().rename(columns={'index': 'Statistic'})
    except Exception as e:
        return _pack_outputs(f"Error getting descriptive statistics: {e}", cleaned_df, cleaned_data_info)

    try:
        plots = generate_plots(cleaned_df)
    except Exception as e:
        return _pack_outputs(f"Error generating plots: {e}", cleaned_df, cleaned_data_info, descriptive_stats)

    try:
        correlation_matrix, correlation_heatmap = analyze_correlation(cleaned_df)
    except Exception as e:
        return _pack_outputs(f"Error analyzing correlation: {e}", cleaned_df, cleaned_data_info,
                             descriptive_stats, plots)

    status = "Analysis Complete"
    if isinstance(correlation_matrix, pd.DataFrame):
        # Same treatment as the statistics table: keep the row labels visible
        # by turning the index into a regular column.
        correlation_matrix = correlation_matrix.reset_index().rename(columns={'index': 'Variable'})
    elif isinstance(correlation_matrix, str):
        # analyze_correlation reports "nothing to correlate" as a message string;
        # that is not tabular data, so surface it in the status instead.
        status = f"{status}. {correlation_matrix}"
        correlation_matrix = None

    return _pack_outputs(status, cleaned_df, cleaned_data_info, descriptive_stats,
                         plots, correlation_matrix, correlation_heatmap)

# Create the Gradio interface: one upload widget, one button, a status box, and
# a set of tabs that receive the results of analyze_data.
with gr.Blocks() as demo:
    gr.Markdown("## Interactive Data Analysis Agent")
    file_input = gr.File(label="Upload your data file (CSV or Excel)")
    analyze_button = gr.Button("Analyze Data")
    status_output = gr.Textbox(label="Status")


    with gr.Tabs():
        with gr.TabItem("Cleaned Data"):
            cleaned_data_info_output = gr.Textbox(label="Cleaned Data Information")
            cleaned_data_output = gr.DataFrame()
        with gr.TabItem("Descriptive Statistics"):
            descriptive_stats_output = gr.DataFrame()
        with gr.TabItem("Plots"):
            plot_outputs = [gr.Plot() for _ in range(10)] # Fixed pool of 10 plot slots; analyze_data pads/truncates its plots to this count
        with gr.TabItem("Correlation Analysis"):
            correlation_matrix_output = gr.DataFrame()
            correlation_heatmap_output = gr.Plot()

    # The order of `outputs` must mirror the order of the tuple returned by
    # analyze_data: status, cleaned frame, shape text, statistics, the 10 plot
    # slots, correlation matrix, correlation heatmap.
    analyze_button.click(
        analyze_data,
        inputs=file_input,
        outputs=[status_output, cleaned_data_output, cleaned_data_info_output, descriptive_stats_output, *plot_outputs, correlation_matrix_output, correlation_heatmap_output]
    )

# Start the interface. Comment this line out to define the app without launching it.
demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://651a50f6a17c9e6d80.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




**Note:** The Gradio interface starts when the code cell above is executed, because it ends with a call to `demo.launch()`. Comment that line out if you only want to define the app without launching it.