In [3]:
%pip install pandas seaborn matplotlib plotly statsmodels jinja2

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
from io import BytesIO
import base64
from jinja2 import Template

In [5]:
def plot_to_base64(plt):
    """Save the figure held by ``plt`` as PNG and return it as base64 text.

    Parameters
    ----------
    plt : object
        Anything exposing ``savefig(file_like, format=...)``; the notebook
        passes the ``matplotlib.pyplot`` module.

    Returns
    -------
    str
        The PNG bytes encoded as base64 and decoded to a UTF-8 string,
        suitable for a ``data:image/png;base64,...`` URI.
    """
    with BytesIO() as buffer:
        plt.savefig(buffer, format='png')
        # Rewind so the whole image is read back from the start.
        buffer.seek(0)
        png_bytes = buffer.read()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode('utf-8')

In [11]:
# Main entry point: builds a styled HTML report with explanatory descriptions.

# Jinja2 template for the report. It is rendered with autoescaping enabled, so
# column names are HTML-escaped; the base64 image payloads contain no
# characters that escaping would alter.
_REPORT_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <title>Data Analysis Report</title>
    <style>
        body {
            padding-top: 20px;
        }
        h1, h2 {
            text-align: center;
            margin-bottom: 30px;
        }
        .section {
            margin-bottom: 50px;
        }
        .img-fluid {
            margin: 20px 0;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1 class="mt-5">Automated Data Analysis Report</h1>
        <p class="text-center">This report provides a detailed visual and statistical analysis of the given dataset, using various techniques like correlation heatmaps, histograms, scatter plots, and linear regressions.</p>

        <div class="section">
            <h2>1. Correlation Heatmap</h2>
            <p>This heatmap shows the correlation between numerical columns in the dataset. A value close to 1 indicates a strong positive correlation, while a value close to -1 indicates a strong negative correlation. A value close to 0 means no correlation.</p>
            {% if corr_img %}
            <img src="data:image/png;base64,{{ corr_img }}" class="img-fluid" alt="Correlation Heatmap">
            {% else %}
            <p>No numerical data available for correlation analysis.</p>
            {% endif %}
        </div>

        <div class="section">
            <h2>2. Descriptive Statistics</h2>
            <p>The table below summarizes the basic statistics for each numerical column, including the count, mean, standard deviation, and the minimum and maximum values.</p>
            <table class="table table-bordered">
                <thead>
                    <tr>
                        <th>Column</th>
                        <th>Count</th>
                        <th>Mean</th>
                        <th>Std</th>
                        <th>Min</th>
                        <th>25%</th>
                        <th>50%</th>
                        <th>75%</th>
                        <th>Max</th>
                    </tr>
                </thead>
                <tbody>
                    {% for col, row in stats.iterrows() %}
                    <tr>
                        <td>{{ col }}</td>
                        <td>{{ row['count'] }}</td>
                        <td>{{ row['mean'] }}</td>
                        <td>{{ row['std'] }}</td>
                        <td>{{ row['min'] }}</td>
                        <td>{{ row['25%'] }}</td>
                        <td>{{ row['50%'] }}</td>
                        <td>{{ row['75%'] }}</td>
                        <td>{{ row['max'] }}</td>
                    </tr>
                    {% else %}
                    <tr>
                        <td colspan="9">No numerical columns available.</td>
                    </tr>
                    {% endfor %}
                </tbody>
            </table>
        </div>

        <div class="section">
            <h2>3. Histograms</h2>
            <p>Histograms provide an overview of the distribution of values for each numerical column. The density curve helps visualize the probability distribution of the data.</p>
            {% for col, img in histograms.items() %}
            <h3>{{ col }}</h3>
            <img src="data:image/png;base64,{{ img }}" class="img-fluid" alt="Histogram for {{ col }}">
            {% endfor %}
        </div>

        <div class="section">
            <h2>4. Scatter Plots</h2>
            <p>Scatter plots show the relationship between two numerical variables. A pattern can help identify correlations, trends, or outliers.</p>
            {% for title, img in scatter_plots.items() %}
            <h3>{{ title }}</h3>
            <img src="data:image/png;base64,{{ img }}" class="img-fluid" alt="{{ title }}">
            {% endfor %}
        </div>

        <div class="section">
            <h2>5. Regression Analysis</h2>
            <p>Linear regression attempts to model the relationship between two numerical variables by fitting a line through the data points. The red line in the graphs represents the linear fit.</p>
            {% for title, img in regression_plots.items() %}
            <h3>{{ title }}</h3>
            <img src="data:image/png;base64,{{ img }}" class="img-fluid" alt="Regression plot for {{ title }}">
            {% endfor %}
        </div>

    </div>
</body>
</html>
"""


def _render_figure(figsize, title, draw):
    """Open a figure, run ``draw()``, set ``title`` and return the PNG as base64.

    The figure is closed in a ``finally`` clause so that a failure while
    plotting or saving does not leave figures open.
    """
    fig = plt.figure(figsize=figsize)
    try:
        draw()
        plt.title(title)
        return plot_to_base64(plt)
    finally:
        plt.close(fig)


def analyze_dataframe(df, output_file="report.html"):
    """Generate a self-contained HTML analysis report for ``df``.

    The report contains a correlation heatmap, a descriptive-statistics table,
    one histogram per numerical column, a scatter plot for every unordered
    pair of numerical columns and a regression plot for every ordered pair.
    All images are embedded as base64 PNG data URIs.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to analyze. Only columns with a numeric dtype are used.
    output_file : str, default "report.html"
        Path of the HTML file to write (UTF-8 encoded).

    Returns
    -------
    None
        The report is written to ``output_file`` and a confirmation line is
        printed.

    Notes
    -----
    Calls ``sns.set_theme``, which changes the global seaborn/matplotlib style.
    """
    # Initial seaborn configuration (``sns.set`` is an alias of ``set_theme``).
    sns.set_theme(style="whitegrid")

    # Use every numeric dtype (not only float64/int64) so that the plots and
    # the statistics table always cover the same set of columns.
    numerical_df = df.select_dtypes(include="number")
    numerical_columns = list(numerical_df.columns)

    # Descriptive statistics. ``describe`` cannot summarize a frame without
    # columns, so fall back to an empty table in that case.
    if numerical_columns:
        stats = numerical_df.describe().T
    else:
        stats = pd.DataFrame()

    # Correlation heatmap (skipped when there is no numerical data).
    if not numerical_df.empty:
        corr = numerical_df.corr()
        corr_img = _render_figure(
            (10, 6),
            'Correlation Heatmap',
            lambda: sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5),
        )
    else:
        corr_img = None

    # One histogram (with density curve) per numerical column.
    histograms = {}
    for column in numerical_columns:
        histograms[column] = _render_figure(
            (6, 4),
            f'Histogram of {column}',
            lambda: sns.histplot(df[column], kde=True),
        )

    # Scatter plots for each unordered pair of numerical columns.
    scatter_plots = {}
    for i, col1 in enumerate(numerical_columns):
        for col2 in numerical_columns[i + 1:]:
            scatter_plots[f'{col1}_vs_{col2}'] = _render_figure(
                (6, 4),
                f'Scatter plot between {col1} and {col2}',
                lambda: sns.scatterplot(x=col1, y=col2, data=df),
            )

    # Regression plots for each ordered pair (col1 as x, col2 as y).
    regression_plots = {}
    for col1 in numerical_columns:
        for col2 in numerical_columns:
            if col1 == col2:
                continue

            # Drop rows where either column is missing.
            subset_df = df[[col1, col2]].dropna()

            # A line cannot be fitted through fewer than two points.
            if len(subset_df) < 2:
                continue

            regression_plots[f'{col1}_reg_{col2}'] = _render_figure(
                (6, 4),
                f'Regression: {col1} vs {col2}',
                lambda: sns.regplot(
                    x=col1, y=col2, data=subset_df, line_kws={'color': 'red'}
                ),
            )

    # Render the HTML; autoescape protects the markup from special characters
    # in column names.
    html_content = Template(_REPORT_TEMPLATE, autoescape=True).render(
        corr_img=corr_img,
        stats=stats,
        histograms=histograms,
        scatter_plots=scatter_plots,
        regression_plots=regression_plots,
    )

    # Save the report; the explicit encoding matches the <meta charset> tag.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"Report saved to {output_file}")


In [13]:
# Load the Kaggle dataset from a path relative to the working directory
dataset_path = 'apple_quality.csv'
df = pd.read_csv(dataset_path)

# Run the analysis and write the HTML report
report_path = "apple_quality_report.html"
analyze_dataframe(df, output_file=report_path)


Report saved to apple_quality_report.html
