In [1]:
%pip install pandas seaborn matplotlib plotly statsmodels jinja2

Defaulting to user installation because normal site-packages is not writeable
Collecting plotly
  Downloading plotly-5.24.1-py3-none-any.whl.metadata (7.3 kB)
Collecting statsmodels
  Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting jinja2
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting MarkupSafe>=2.0 (from jinja2)
  Downloading MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
   ---------------------------------------- 0.0/19.1 MB ? eta -:--:--
   ---- ----------------------------------- 2.4/19.1 MB 12.2 MB/s eta 0:00:02
   --------------------------- ------------ 13.1/19.1 MB 32.9 MB/s eta 0:00:01
   ---------------------------------------- 19.1/19.

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
from io import BytesIO
import base64
from jinja2 import Template

In [8]:
def plot_to_base64(plt):
    """Save a plot as PNG into memory and return it as base64 text.

    Args:
        plt: Object exposing ``savefig(file_like, format=...)``; the callers
            in this notebook pass ``matplotlib.pyplot``.

    Returns:
        str: The PNG bytes written by ``savefig``, base64-encoded and decoded
        as UTF-8 so the result can be embedded in a ``data:`` URI.
    """
    buffer = BytesIO()
    plt.savefig(buffer, format='png')
    # getvalue() returns the whole buffer regardless of the stream position.
    png_bytes = buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode('utf-8')

In [18]:
# HTML template for the report. It is rendered with autoescaping enabled so
# that column names coming from the data cannot inject markup into the page.
_REPORT_TEMPLATE = """
    <html>
    <head>
        <meta charset="utf-8">
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
        <title>Data Analysis Report</title>
    </head>
    <body>
        <div class="container">
            <h1 class="mt-5">Data Analysis Report</h1>
            
            <h2 class="mt-4">Correlation Heatmap</h2>
            {% if corr_img %}
            <img src="data:image/png;base64,{{ corr_img }}" class="img-fluid">
            {% else %}
            <p>No numerical data available for correlation.</p>
            {% endif %}
            
            <h2 class="mt-4">Descriptive Statistics</h2>
            <table class="table table-bordered">
                <thead>
                    <tr>
                        <th>Column</th>
                        <th>Count</th>
                        <th>Mean</th>
                        <th>Std</th>
                        <th>Min</th>
                        <th>25%</th>
                        <th>50%</th>
                        <th>75%</th>
                        <th>Max</th>
                    </tr>
                </thead>
                <tbody>
                    {% for col, row in stats_rows %}
                    <tr>
                        <td>{{ col }}</td>
                        <td>{{ row['count'] }}</td>
                        <td>{{ row['mean'] }}</td>
                        <td>{{ row['std'] }}</td>
                        <td>{{ row['min'] }}</td>
                        <td>{{ row['25%'] }}</td>
                        <td>{{ row['50%'] }}</td>
                        <td>{{ row['75%'] }}</td>
                        <td>{{ row['max'] }}</td>
                    </tr>
                    {% endfor %}
                </tbody>
            </table>

            <h2 class="mt-4">Histograms</h2>
            {% for col, img in histograms.items() %}
            <h3>{{ col }}</h3>
            <img src="data:image/png;base64,{{ img }}" class="img-fluid">
            {% endfor %}

            <h2 class="mt-4">Scatter Plots</h2>
            {% for title, img in scatter_plots.items() %}
            <h3>{{ title }}</h3>
            <img src="data:image/png;base64,{{ img }}" class="img-fluid">
            {% endfor %}

            <h2 class="mt-4">Regression Plots</h2>
            {% for title, img in regression_plots.items() %}
            <h3>{{ title }}</h3>
            <img src="data:image/png;base64,{{ img }}" class="img-fluid">
            {% endfor %}
        </div>
    </body>
    </html>
    """


def _render_plot(draw, title, figsize=(6, 4)):
    """Draw one figure and return it as a base64-encoded PNG string.

    Args:
        draw: Zero-argument callable that plots onto the current figure.
        title: Title set on the current axes after ``draw`` has run.
        figsize: Figure size passed to ``plt.figure``.

    Returns:
        The value returned by ``plot_to_base64`` for the drawn figure.
    """
    fig = plt.figure(figsize=figsize)
    try:
        draw()
        plt.title(title)
        return plot_to_base64(plt)
    finally:
        # Close even when plotting fails so figures do not accumulate.
        plt.close(fig)


def analyze_dataframe(df, output_file="report.html"):
    """Build an HTML exploratory-analysis report for the numeric columns of ``df``.

    The report contains a correlation heatmap, a descriptive-statistics table,
    one histogram per numeric column, one scatter plot per unordered column
    pair and one regression plot per ordered column pair. All images are
    embedded as base64 PNG data URIs.

    Args:
        df: pandas DataFrame to analyze. Only columns selected by
            ``select_dtypes(include='number')`` are used.
        output_file: Path of the HTML file to write (UTF-8 encoded).

    Returns:
        None. The report is written to ``output_file`` and a confirmation
        line is printed.
    """
    # Initial seaborn configuration.
    sns.set(style="whitegrid")

    # Use every numeric dtype (not only float64/int64) so the plots and the
    # statistics table always describe the same set of columns.
    numerical_df = df.select_dtypes(include='number')
    numerical_columns = numerical_df.columns

    # Descriptive statistics. describe() is only called when there is at least
    # one numeric column, so every row has the count/mean/std/... fields the
    # template reads; otherwise the table body is simply empty.
    if len(numerical_columns) > 0:
        stats_rows = list(numerical_df.describe().T.iterrows())
    else:
        stats_rows = []

    # Correlation heatmap.
    if not numerical_df.empty:
        corr = numerical_df.corr()
        corr_img = _render_plot(
            lambda: sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5),
            'Correlation Heatmap',
            figsize=(10, 6),
        )
    else:
        corr_img = None

    # One histogram per numeric column.
    histograms = {}
    for column in numerical_columns:
        histograms[column] = _render_plot(
            lambda: sns.histplot(df[column], kde=True),
            f'Histogram of {column}',
        )

    # One scatter plot per unordered pair of numeric columns.
    scatter_plots = {}
    for i, col1 in enumerate(numerical_columns):
        for col2 in numerical_columns[i+1:]:
            scatter_plots[f'{col1}_vs_{col2}'] = _render_plot(
                lambda: sns.scatterplot(x=col1, y=col2, data=df),
                f'Scatter plot between {col1} and {col2}',
            )

    # One regression plot per ordered pair (col1 as x, col2 as y).
    regression_plots = {}
    for col1 in numerical_columns:
        for col2 in numerical_columns:
            if col1 == col2:
                continue

            # Drop rows where either column is missing.
            subset_df = df[[col1, col2]].dropna()

            # Skip the pair when nothing is left after removing NaNs.
            if subset_df.empty:
                continue

            # sns.regplot fits and draws the regression line itself, so no
            # separate model fit is needed here.
            regression_plots[f'{col1}_reg_{col2}'] = _render_plot(
                lambda: sns.regplot(x=col1, y=col2, data=subset_df, line_kws={'color': 'red'}),
                f'Regression: {col1} vs {col2}',
            )

    # Render the HTML; autoescape protects against markup in column names
    # (base64 image data contains no characters that escaping would alter).
    html_content = Template(_REPORT_TEMPLATE, autoescape=True).render(
        corr_img=corr_img,
        stats_rows=stats_rows,
        histograms=histograms,
        scatter_plots=scatter_plots,
        regression_plots=regression_plots,
    )

    # Write as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) can fail on non-ASCII column names and would not match the
    # charset declared in the page.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"Report saved to {output_file}")


In [19]:
# Load the dataset (downloaded from Kaggle) from the working directory.
df = pd.read_csv('apple_quality.csv')

# Run the analysis and write the HTML report next to the notebook.
analyze_dataframe(df, "apple_quality_report.html")


Report saved to apple_quality_report.html
