In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from autoviz.AutoViz_Class import AutoViz_Class  

def load_data(csv_file):
    """Read a CSV file into a DataFrame, keeping every column as text.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(csv_file, dtype=str)``, or
        ``None`` when the file does not exist.
    """
    try:
        frame = pd.read_csv(csv_file, dtype=str)
    except FileNotFoundError:
        # A missing file is reported on stdout instead of being raised.
        print(f"⚠️ Error: File '{csv_file}' not found.")
        frame = None
    return frame

def clean_price(value):
    """Convert a price string such as '1,529' into the float 1.529.

    A comma is treated as the decimal separator. Surrounding whitespace is
    ignored.

    Args:
        value: The raw cell value. Only ``str`` inputs are converted.

    Returns:
        The parsed float, or ``None`` when ``value`` is not a string,
        contains anything other than digits, commas and dots, or cannot be
        parsed as a single number (e.g. '1,234.5', which has two separators).
    """
    if not isinstance(value, str):
        return None

    text = value.strip()

    # Accept only digits plus ',' / '.' separators; this also rejects '' and ','.
    if not text.replace(",", "").replace(".", "").isdigit():
        return None

    try:
        return float(text.replace(",", "."))
    except ValueError:
        # More than one separator (e.g. '1.234,5' -> '1.234.5') is ambiguous:
        # report it as missing instead of aborting the whole column conversion.
        return None

def filter_by_province(df, target_province):
    """Return the rows of ``df`` whose 'Provincia' equals ``target_province``.

    The comparison ignores letter case and surrounding whitespace on both
    the column values and the requested province.

    Args:
        df: DataFrame to filter, or ``None`` (for example when loading failed).
        target_province: Name of the province to keep.

    Returns:
        An independent copy of the matching rows (possibly empty), or
        ``None`` when ``df`` is ``None`` or has no 'Provincia' column.
    """
    if df is None:
        print("Error: No data available to filter.")
        return None

    if "Provincia" not in df.columns:
        print("Error: Column 'Provincia' does not exist in the dataset.")
        return None

    # Normalise both sides so 'Madrid', ' madrid ' and 'MADRID' all match.
    wanted = str(target_province).strip().upper()
    provinces = df["Provincia"].str.strip().str.upper()

    # Missing province values compare as False and are therefore excluded.
    df_province = df[provinces.eq(wanted)].copy()

    if df_province.empty:
        print(f"No data available for province {target_province}.")

    return df_province


def clean_data(df):
    """Convert the fuel price columns to numbers and drop rows without a price.

    Each price column that exists is passed through ``clean_price``; rows for
    which any of those columns could not be converted are removed. A price
    column that is absent only produces a warning.

    Args:
        df: DataFrame to clean, or ``None``.

    Returns:
        A new, cleaned DataFrame (the input is left untouched), or ``None``
        when ``df`` is ``None``.
    """
    if df is None:
        return None

    price_columns = ["Precio gasolina 95 E5", "Precio gasóleo A"]

    # Work on a copy so re-running the cell never sees half-converted input.
    cleaned = df.copy()

    present_columns = []
    for col in price_columns:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].apply(clean_price)
            present_columns.append(col)
        else:
            print(f"Warning: Column '{col}' does not exist in the dataset.")

    # Restrict dropna to columns that exist; naming an absent column in
    # ``subset`` raises KeyError.
    if present_columns:
        cleaned = cleaned.dropna(subset=present_columns)

    return cleaned

def visualize_with_autoviz(csv_file):
    """Run AutoViz on a CSV file and return whatever ``AutoViz`` returns.

    Args:
        csv_file: Path of the CSV file handed to ``AutoViz_Class.AutoViz``.
    """
    # A fresh AutoViz_Class instance is created for every call.
    return AutoViz_Class().AutoViz(csv_file)

def plot_prices(df, target_province):
    """Plot overlaid histograms of fuel prices and save them as a PNG.

    The image is written to
    ``../images/price_distribution_{target_province}.png``; the output
    directory is created when it does not exist yet. The figure is not
    closed here, so call ``plt.close()`` afterwards if it is no longer needed.

    Args:
        df: DataFrame holding the price columns, or ``None``.
        target_province: Province name used in the title and the file name.

    Returns:
        None. When ``df`` is ``None`` or empty a message is printed and
        nothing is plotted or saved.
    """
    if df is None or df.empty:
        print("Not enough data to plot.")
        return

    # Local import: only this function needs pathlib.
    from pathlib import Path

    # (column name, legend label, bar colour) for each fuel type.
    histogram_specs = (
        ("Precio gasolina 95 E5", "Gasoline 95 E5", "red"),
        ("Precio gasóleo A", "Diesel A", "blue"),
    )

    fig, ax = plt.subplots(figsize=(10, 5))

    plotted_any = False
    for column, label, color in histogram_specs:
        if column in df.columns:
            # NaN values would break the automatic bin-range detection.
            ax.hist(df[column].dropna(), bins=10, alpha=0.5, label=label, color=color)
            plotted_any = True

    ax.set_xlabel("Price (€)")
    ax.set_ylabel("Number of stations")
    ax.set_title(f"Price distribution in {target_province}")
    if plotted_any:
        # Without labelled artists, legend() only emits a warning.
        ax.legend()

    output_path = Path("../images") / f"price_distribution_{target_province}.png"
    # savefig does not create missing directories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path)

def main():
    """Load the fuel price CSV, keep one province, and produce the reports.

    Steps: load the CSV, filter it to ``target_province``, clean the price
    columns, then run AutoViz and save the price histogram. The function
    returns early, without producing any report, when the file cannot be
    loaded or no usable rows remain.
    """
    csv_file = "../data/prix_carburants.csv"
    target_province = "MADRID"

    df = load_data(csv_file)
    if df is None:
        # load_data has already reported the missing file.
        return

    df_province = filter_by_province(df, target_province)
    df_province = clean_data(df_province)

    if df_province is None or df_province.empty:
        return

    print("\nGenerating automatic report with AutoViz...\n")
    # NOTE: AutoViz is given the CSV path, so its report covers the whole
    # file rather than only the selected province.
    visualize_with_autoviz(csv_file)
    plot_prices(df_province, target_province)


# Run the pipeline when executed as a script (also true inside a notebook kernel).
if __name__ == "__main__":
    main()



Imported v0.1.905. Please call AutoViz in this sequence:
    AV = AutoViz_Class()
    %matplotlib inline
    dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)

Generating automatic report with AutoViz...

Shape of your Data Set loaded: (11865, 10)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  0
    Number of Integer-Categorical Columns =  1
    Number of String-Categorical Columns =  3
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  2
    

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
Provincia,object,0.0,0,,,13 rare categories: Too many to list. Group them into a single category or drop the categories.
Municipio,object,0.0,29,,,No issue
Localidad,object,0.0,36,,,No issue
Código postal,int64,0.0,38,1002.0,52006.0,No issue
Dirección,object,0.0,98,,,No issue
Precio gasolina 95 E5,object,9.903772,3,,,"1163 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,, Possible high cardinality column with 414 unique values: Use hash encoding or text embedding to reduce dimension."
Precio gasóleo A,object,3.125266,3,,,"367 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,, Possible high cardinality column with 438 unique values: Use hash encoding or text embedding to reduce dimension."
Tipo venta,object,0.0,0,,,1 rare categories: ['p']. Group them into a single category or drop the categories.
Tipo servicio,object,8.97556,0,,,"1054 missing values. Impute them with mean, median, mode, or a constant value such as 123., Mixed dtypes: has 2 different data types: object, float,"
Rótulo,object,0.0,34,,,No issue


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\SGalv\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\SGalv\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\SGalv\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\SGalv\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\SGalv\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

All Plots done
Time to run AutoViz = 34 seconds 

 ###################### AUTO VISUALIZATION Completed ########################
