In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from functools import partial
import re
import os

# Mount Google Drive (specific to Google Colab)
from google.colab import drive
drive.mount('/content/drive')

# Main path
main_path = '/content/drive/MyDrive/Master Paper/Data/'
result_path = '/content/drive/MyDrive/Master Paper/Results/'

Mounted at /content/drive


In [3]:

def load_dataset(file_path):
    """
    Load a dataset from the given file path.

    :param file_path: Path to the dataset file.
    :return: Loaded pandas DataFrame.
    """
    if file_path.endswith('.xlsx'):
        return pd.read_excel(file_path)
    elif file_path.endswith('.csv'):
        return pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file format. Use .csv or .xlsx")

def clean_marke(marke, brand_mappings):
    """
    Cleans and standardizes brand names.
whic
    :param marke: Original brand name as a string.
    :param brand_mappings: Dictionary containing mappings for brand names.
    :return: Cleaned and standardized brand name as a string.
    """
    if not isinstance(marke, str):
        return ''
    marke = marke.lower().strip()
    marke = re.sub(r'[^\w\s]', '', marke)  # Remove non-alphanumeric characters except spaces
    marke = re.sub(r'\s+', ' ', marke)     # Replace multiple spaces with a single space
    return brand_mappings.get(marke, marke)

def remove_outliers_iqr(df, col, vehicle_type_col, fuel_type_col, result_path, graph_name):
    """
    Uses the IQR 1.5 method to detect and remove outliers from the specified kilometer column.
    Generates a summary table showing how many rows were removed as outliers for each vehicle type and fuel type,
    and includes the percentage of rows removed.

    :param df: DataFrame containing the dataset.
    :param col: Column that needs to be analyzed for outliers.
    :param vehicle_type_col: Column representing the type of vehicle.
    :param fuel_type_col: Column representing the fuel type.
    :param result_path: Directory to save the resulting CSV.
    :param graph_name: Name for the graph file to be saved.
    :return: A tuple containing:
        - summary_df: DataFrame with outlier statistics.
        - df: DataFrame after removing outliers.
    """

    summary_list = []

    # Unique combinations of vehicle types and fuel types
    unique_vehicle_types = df[vehicle_type_col].unique()
    unique_fuel_types = df[fuel_type_col].unique()

    # Iterate over each vehicle type and fuel type
    for vehicle_type in unique_vehicle_types:
        for fuel_type in unique_fuel_types:
            # Filter data for the current vehicle type and fuel type
            filtered_df = df[(df[vehicle_type_col] == vehicle_type) &
                             (df[fuel_type_col] == fuel_type)]

            total_rows = len(filtered_df)
            if total_rows == 0:
                continue

            # Calculate IQR for outlier detection
            Q1 = filtered_df[col].quantile(0.25)
            Q3 = filtered_df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            # Identify outliers
            outliers = filtered_df[(filtered_df[col] < lower_bound) |
                                   (filtered_df[col] > upper_bound)]
            outliers_count = len(outliers)

            # Remove outliers
            remaining_rows = total_rows - outliers_count
            percent_outliers = (outliers_count / total_rows) * 100

            # Append results to summary
            summary_list.append({
                'Vehicle Type': vehicle_type,
                'Fuel Type': fuel_type,
                'Total Rows': total_rows,
                'Outliers Found (Removed)': outliers_count,
                '% Outliers Found': round(percent_outliers,2)
            })

            # Remove outliers from the main DataFrame
            df = df[~((df[vehicle_type_col] == vehicle_type) &
                      (df[fuel_type_col] == fuel_type) &
                      ((df[col] < lower_bound) | (df[col] > upper_bound)))]

    # Create a summary DataFrame
    summary_df = pd.DataFrame(summary_list)

    # Create the "km_distribution_analysis" folder if it does not exist
    analysis_folder = os.path.join(result_path, 'km_distribution_analysis')
    os.makedirs(analysis_folder, exist_ok=True)

    # Save the summary DataFrame as a CSV file
    output_file = os.path.join(analysis_folder, f'{graph_name}.csv')
    summary_df.to_csv(output_file, index=False)

    return summary_df, df

def prepare_hybrid_data(df, result_path, vehicle_type):
    """
    Process hybrid-specific data by calculating consumption and removing outliers.

    :param df: Original DataFrame.
    :param result_path: Path to save outlier summaries.
    :param vehicle_type: The type of vehicle ('Vans' or 'Cars').
    :return: DataFrame ready for hybrid consumption analysis.
    """
    # Add hybrid indicator
    df['yra_hibridas'] = df['papildomi_degalai_1'].apply(lambda x: x == 'Elektra')

    # Drop invalid hybrid rows (invalid distances or consumption values)
    df = df[~(df['yra_hibridas'] &
              ((df['viso_nuvaziuota_variklis_veikia_km'].isna()) |
               (df['viso_nuvaziuota_variklis_veikia_km'] <= 0) |
               (df['viso_sunaudota_kuro_krovejo_veikimu_lt'].isna()) |
               (df['viso_sunaudota_kuro_krovejo_veikimu_lt'] <= 0)))]

    # Calculate electric consumption for hybrids
    df['energijos_sanaudos_wh_km_hybrid'] = np.where(
        (df['yra_hibridas'] & (df['viso_nuvaziuota_variklis_isjungtas_km'] > 0)),
        df['viso_is_tinklo_ikrauta_kwh'] * 1000 / df['viso_nuvaziuota_variklis_isjungtas_km'],
        np.nan
    )

    # Calculate fuel consumption for hybrids
    df['kuro_sunaudojimas_l100km_org'] = np.where(
        (df['yra_hibridas'] & (df['viso_nuvaziuota_variklis_veikia_km'] > 0)),
        df['viso_sunaudota_kuro_krovejo_veikimu_lt'] / df['viso_nuvaziuota_variklis_veikia_km'] * 100,
        df['kuro_sunaudojimas_l100km_org']
    )

    # Filter hybrids for outlier removal
    hybrids = df[df['yra_hibridas']]

    # Filter out rows where kuro_sunaudojimas_l100km_org is inf, -inf, or NaN
    df = df[~df['kuro_sunaudojimas_l100km_org'].isin([np.inf, -np.inf])]
    df = df[df['kuro_sunaudojimas_l100km_org'].notna()]

    # Print the number of rows removed
    print(f"Rows removed: {len(df)}")
    print(f"Remaining rows: {len(df)}")

    # Remove outliers from electric consumption
    summary_electric, hybrids = remove_outliers_iqr(
        hybrids,
        'energijos_sanaudos_wh_km_hybrid',
        'transporto_priemones_tipas',
        'degalai',
        result_path,
        f'hybrid_{vehicle_type}_energijos_sanaudos_wh_km_outliers_summary'
    )

    # Remove outliers from fuel consumption
    summary_fuel, hybrids = remove_outliers_iqr(
        hybrids,
        'kuro_sunaudojimas_l100km_org',
        'transporto_priemones_tipas',
        'degalai',
        result_path,
        f'hybrid_{vehicle_type}_kuro_sunaudojimas_l100km_outliers_summary'
    )

    # Merge back cleaned hybrids into the main dataset
    non_hybrids = df[~df['yra_hibridas']]
    cleaned_df = pd.concat([non_hybrids, hybrids], ignore_index=True)

    # Create summary table for hybrids vs non-hybrids
    comparison_summary = create_comparison_summary(cleaned_df, result_path, vehicle_type)

    # Update 'degalai' values for hybrids where 'yra_hibridas' is True
    cleaned_df.loc[df['yra_hibridas'] & (cleaned_df['degalai'] == 'Dyzelinas'), 'degalai'] = 'Dyzelinas_H'
    cleaned_df.loc[df['yra_hibridas'] & (cleaned_df['degalai'] == 'Benzinas'), 'degalai'] = 'Benzinas_H'

    return cleaned_df, summary_electric, summary_fuel, comparison_summary


def create_comparison_summary(df, result_path, vehicle_type):
    """
    Create a summary table comparing hybrid and non-hybrid fuel consumption, including standard deviations.

    :param df: DataFrame containing the dataset.
    :param result_path: Path to save the summary CSV.
    :param vehicle_type: The type of vehicle ('Vans' or 'Cars').
    :return: DataFrame containing the comparison summary.
    """

    # Calculate statistics for hybrids
    hybrid_summary = df[df['yra_hibridas']].groupby('degalai').agg(
        hybrid_fuel_median=('kuro_sunaudojimas_l100km_org', 'median'),
        hybrid_fuel_std=('kuro_sunaudojimas_l100km_org', 'std'),
        hybrid_electric_median=('energijos_sanaudos_wh_km_hybrid', 'median'),
        hybrid_electric_std=('energijos_sanaudos_wh_km_hybrid', 'std')
    ).reset_index()

    # Calculate statistics for non-hybrids where 'kuro_sunaudojimas_l100km_org' is not NaN
    non_hybrid_summary = (
        df[~df['yra_hibridas'] & df['kuro_sunaudojimas_l100km_org'].notna()]
        .groupby('degalai')
        .agg(
            non_hybrid_fuel_median=('kuro_sunaudojimas_l100km_org', 'median'),
            non_hybrid_fuel_std=('kuro_sunaudojimas_l100km_org', 'std')
        )
        .reset_index()
    )

    # Merge hybrid and non-hybrid summaries
    comparison_summary = pd.merge(
        hybrid_summary, non_hybrid_summary, on='degalai', how='inner'
    )

    # Rename columns to Lithuanian for consistency
    comparison_summary.rename(columns={
        'degalai': 'Kuro rūšis',
        'hybrid_fuel_median': 'Hibridų mediana (L/100km)',
        'hybrid_fuel_std': 'Hibridų STD (L/100km)',
        'hybrid_electric_median': 'Hibridų mediana (Wh/km)',
        'hybrid_electric_std': 'Hibridų STD (Wh/km)',
        'non_hybrid_fuel_median': 'Ne hibridų mediana (L/100km)',
        'non_hybrid_fuel_std': 'Ne hibridų STD (L/100km)'
    }, inplace=True)

    # Save comparison summary to CSV
    comparison_path = os.path.join(result_path, f'comparison_hybrid_{vehicle_type}_vs_non_hybrid.csv')
    comparison_summary.to_csv(comparison_path, index=False)

    return comparison_summary

def process_europa_real_world_dataset(vehicle_type, main_path):
    """
    Processes a dataset for a specific vehicle type (e.g., 'Vans' or 'Cars').

    :param vehicle_type: The type of vehicle ('Vans' or 'Cars').
    """
    # Load the dataset
    file_path=f"{main_path}/ea_europa_real_world_data_cars_vans/2022_{vehicle_type}_Raw.csv"
    output_path = f"{main_path}cleaned_2022_{vehicle_type}_Raw.csv"

    df = pd.read_csv(file_path)

    # Drop unnecessary columns
    columns_to_drop = [
        "Vehicle Identifier", "OBFCM data source", "Va", "Ve", "T", "Cr", "IT"
    ]
    df = df.drop(columns=columns_to_drop)

    # Inspect dataset
    print(df.dtypes)  # Display data types of each column
    print(df.info())  # Basic information about the dataset

    # Rename columns with Lithuanian translations
    column_rename_mapping = {
        'Vehicle Identifier': 'transporto_priemones_identifikatorius',
        'OBFCM data source': 'OBFCM_duomenu_saltinis',
        'OBFCM ReportingPeriod': 'OBFCM_ataskaitinis_laikotarpis',
        'Total fuel consumed (lifetime) (l)': 'viso_sunaudota_kuro_lt',
        'Total distance travelled (lifetime) (km)': 'viso_nuvaziuota_km',
        'Total distance travelled in charge depleting operation with engine off (lifetime) (km)': 'viso_nuvaziuota_variklis_isjungtas_km',
        'Total distance travelled in charge depleting operation with engine running (lifetime) (km)': 'viso_nuvaziuota_variklis_veikia_km',
        'Total distance travelled in driver-selectable charge increasing operation (lifetime) (km)': 'viso_nuvaziuota_baterija_papildomai_kraunama_km',
        'Total fuel consumed in charge depleting operation (lifetime) (l)': 'viso_sunaudota_kuro_krovejo_veikimu_lt',
        'Total fuel consumed in driver-selectable charge increasing operation (lifetime) (l)': 'viso_sunaudota_kuro_baterijos_krovimas_lt',
        'Total grid energy into the battery (lifetime) (kWh)': 'viso_is_tinklo_ikrauta_kwh',
        'Country': 'salis',
        'VFN': 'transporto_priemones_seimos_numeris',
        'Mp': 'gamintojas_motininis',
        'Mh': 'gamintojas_bustine',
        'Man': 'gamintojas',
        'T': 'tipas',
        'Va': 'versija',
        'Ve': 'variantas',
        'Mk': 'marke',
        'Cn': 'modelis',
        'Ct': 'kategorija',
        'Cr': 'klasifikavimo_kodas',
        'M (kg)': 'nuosava_mase_kg',
        'Mt': 'maksimali_mase_kg',
        'Ewltp (g/km)': 'ewltp_emisijos_g_km',
        'Ft': 'degalai',
        'Fm': 'kuro_rusis',
        'Ec (cm3)': 'variklio_turis_cm3',
        'Ep (KW)': 'galia_kw',
        'Z (Wh/km)': 'energijos_sanaudos_wh_km',
        'IT': 'techninis_identifikatorius',
        'Erwltp (g/km)': 'realios_emisijos_wltp_g_km',
        'Year': 'pagaminimo_metai',
        'Fuel consumption': 'kuro_sunaudojimas_l100km',
        'Electric range (km)': 'elektros_nuvaziuota_km',
        'Used in calculation': 'naudotas_skaiciavimams'
    }

    df.rename(columns=column_rename_mapping, inplace=True)

    # Make all 'degalai' values lowercase in the DataFrame
    df['degalai'] = df['degalai'].str.lower()

    # Map fuel types to standardized Lithuanian values
    fuel_mapping = {
        'petrol': 'Benzinas',
        'diesel': 'Dyzelinas',
        'petrol/electric': 'Benzinas/Elektra',
        'diesel/electric': 'Dyzelinas/Elektra',
        'lpg': 'Suskystintos naftos dujos',
        'ng': 'Gamtines dujos',
        'electric': 'Elektra',
        'e85': 'Etanolis',
        'ng-biomethane': 'Biometanas',
        'unknown': ''
    }

    # Record initial row count
    initial_row_count = df.shape[0]

    # Map 'degalai' values and fill NaN with empty strings
    df['degalai'] = df['degalai'].map(fuel_mapping).fillna('')

    # Delete rows where 'degalai' is empty after mapping
    df = df[df['degalai'] != '']

    # Calculate the number of rows deleted
    rows_deleted = initial_row_count - df.shape[0]

    # Print the results
    print(f"Rows deleted where 'degalai' was empty: {rows_deleted}")
    print(f"Remaining rows after deletion: {df.shape[0]}")

    # Split and validate additional fuel types
    df_split = df['degalai'].str.split('/', expand=True)
    df['degalai'] = df_split[0]
    df['papildomi_degalai_1'] = df_split[1].fillna('')

    valid_additional_fuels = [
        '', 'Dujos', 'Elektra', 'Suskystintos naftos dujos', 'Etanolis',
        'Gamtines dujos', 'Dyzelinas', 'Biometanas', 'Benzinas'
    ]
    df['papildomi_degalai_1'] = np.where(df['papildomi_degalai_1'].isin(valid_additional_fuels),
                                        df['papildomi_degalai_1'], '')


    # Calculate fuel consumption per 100 km
    df['kuro_sunaudojimas_l100km_org'] = (df['viso_sunaudota_kuro_lt'] / df['viso_nuvaziuota_km']) * 100

    # Print value counts for 'kategorija' column
    print("Value Counts for 'kategorija':")
    print(df['kategorija'].value_counts())

    # Print the number of missing values in 'kategorija' column
    print("\nNumber of Missing Values in 'kategorija':")
    print(df['kategorija'].isna().sum())

    # Create conditions using np.where
    df['transporto_priemones_tipas'] = np.where(df['kategorija'].isin(['M1', 'M1G', 'm1']), 'K2',
                                                  np.where(df['kategorija'].isin(['N1', 'N1G']), 'K6', None))

    df['transporto_priemones_paaiskinimas'] = np.where(df['kategorija'].isin(['M1', 'M1G', 'm1']), 'Lengvieji automobiliai',
                                                          np.where(df['kategorija'].isin(['N1', 'N1G']), 'Krovininiai automobiliai', None))

    # Process and prepare data
    df, summary_electric, summary_fuel, comparison_summary = prepare_hybrid_data(df, result_path, vehicle_type)

    # Print summaries
    print("\nElectric Consumption Outlier Summary:")
    print(summary_electric)

    print("\nFuel Consumption Outlier Summary:")
    print(summary_fuel)

    print("\nComparison Summary (Hybrids vs Non-Hybrids):")
    print(comparison_summary)

    # Initial data overview
    print("Initial dataset statistics:")
    initial_count = df.shape[0]
    print(f"Total rows: {initial_count}")
    print("Country counts before deletion:")
    print(df['salis'].value_counts())

    print("Unique 'degalai' values before deleting filtering data:", df['degalai'].value_counts())

    # List of countries to delete
    countries_to_delete = [
        "AT",  # Austria - Mostly in the Alps, significant mountainous terrain.
        "SI",  # Slovenia - Contains parts of the Alps and the Dinaric Alps.
        "IT",  # Italy - The Alps and the Apennines are significant, overall significant mountainous terrain.
        "ES",  # Spain - Pyrenees, Sistema Bético, Sistema Central, and Cantabrian Mountains, significant mountainous terrain.
        "PT",  # Portugal - Contains Sistema Central and other mountain ranges.
        "NO",  # Norway - Mostly in the Scandinavian Mountains.
        "SE",  # Sweden - Partially includes the Scandinavian Mountains.
        "FI",  # Finland - Contains the Scandinavian Mountains in the northwest.
        "BG",  # Bulgaria - Balkan Mountains and Rila-Rhodope mountain ranges.
        "RO",  # Romania - Mostly in the Carpathians.
        "CZ",  # Czech Republic - Contains Sudetes, Ore Mountains, and other ranges.
        "SK",  # Slovakia - Mostly in the Carpathians (e.g., Tatra Mountains).
        "GR",  # Greece - Pindus Mountains and other ranges.
        "HR",  # Croatia - Part of the Dinaric Alps.
        "BA",  # Bosnia and Herzegovina - Part of the Dinaric Alps.
        "RS",  # Serbia - Includes Dinaric Alps and Carpathians.
        "ME",  # Montenegro - Mostly in the Dinaric Alps.
        "AL",  # Albania - Contains the Dinaric Alps and the Accursed Mountains.
        "MK"   # North Macedonia - Includes the Šar and Rila-Rhodope ranges.
    ]

    # Filter only by countries (without considering fuel types)
    df_country_filtered = df[~df['salis'].isin(countries_to_delete)]

    # Calculate statistics for filtering only by countries
    deleted_country_only_count = initial_count - df_country_filtered.shape[0]
    deleted_country_only_percentage = (deleted_country_only_count / initial_count) * 100

    # Print statistics for filtering only by countries
    print("\nAfter filtering only by countries on all fuel types:")
    print(f"Rows deleted (countries only): {deleted_country_only_count}")
    print(f"Percentage of data deleted (countries only): {deleted_country_only_percentage:.2f}%")
    print(f"Remaining rows after filtering only by countries: {df_country_filtered.shape[0]}")
    print("Unique 'degalai' values:", df_country_filtered['degalai'].value_counts())

    df = df[~((df['salis'].isin(countries_to_delete)) & (df['degalai'].isin(['Benzinas','Benzinas_H', 'Dyzelinas','Dyzelinas_H'])))]

    # Calculate the number of rows deleted after country filtering
    deleted_country_count = initial_count - df.shape[0]
    deleted_country_percentage = (deleted_country_count / initial_count) * 100

    # Print statistics about country deletion
    print("\nAfter filtering countries on Dyzelinas and Benzinas for hybrid and non-hybrid:")
    print(f"Rows deleted (countries): {deleted_country_count}")
    print(f"Percentage of data deleted (countries): {deleted_country_percentage:.2f}%")
    print(f"Remaining rows after country filtering: {df.shape[0]}")
    print("Unique 'degalai' values:", df['degalai'].value_counts())

    # Record the initial number of rows before filtering
    initial_distance_row_count = df.shape[0]

    # Filter out records with unrealistic total distances (<5 km)
    df = df[df['viso_nuvaziuota_km'] >= 5]

    # Calculate the number of rows deleted after distance filtering
    deleted_distance_count = initial_distance_row_count - df.shape[0]
    deleted_distance_percentage = (deleted_distance_count / initial_distance_row_count) * 100

    # Print statistics about distance filtering
    print("\nAfter filtering distances:")
    print(f"Rows deleted (distance < 5 km): {deleted_distance_count}")
    print(f"Percentage of data deleted (distance < 5 km): {deleted_distance_percentage:.2f}%")
    print(f"Remaining rows after distance filtering: {df.shape[0]}")

    # Clean 'marke' (brand) column
    brand_mappings = {
        'vw': 'volkswagen',
        'volkswagen vw': 'volkswagen',
        'volkswagen ag': 'volkswagen',
        'volkswagen mobilcar': 'volkswagen',
        'volkswagen.vw': 'volkswagen',
        'volkswagen, vw': 'volkswagen',
        'volkswagen. vw': 'volkswagen',
        'volkswagen  vw': 'volkswagen',
        'volkswagn': 'volkswagen',
        'ford d': 'ford',
        'landrover': 'land rover',
        'land rover': 'land rover',
        'opelvauxhall': 'opel',
        'opel/vauxhall': 'opel',
        'opel vauxhall': 'opel',
        'peugeot dangel': 'peugeot',
        'fiat/jegger': 'fiat',
        'fiat-hagemann': 'fiat',
        'fiathagemann': 'fiat',
        'fiatjegger': 'fiat',
        'dangel': 'peugeot',
        'renault': 'renault',
        'mercedesbenz': 'mercedes-benz',
        'mercedes-benz': 'mercedes-benz',
        'carrocerias sanca sa': 'carrocerias sanca',
        'carrocerias sanca': 'carrocerias sanca',
        'alfa romeo': 'alfa romeo',
        'vemasur': 'vemasur',
        'rhonsom': 'rhonsom',
        'boeckamann': 'boeckmann',
        'maxus': 'maxus',
        'mg': 'mg',
        'ds': 'ds',
        'volkswagenmobilcar': 'volkswagen',
        'volkswagenvw': 'volkswagen',
        'volvo': 'volvo',
        'seat': 'seat',
        'fiattranspol': 'fiat',
        'moto star': 'moto star',
        'snoeks automotive': 'snoeks',
        'zago automotive': 'zago',
        'porsche': 'porsche',
        'sortimo': 'sortimo',
        'igluvan': 'igluvan',
        'erke': 'erke',
        'edward davies': 'edward davies',
        'cht': 'cht',
        'ssangyong': 'ssangyong',
        'mazda': 'mazda',
        '93': 'unknown',
    }
    clean_marke_partial = partial(clean_marke, brand_mappings=brand_mappings)
    df['cleaned_marke'] = df['marke'].apply(clean_marke_partial)

    # Clean 'modelis' (model) column
    unique_combinations = df[['marke', 'modelis']].drop_duplicates()

    # Fill NaN values with empty strings to avoid errors
    unique_combinations['marke'] = unique_combinations['marke'].fillna('').astype(str)
    unique_combinations['modelis'] = unique_combinations['modelis'].fillna('').astype(str)

    unique_combinations['cleaned_modelis'] = unique_combinations.apply(
        lambda row: row['modelis'].lower().replace(row['marke'].lower(), '').strip()
        if row['marke'].lower() in row['modelis'].lower()
        else row['modelis'].lower().strip(),
        axis=1
    )
    cleaned_map = dict(zip(unique_combinations.apply(lambda row: (row['marke'], row['modelis']), axis=1),
                          unique_combinations['cleaned_modelis']))

    df['cleaned_modelis'] = df.apply(lambda row: cleaned_map.get((row['marke'], row['modelis']), row['modelis']), axis=1)

    # Save cleaned data
    df.to_csv(output_path, index=False)

    # Display results
    print("\nFinal Dataset Statistics:")
    print(f"Total rows: {df.shape[0]}")
    print("Unique 'degalai' values:", df['degalai'].value_counts())
    print("Unique 'papildomi_degalai_1' values:", df['papildomi_degalai_1'].value_counts())
    print("Cleaned unique 'marke' values:", df['cleaned_marke'].nunique())
    print("Cleaned unique 'modelis' values:", df['cleaned_modelis'].nunique())
    print(f"Unique countries: {df['salis'].nunique()}")
    print("Country value counts:")
    print(df['salis'].value_counts())

    return df

In [4]:
df_vans = process_europa_real_world_dataset("Vans", main_path)

OBFCM ReportingPeriod                                                                           int64
Total fuel consumed (lifetime) (l)                                                            float64
Total distance travelled (lifetime) (km)                                                      float64
Total distance travelled in charge depleting operation with engine off (lifetime) (km)        float64
Total distance travelled in charge depleting operation with engine running (lifetime) (km)    float64
Total distance travelled in driver-selectable charge increasing operation (lifetime) (km)     float64
Total fuel consumed in charge depleting operation (lifetime) (l)                              float64
Total fuel consumed in driver-selectable charge increasing operation (lifetime) (l)           float64
Total grid energy into the battery (lifetime) (kWh)                                           float64
Country                                                                           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['energijos_sanaudos_wh_km_hybrid'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['kuro_sunaudojimas_l100km_org'] = np.where(


Rows removed: 199670
Remaining rows: 199670

Electric Consumption Outlier Summary:
  Vehicle Type  Fuel Type  Total Rows  Outliers Found (Removed)  \
0           K6   Benzinas         159                         3   
1           K2   Benzinas         435                        56   
2           K2  Dyzelinas           3                         0   

   % Outliers Found  
0              1.89  
1             12.87  
2              0.00  

Fuel Consumption Outlier Summary:
  Vehicle Type  Fuel Type  Total Rows  Outliers Found (Removed)  \
0           K6   Benzinas         156                         8   
1           K2   Benzinas         379                        49   
2           K2  Dyzelinas           3                         0   

   % Outliers Found  
0              5.13  
1             12.93  
2              0.00  

Comparison Summary (Hybrids vs Non-Hybrids):
  Kuro rūšis  Hibridų mediana (L/100km)  Hibridų STD (L/100km)  \
0   Benzinas                   8.801964               2.

In [5]:
df_cars = process_europa_real_world_dataset("Cars", main_path)

OBFCM ReportingPeriod                                                                           int64
Total fuel consumed (lifetime) (l)                                                            float64
Total distance travelled (lifetime) (km)                                                      float64
Total distance travelled in charge depleting operation with engine off (lifetime) (km)        float64
Total distance travelled in charge depleting operation with engine running (lifetime) (km)    float64
Total distance travelled in driver-selectable charge increasing operation (lifetime) (km)     float64
Total fuel consumed in charge depleting operation (lifetime) (l)                              float64
Total fuel consumed in driver-selectable charge increasing operation (lifetime) (l)           float64
Total grid energy into the battery (lifetime) (kWh)                                           float64
Country                                                                           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['energijos_sanaudos_wh_km_hybrid'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['kuro_sunaudojimas_l100km_org'] = np.where(


Rows removed: 3679708
Remaining rows: 3679708

Electric Consumption Outlier Summary:
  Vehicle Type  Fuel Type  Total Rows  Outliers Found (Removed)  \
0           K2   Benzinas      380186                     61598   
1           K2  Dyzelinas       68251                      7128   

   % Outliers Found  
0             16.20  
1             10.44  

Fuel Consumption Outlier Summary:
  Vehicle Type  Fuel Type  Total Rows  Outliers Found (Removed)  \
0           K2   Benzinas      318588                     22495   
1           K2  Dyzelinas       61123                      1868   

   % Outliers Found  
0              7.06  
1              3.06  

Comparison Summary (Hybrids vs Non-Hybrids):
  Kuro rūšis  Hibridų mediana (L/100km)  Hibridų STD (L/100km)  \
0   Benzinas                   9.306063               2.725641   
1  Dyzelinas                   9.739368               1.501185   

   Hibridų mediana (Wh/km)  Hibridų STD (Wh/km)  Ne hibridų mediana (L/100km)  \
0               24

In [6]:
# Create new columns to indicate if the row belongs to cars or vans
df_cars['eu_cars'] = 1  # 1 indicates it's a car
df_cars['eu_vans'] = 0  # 0 indicates it's not a van

df_vans['eu_cars'] = 0  # 0 indicates it's not a car
df_vans['eu_vans'] = 1  # 1 indicates it's a van

# Combine the two dataframes into one, ensuring both columns exist
df = pd.concat([df_cars, df_vans], ignore_index=True)

# Print value counts for 'kategorija' column
print("Value Counts for 'kategorija':")
print(df['kategorija'].value_counts())

# Print the number of missing values in 'kategorija' column
print("\nNumber of Missing Values in 'kategorija':")
print(df['kategorija'].isna().sum())


Value Counts for 'kategorija':
kategorija
M1     1904989
M1G     142967
N1      103512
N1G      11906
m1           1
Name: count, dtype: int64

Number of Missing Values in 'kategorija':
0


In [7]:
# Outliers for 'Dyzelinas_H' and 'Benzinas_H' were already removed during the hybrid analysis.
# Exclude these fuel types from further outlier removal.

# Filter out 'Dyzelinas_H' and 'Benzinas_H' before running the outlier removal
excluded_fuel_types = ['Dyzelinas_H', 'Benzinas_H']
excluded_df = df[df['degalai'].isin(excluded_fuel_types)]  # Keep excluded fuel types
to_filter_df = df[~df['degalai'].isin(excluded_fuel_types)]  # Data for outlier removal

# Run outlier removal only on the filtered data
summary_df, filtered_df = remove_outliers_iqr(
    to_filter_df,
    'viso_nuvaziuota_km',
    'transporto_priemones_tipas',
    'degalai',
    result_path + 'eu_analysis/',
    'viso_nuvaziuota_km_outliers_summary'
)

# Combine excluded fuel types back with the filtered results
combined_df = pd.concat([excluded_df, filtered_df], ignore_index=True)

# Display the summary DataFrame and remaining rows
print(summary_df)
print(f"Remaining rows after outlier removal: {len(filtered_df)}")
print(f"Total rows after combining: {len(combined_df)}")

df = combined_df.copy()

   Vehicle Type                  Fuel Type  Total Rows  \
0            K2                   Benzinas     1217796   
1            K2                  Dyzelinas      542648   
2            K2  Suskystintos naftos dujos       12513   
3            K2             Gamtines dujos         113   
4            K2                    Elektra           3   
5            K2                   Etanolis       23726   
6            K2                 Biometanas        1597   
7            K6                   Benzinas        4250   
8            K6                  Dyzelinas      110127   
9            K6  Suskystintos naftos dujos           9   
10           K6             Gamtines dujos          80   
11           K6                   Etanolis         421   
12           K6                 Biometanas         163   

    Outliers Found (Removed)  % Outliers Found  
0                      44768              3.68  
1                      23143              4.26  
2                       1358            

In [8]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
df = pd.read_csv(main_path + 'eu_total_final.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Master Paper/Data/eu_total_final_full.csv'

In [9]:
# Updated function to include data percentage in the histogram titles for Benzinas and Dyzelinas
# Updated function to plot average fuel consumption 'kuro_sunaudojimas_l100km' without x-axis range limits
def plot_fuel_consumption_distribution(df, result_path, km_column, transporto_priemones_tipas_col, degalai_col, folder_name):
    """
    Creates histograms for the 'kuro_sunaudojimas_l100km' column based on vehicle type and fuel type.
    Saves the plots as images without x-axis range limits or data percentage in the titles.

    :param df: DataFrame containing the data.
    :param result_path: Directory to save the results.
    :param km_column: Column with fuel consumption data.
    :param transporto_priemones_tipas_col: Column with vehicle type information.
    :param degalai_col: Column with fuel type information.
    :param: folder_name: Name of the folder to save the results.
    """

    # Create the "fuel_consumption_analysis" folder if it doesn't exist
    analysis_folder = os.path.join(result_path, folder_name)
    os.makedirs(analysis_folder, exist_ok=True)

    # Unique combinations of vehicle types and fuel types
    unique_types = df[transporto_priemones_tipas_col].unique()
    unique_fuel_types = df[degalai_col].unique()

    # Iterate over each vehicle type
    for transporto_tipas in unique_types:
        # Filter data by vehicle type
        type_df = df[df[transporto_priemones_tipas_col] == transporto_tipas]

        for degalai in unique_fuel_types:
            # Filter data by vehicle type and fuel type
            filtered_df = type_df[type_df[degalai_col] == degalai]

            # Check if there is enough data
            if len(filtered_df) > 0:
                plt.figure(figsize=(10, 6))
                sns.histplot(filtered_df[km_column].dropna(), kde=True, bins=100)
                plt.xlabel('Average Fuel Consumption (L/100km)')
                plt.ylabel('Frequency')

                # Set y-axis to normal number formatting (not scientific)
                plt.ticklabel_format(style='plain', axis='y')

                plt.tight_layout()

                # Save the histogram
                output_file = os.path.join(analysis_folder, f"{transporto_tipas}_{degalai}_fuel_consumption_distribution.png")
                plt.savefig(output_file, dpi=300)
                plt.close()

plot_fuel_consumption_distribution(df, result_path + 'eu_analysis/', 'kuro_sunaudojimas_l100km_org', 'transporto_priemones_tipas', 'degalai','fuel_consumption_analysis_with_outliers')


In [10]:
# Outliers for 'Dyzelinas_H' and 'Benzinas_H' were already removed during the hybrid analysis.
# Exclude these fuel types from further outlier removal.

# Filter out 'Dyzelinas_H' and 'Benzinas_H' before running the outlier removal
excluded_fuel_types = ['Dyzelinas_H', 'Benzinas_H']
excluded_df = df[df['degalai'].isin(excluded_fuel_types)]  # Keep excluded fuel types
to_filter_df = df[~df['degalai'].isin(excluded_fuel_types)]  # Data for outlier removal

# Record initial sizes
initial_size = len(to_filter_df)
print(f"Initial dataset size for outlier removal: {initial_size} rows (excluding 'Dyzelinas_H' and 'Benzinas_H')")

# Run outlier removal using the IQR method
summary_df, filtered_df = remove_outliers_iqr(
    to_filter_df,
    'kuro_sunaudojimas_l100km_org',
    'transporto_priemones_tipas',
    'degalai',
    result_path + 'eu_analysis/',
    'kuro_sunaudojimas__w_hybrid_l100km_org_outliers_summary'
)

# Calculate rows removed
rows_removed = initial_size - len(filtered_df)
print(f"Rows removed during outlier removal: {rows_removed}")
print(f"Remaining rows after outlier removal: {len(filtered_df)} rows")

# Combine excluded fuel types back with the filtered results
combined_df = pd.concat([excluded_df, filtered_df], ignore_index=True)

# Print the total dataset size after combining
print(f"Total dataset size after combining: {len(combined_df)} rows")

# Update the main DataFrame
df = combined_df.copy()


Initial dataset size for outlier removal: 1835083 rows (excluding 'Dyzelinas_H' and 'Benzinas_H')
Rows removed during outlier removal: 200462
Remaining rows after outlier removal: 1634621 rows
Total dataset size after combining: 1884550 rows


In [11]:
# Filter for 'Benzinas_H' and 'Dyzelinas_H' with 'transporto_priemones_tipas' as 'K2'
hybrid_filter = (df['degalai'].isin(['Benzinas_H', 'Dyzelinas_H'])) & (df['transporto_priemones_tipas'] == 'K2')

groups = df[df['transporto_priemones_tipas'] == 'K2'].groupby('degalai')['kuro_sunaudojimas_l100km_org'].max()

print("Maximum fuel consumption values for K2 transport vehicles:")
print(groups)

# Capture the initial number of hybrid rows
initial_hybrid_count = len(df[hybrid_filter])

# Apply the max value filter for hybrids (<= 15.27) (Max value for non hybrids between Dyzelis and Benzinas) to delete outliers
filtered_hybrids = df[hybrid_filter & (df['kuro_sunaudojimas_l100km_org'] <= 15.27)]

# Combine filtered hybrids with the rest of the dataset
non_hybrid_data = df[~hybrid_filter]
df = pd.concat([non_hybrid_data, filtered_hybrids], ignore_index=True)

# Calculate the number of deleted rows for hybrids
deleted_hybrid_rows = initial_hybrid_count - len(filtered_hybrids)

# Print the results
print(f"Rows deleted for hybrids (Benzinas_H and Dyzelinas_H): {deleted_hybrid_rows}")
print(f"Rows retained for hybrids after filtering: {len(filtered_hybrids)}")
print(f"Total rows in the dataset after filtering: {len(df)}")


Maximum fuel consumption values for K2 transport vehicles:
degalai
Benzinas                      15.266293
Benzinas_H                   625.180638
Biometanas                    15.331126
Dyzelinas                     13.073947
Dyzelinas_H                  173.333333
Elektra                        9.403498
Etanolis                      10.213729
Gamtines dujos                31.594203
Suskystintos naftos dujos     40.000000
Name: kuro_sunaudojimas_l100km_org, dtype: float64
Rows deleted for hybrids (Benzinas_H and Dyzelinas_H): 17286
Rows retained for hybrids after filtering: 232275
Total rows in the dataset after filtering: 1867264


In [12]:
grouped_data = df.groupby(['transporto_priemones_tipas', 'degalai']).size().reset_index(name='row_count')

# Save the data as a CSV file
output_path = 'row_counts_by_vehicle_and_fuel_type.csv'
grouped_data.to_csv(result_path + output_path, index=False)
print(grouped_data)

   transporto_priemones_tipas                    degalai  row_count
0                          K2                   Benzinas    1022762
1                          K2                 Benzinas_H     157569
2                          K2                 Biometanas       1298
3                          K2                  Dyzelinas     478057
4                          K2                Dyzelinas_H      74706
5                          K2                    Elektra          3
6                          K2                   Etanolis      19144
7                          K2             Gamtines dujos         95
8                          K2  Suskystintos naftos dujos      11142
9                          K6                   Benzinas       3739
10                         K6                 Benzinas_H         24
11                         K6                 Biometanas        132
12                         K6                  Dyzelinas      97769
13                         K6                Dyz

In [35]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_combined_boxplot_with_annotations(df, result_path, transporto_priemones_tipas_col, degalai_col, column_org, column_updated, folder_name):
    """
    Creates a single box plot per `transporto_priemones_tipas`, comparing `column_org` and `column_updated`
    for each `degalai` in different colors and annotates the median values on the graph. Saves the plots as images.

    :param df: DataFrame containing the data.
    :param result_path: Directory to save the results.
    :param transporto_priemones_tipas_col: Column with vehicle type information.
    :param degalai_col: Column with fuel type information.
    :param column_org: Column with original fuel consumption values.
    :param column_updated: Column with updated fuel consumption values.
    :param folder_name: Name of the folder to save the results.
    """
    # Map Lithuanian fuel types to English
    fuel_mapping = {
        'Dyzelinas_H': 'Diesel Hybrid',
        'Benzinas_H': 'Petrol Hybrid',
        'Benzinas': 'Petrol',
        'Dyzelinas': 'Diesel',
        'Suskystintos naftos dujos': 'LPG',
        'Gamtines dujos': 'Natural Gas',
        'Elektra': 'Electric',
        'Etanolis': 'Ethanol',
        'Biometanas': 'Biomethane'
    }
    df[degalai_col] = df[degalai_col].map(fuel_mapping).fillna(df[degalai_col]).copy()

    # Ensure the output folder exists
    analysis_folder = os.path.join(result_path, folder_name)
    os.makedirs(analysis_folder, exist_ok=True)

    # Unique vehicle types
    unique_vehicle_types = df[transporto_priemones_tipas_col].unique()

    for vehicle_type in unique_vehicle_types:
        # Filter data by vehicle type
        filtered_df = df[df[transporto_priemones_tipas_col] == vehicle_type]

        if not filtered_df.empty:
            # Melt the DataFrame to plot both columns in a single graph
            melted_df = pd.melt(
                filtered_df,
                id_vars=[degalai_col],
                value_vars=[column_org, column_updated],
                var_name='Fuel Consumption Type',
                value_name='Fuel Consumption (L/100km)'
            )

            # Update column names for the legend
            melted_df['Fuel Consumption Type'] = melted_df['Fuel Consumption Type'].replace({
                column_org: 'Real World',
                column_updated: 'Manufacturer Provided'
            })

            # Sort fuel types alphabetically in descending order
            sorted_fuel_types = sorted(melted_df[degalai_col].unique(), reverse=True)
            melted_df[degalai_col] = pd.Categorical(melted_df[degalai_col], categories=sorted_fuel_types, ordered=True)

            plt.figure(figsize=(14, 10))

            # Create a box plot
            sns.boxplot(
                data=melted_df,
                x=degalai_col,
                y='Fuel Consumption (L/100km)',
                hue='Fuel Consumption Type',
                palette=['#1f77b4', '#ff7f0e']
            )

            plt.ylim(0, 30)  # Set maximum y-axis value for visibility
            plt.title(f'Fuel Consumption Comparison for {vehicle_type}', fontsize=18, fontweight='bold')
            plt.xlabel('Fuel Type', fontsize=16, fontweight='bold')
            plt.ylabel('Fuel Consumption (L/100km)', fontsize=16, fontweight='bold')
            plt.xticks(rotation=45, ha='right', fontsize=14)
            plt.yticks(fontsize=14)

            # Add median annotations
            for i, fuel_type in enumerate(sorted_fuel_types):
                for j, consumption_type in enumerate(['Real World', 'Manufacturer Provided']):
                    median_value = melted_df[
                        (melted_df[degalai_col] == fuel_type) & (melted_df['Fuel Consumption Type'] == consumption_type)
                    ]['Fuel Consumption (L/100km)'].median()

                    if not pd.isna(median_value):
                        plt.text(
                            x=i + (j - 0.5) * 0.2,  # Adjust x position slightly for each type
                            y=median_value,  # Center the annotation vertically
                            s=f'{median_value:.1f}',
                            ha='center',
                            va='center',
                            fontsize=14,
                            fontweight='bold',
                            color='black',
                            bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', boxstyle='round,pad=0.3')
                        )

            plt.legend(title='Consumption Type', loc='upper right', fontsize=14, title_fontsize=16)
            plt.tight_layout()

            # Save the plot
            output_file = os.path.join(analysis_folder, f"{vehicle_type}_fuel_comparison_boxplot.png")
            plt.savefig(output_file, dpi=300)
            plt.close()

# Example usage
plot_combined_boxplot_with_annotations(
    df,
    result_path=result_path + 'eu_analysis',
    transporto_priemones_tipas_col='transporto_priemones_tipas',
    degalai_col='degalai',
    column_org='kuro_sunaudojimas_l100km_org',
    column_updated='kuro_sunaudojimas_l100km',
    folder_name='fuel_comparison_boxplots_with_annotations'
)


In [15]:
df

Unnamed: 0,OBFCM_ataskaitinis_laikotarpis,viso_sunaudota_kuro_lt,viso_nuvaziuota_km,viso_nuvaziuota_variklis_isjungtas_km,viso_nuvaziuota_variklis_veikia_km,viso_nuvaziuota_baterija_papildomai_kraunama_km,viso_sunaudota_kuro_krovejo_veikimu_lt,viso_sunaudota_kuro_baterijos_krovimas_lt,viso_is_tinklo_ikrauta_kwh,salis,...,kuro_sunaudojimas_l100km_org,transporto_priemones_tipas,transporto_priemones_paaiskinimas,yra_hibridas,energijos_sanaudos_wh_km_hybrid,cleaned_marke,cleaned_modelis,eu_cars,eu_vans,Mf (kg)
0,2022,21.65,237.5,,,,,,,BE,...,9.115789,K6,Krovininiai automobiliai,False,,ford,transit custom,0,1,2091.0
1,2022,448.58,5709.9,,,,,,,PL,...,7.856180,K6,Krovininiai automobiliai,False,,ford,transit,0,1,2173.0
2,2022,762.97,6173.9,,,,,,,DE,...,12.357991,K6,Krovininiai automobiliai,False,,ford,ranger,0,1,2561.0
3,2022,543.91,4452.5,,,,,,,DE,...,12.215834,K6,Krovininiai automobiliai,False,,ford,ranger,0,1,2382.0
4,2022,945.94,8404.8,,,,,,,DE,...,11.254759,K6,Krovininiai automobiliai,False,,ford,ranger,0,1,2384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1867259,2022,3.65,39.0,,,,,,,DK,...,9.358974,K2,Lengvieji automobiliai,False,,volkswagen,touran,0,1,
1867260,2022,5886.31,80807.4,,,,,,,DK,...,7.284370,K2,Lengvieji automobiliai,False,,volkswagen,touran,0,1,
1867261,2022,1563.19,25677.0,,,,,,,DK,...,6.087900,K2,Lengvieji automobiliai,False,,ford,kuga,0,1,1735.0
1867262,2022,2375.19,38988.4,,,,,,,DK,...,6.092043,K2,Lengvieji automobiliai,False,,ford,kuga,0,1,1576.0


In [21]:
# Plot the distribution of fuel consumption (L/100km) for different vehicle types and fuel types after outliers deletion with IQR 1.5.
plot_fuel_consumption_distribution(df, result_path + 'eu_analysis/', 'kuro_sunaudojimas_l100km_org', 'transporto_priemones_tipas', 'degalai','fuel_consumption_analysis_without_outliers')

In [22]:
import os
import pandas as pd

def save_describe_statistics(df, result_path, describe_columns, transporto_priemones_tipas_col, degalai_col):
    """
    Generates and saves descriptive statistics for the specified columns for each combination
    of vehicle type and fuel type.

    :param df: DataFrame containing the data.
    :param result_path: Directory to save the results.
    :param describe_columns: List of columns to calculate descriptive statistics for.
    :param transporto_priemones_tipas_col: Column with vehicle type information.
    :param degalai_col: Column with fuel type information.
    """

    # Create the "describe_analysis" folder if it doesn't exist
    describe_folder = os.path.join(result_path, 'describe_analysis')
    os.makedirs(describe_folder, exist_ok=True)

    # Unique combinations of vehicle types and fuel types
    unique_types = df[transporto_priemones_tipas_col].unique()
    unique_fuel_types = df[degalai_col].unique()

    # Iterate over each vehicle type and fuel type
    for transporto_tipas in unique_types:
        type_df = df[df[transporto_priemones_tipas_col] == transporto_tipas]

        for degalai in unique_fuel_types:
            filtered_df = type_df[type_df[degalai_col] == degalai]

            # Check if there is data to describe
            if len(filtered_df) > 0:
                # Generate descriptive statistics for the specified columns
                describe_stats = filtered_df[describe_columns].describe()

                # Save the statistics to a CSV file
                output_file = os.path.join(describe_folder, f"{transporto_tipas}_{degalai}_describe.csv")
                describe_stats.to_csv(output_file)

    print(f"Descriptive statistics saved in {describe_folder}")

# Example usage
columns_to_describe = ['kuro_sunaudojimas_l100km', 'kuro_sunaudojimas_l100km_org', 'variklio_turis_cm3', 'galia_kw', 'nuosava_mase_kg', 'maksimali_mase_kg', 'pagaminimo_metai']
save_describe_statistics(
    df,
    result_path + 'eu_analysis/',
    columns_to_describe,
    'transporto_priemones_tipas',
    'degalai'
)


Descriptive statistics saved in /content/drive/MyDrive/Master Paper/Results/eu_analysis/describe_analysis


In [45]:
# Updated function to include data percentage in the histogram titles for Benzinas and Dyzelinas
def plot_km_distribution_with_percentage(df, result_path, km_column, transporto_priemones_tipas_col, degalai_col):
    """
    Creates histograms for the 'viso_nuvaziuota_km' column based on vehicle type and fuel type.
    For Benzinas and Dyzelinas, it generates three separate plots for different kilometer ranges (0-100, 100-10,000, 10,000-100,000 km).
    The title of each plot includes the percentage of data within the specified range.
    Saves the plots as image files.

    :param df: DataFrame containing the data.
    :param result_path: Directory to save the resulting plots.
    :param km_column: Column containing the kilometers driven.
    :param transporto_priemones_tipas_col: Column indicating the vehicle type.
    :param degalai_col: Column indicating the fuel type.
    """

    # Create the "km_distribution_analysis" folder if it does not exist
    analysis_folder = os.path.join(result_path, 'km_distribution_analysis')
    os.makedirs(analysis_folder, exist_ok=True)

    # Unique combinations of vehicle types and fuel types
    unique_types = df[transporto_priemones_tipas_col].unique()
    unique_fuel_types = df[degalai_col].unique()

    # Iterate over each vehicle type
    for transporto_tipas in unique_types:
        # Filter data by vehicle type
        type_df = df[df[transporto_priemones_tipas_col] == transporto_tipas]

        for degalai in unique_fuel_types:
            # Filter data by vehicle type and fuel type
            filtered_df = type_df[type_df[degalai_col] == degalai]

            # Check if there is enough data
            if len(filtered_df) > 0:
                total_data_points = len(filtered_df)

                if degalai in ['Petrol1', 'Diesel1']:
                    ranges = [(0, 100), (100, 10000), (10000, 100000)]
                    for r_min, r_max in ranges:
                        # Filter data within the current range
                        range_df = filtered_df[(filtered_df[km_column] >= r_min) & (filtered_df[km_column] <= r_max)]
                        range_data_points = len(range_df)

                        # Calculate the percentage of data in this range
                        data_percentage = (range_data_points / total_data_points) * 100

                        # Plot the histogram
                        plt.figure(figsize=(10, 6))
                        sns.histplot(range_df[km_column].dropna(), kde=True, bins=100)
                        plt.xlabel('Kilometers Driven')
                        plt.ylabel('Frequency')

                        # Apply x-axis limit for the current range
                        plt.xlim(r_min, r_max)

                        # Set y-axis to normal number formatting (not scientific)
                        plt.ticklabel_format(style='plain', axis='y')

                        plt.tight_layout()

                        # Save the histogram
                        output_file = os.path.join(analysis_folder, f"{transporto_tipas}_{degalai}_km_distribution_{r_min}_to_{r_max}.png")
                        plt.savefig(output_file, dpi=300)
                        plt.close()

                # For other fuel types, create a general plot
                plt.figure(figsize=(10, 6))
                sns.histplot(filtered_df[km_column].dropna(), kde=True, bins=100)
                plt.xlabel('Kilometers Driven')
                plt.ylabel('Frequency')
                plt.tight_layout()

                # Save the histogram
                output_file = os.path.join(analysis_folder, f"{transporto_tipas}_{degalai}_km_distribution.png")
                plt.savefig(output_file, dpi=300)
                plt.close()

plot_km_distribution_with_percentage(df, result_path + 'eu_analysis/', 'viso_nuvaziuota_km', 'transporto_priemones_tipas', 'degalai')


In [40]:
# Calculating the number of unique combinations of 'cleaned_marke' and 'cleaned_modelis'
unique_combinations_count = df[['cleaned_marke', 'cleaned_modelis']].drop_duplicates().shape[0]

print(f"Number of unique combinations of 'cleaned_marke' and 'cleaned_modelis' {unique_combinations_count}")


Number of unique combinations of 'cleaned_marke' and 'cleaned_modelis' 1091


In [24]:
# Mapping for features to proper English names
feature_mapping = {
    'kuro_sunaudojimas_l100km': 'Manufacturer Fuel Consumption (L/100km)',
    'kuro_sunaudojimas_l100km_org': 'Real World Fuel Consumption (L/100km)',
    'variklio_turis_cm3': 'Engine Displacement (cm³)',
    'galia_kw': 'Power (kW)',
    'nuosava_mase_kg': 'Actual Weight (kg)',
    'maksimali_mase_kg': 'Maximum Weight (kg)',
    'pagaminimo_metai': 'Year of Manufacture'
}

def calculate_and_plot_correlation(df, result_path, transporto_priemones_tipas_col, degalai_col, features, feature_mapping):
    """
    Calculates Spearman correlation by vehicle type and fuel type.
    Creates a heatmap for each combination and saves it as an image.

    :param df: DataFrame containing the data.
    :param result_path: Directory to save the results.
    :param transporto_priemones_tipas_col: Column indicating the vehicle type (e.g., K2, K6).
    :param degalai_col: Column indicating the fuel type (e.g., Benzinas, Dyzelinas).
    :param features: List of variables to include in the correlation calculations.
    :param feature_mapping: Dictionary mapping feature names to proper English names for plotting.
    """

    # Create the folder "correlation_analysis" if it does not already exist
    analysis_folder = os.path.join(result_path, 'correlation_analysis_after_outliers')
    os.makedirs(analysis_folder, exist_ok=True)

    # Unique combinations of vehicle types and fuel types
    unique_types = df[transporto_priemones_tipas_col].unique()
    unique_fuel_types = df[degalai_col].unique()

    # Iterate through each vehicle type
    for transporto_tipas in unique_types:
        # Filter the data by vehicle type
        type_df = df[df[transporto_priemones_tipas_col] == transporto_tipas]
        total_rows_type = len(type_df)

        for degalai in unique_fuel_types:
            # Filter the data by vehicle type and fuel type
            filtered_df = type_df[type_df[degalai_col] == degalai]

            # Check if there is sufficient data
            total_rows_fuel = len(filtered_df)
            if total_rows_fuel > 0:
                # Filter columns with at least one value
                valid_columns = [col for col in features if filtered_df[col].notna().sum() > 0]
                if len(valid_columns) > 1:
                    # Calculate the correlation matrix using the Spearman method
                    corr_matrix = filtered_df[valid_columns].corr(method='spearman')

                    # Rename columns and index for the heatmap
                    corr_matrix.rename(columns=feature_mapping, index=feature_mapping, inplace=True)

                    # Create the heatmap
                    plt.figure(figsize=(10, 8))
                    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt=".2f")
                    percentage = (total_rows_fuel / total_rows_type) * 100
                    plt.tight_layout()

                    # Save the heatmap as an image in the "correlation_analysis" folder
                    output_file = os.path.join(analysis_folder, f"{transporto_tipas}_{degalai}_correlation_heatmap_{total_rows_fuel}_({percentage:.2f}%).png")
                    plt.savefig(output_file, dpi=300)
                    plt.close()

# Example usage
features = ['kuro_sunaudojimas_l100km', 'kuro_sunaudojimas_l100km_org', 'variklio_turis_cm3', 'galia_kw', 'nuosava_mase_kg', 'maksimali_mase_kg', 'pagaminimo_metai']

calculate_and_plot_correlation(df, result_path + 'eu_analysis/', 'transporto_priemones_tipas', 'degalai', features, feature_mapping)


In [95]:
# Generate the summary statistics
summary = df[['kuro_sunaudojimas_l100km', 'kuro_sunaudojimas_l100km_org', 'variklio_turis_cm3', 'galia_kw', 'nuosava_mase_kg', 'maksimali_mase_kg', 'pagaminimo_metai']].describe()

# Convert to a DataFrame with formatted strings
formatted_summary = summary.astype(float).apply(lambda x: x.map('{:.2f}'.format))
print(formatted_summary)


      kuro_sunaudojimas_l100km kuro_sunaudojimas_l100km_org  \
count               1690667.00                   1865741.00   
mean                      5.58                         7.81   
std                       1.90                         2.52   
min                       0.60                         3.00   
25%                       4.90                         6.20   
50%                       5.60                         7.26   
75%                       6.50                         8.71   
max                      16.40                        43.48   

      variklio_turis_cm3    galia_kw nuosava_mase_kg maksimali_mase_kg  \
count         1865731.00  1858647.00      1761576.00        1865413.00   
mean             1672.86      114.62         1563.15           1726.30   
std               528.36       49.78          340.22            389.99   
min               875.00       44.00          915.00            992.00   
25%              1332.00       81.00         1315.00          

In [96]:
df.degalai.value_counts()

Unnamed: 0_level_0,count
degalai,Unnamed: 1_level_1
Benzinas,1025573
Dyzelinas,575492
Benzinas_H,157445
Dyzelinas_H,74942
Etanolis,19544
Suskystintos naftos dujos,11144
Biometanas,1430
Gamtines dujos,168
Elektra,3


In [97]:
cars_data = df[df['eu_cars'] == 1].copy()
vans_data = df[df['eu_vans'] == 1].copy()

# Update 'degalai' values in cars_data
cars_data['degalai'] = cars_data['degalai'].replace({
    'Dyzelinas_H': 'Dyzelinas',
    'Benzinas_H': 'Benzinas'
})

# Update 'degalai' values in vans_data
vans_data['degalai'] = vans_data['degalai'].replace({
    'Dyzelinas_H': 'Dyzelinas',
    'Benzinas_H': 'Benzinas'
})

comparison_summary = create_comparison_summary(cars_data, result_path + 'eu_analysis/', 'Cars')
comparison_summary = create_comparison_summary(vans_data, result_path + 'eu_analysis/', 'Vans')


In [19]:
# Save the combined dataframe to a CSV file
df.to_csv(main_path + 'eu_total_final_full.csv', index=False)

In [18]:
# Save the combined dataframe to a CSV file just Dyzelinas and Benzinas vehicles
df1 = df[df['degalai'].isin(['Dyzelinas', 'Benzinas'])]
df1.to_csv(main_path + 'eu_total_final.csv', index=False)

In [25]:
# Final dataset size
df1.shape

(1602327, 41)

In [28]:
# Describe information for each fuel type and vehicle type on milleage data
# Group by 'degalai' and 'transporto_priemones_tipas' and calculate descriptive statistics
grouped_stats = df1.groupby(['degalai', 'transporto_priemones_tipas'])['viso_nuvaziuota_km'].describe()

# Display the grouped statistics
print(grouped_stats)



                                          count          mean           std  \
degalai   transporto_priemones_tipas                                          
Benzinas  K2                          1022762.0  11905.279039   9046.421912   
          K6                             3739.0   3162.424285   4395.457644   
Dyzelinas K2                           478057.0  22612.608946  15114.210421   
          K6                            97769.0   6961.377586   8726.102903   

                                      min      25%      50%        75%  \
degalai   transporto_priemones_tipas                                     
Benzinas  K2                          5.0   4820.2   9930.9  16772.775   
          K6                          5.0      9.3    631.7   5294.000   
Dyzelinas K2                          5.8  10930.3  21027.0  30636.000   
          K6                          5.0     16.0   2838.8  12050.100   

                                          max  
degalai   transporto_priemones_t