Script file main.py

In [None]:
import streamlit as st
import pandas as pd
from modules.data_processing import *

# Set the page title
st.set_page_config(page_title="Global Temperature Data Analysis", layout="wide")


# Introduction to the project
st.title("Welcome to the Global Temperature Data Analysis Project")
st.markdown("""
    This project allows you to explore temperature data for various global regions, including cities, countries, and states.
    The data spans multiple years, and you can filter the data based on the year and region of interest.
""")

# Sidebar for dataset selection
dataset_option = st.sidebar.radio(
    "Select a dataset to explore:",
    options=["major_city", "state", "country", "global_temp_country", "city"]
)

# Load the selected dataset
st.write(f"Displaying data from the **{dataset_option}** dataset")
data = load_data(dataset_option)

# load_data returns None for an unknown table name, so the check must happen
# before any DataFrame method is called on the result.
if data is not None:
    data = data.dropna()  # Drop rows with missing values
    data = convert_to_datetype(data, 'dt')

    # Write the header before creating the columns so that it is rendered
    # above the preview instead of below it.
    st.write("### Dataset Preview:")
    col1, col2 = st.columns([3, 1])
    with col1:
        st.write(data)
    with col2:
        st.write(data.dtypes)

    # Get the year range for the dataset
    min_year, max_year = get_year_range(data, 'dt')

    # Range slider: selected_year is a (first_year, last_year) tuple
    selected_year = year_slider(min_year, max_year)
    st.write(f"### Data for the year {selected_year}:")

    # Keep only the rows whose year falls inside the selected range
    data_filtered_by_year = filter_data_by_year(data, 'dt', selected_year[0], selected_year[1])

    # Show the filtered data
    st.write(data_filtered_by_year)

else:
    st.write("### Error: Unable to load the selected dataset.")

Function that loads a dataset based on the selected table name.
It is used in 01_table.py, 02_map.py and 03_route.py.

In [None]:
def load_data(table_name):
    """Read the CSV file that belongs to ``table_name``.

    Parameters:
    - table_name: One of "country", "city", "major_city", "state" or
      "global_temp_country".

    Returns:
    - The DataFrame read from the matching CSV file, or None when the
      table name is not one of the known keys.
    """
    csv_by_table = (
        ("country", './dataset/GlobalLandTemperaturesByCountry.csv'),
        ("city", './dataset/GlobalLandTemperaturesByCity.csv'),
        ("major_city", './dataset/GlobalLandTemperaturesByMajorCity.csv'),
        ("state", './dataset/GlobalLandTemperaturesByState.csv'),
        ("global_temp_country", './dataset/GlobalTemperatures.csv'),
    )
    for known_name, csv_path in csv_by_table:
        if table_name == known_name:
            return pd.read_csv(csv_path)
    return None

Function to convert a column to datetime

In [None]:
def convert_to_datetype(df, column_name):
    """Parse ``df[column_name]`` as datetimes, in place, and return ``df``.

    Values that cannot be parsed become NaT (``errors='coerce'``).
    """
    parsed_dates = pd.to_datetime(df[column_name], errors='coerce')
    df[column_name] = parsed_dates
    return df


    Get the range of years from a date column.
    
    Parameters:
    - df: DataFrame containing the data.
    - date_column: The name of the column containing the dates.
    
    Returns:
    - min_year: The first (minimum) year in the dataset.
    - max_year: The last (maximum) year in the dataset, i.e. the most recent one.


In [None]:
def get_year_range(df, date_column):
    """Return ``(min_year, max_year)`` of the datetime column ``date_column``."""
    dates = df[date_column]
    return dates.min().year, dates.max().year


    Returns the list of places the user selected in the multiselect.
    If the user leaves the selection empty, the default places are returned instead.

    Parameters:
    - data: DataFrame containing the place data.
    - colum_name: Name of the column whose unique values are offered as options.
    - default_place: List of the places selected by default ('New York' and 'Los Angeles' unless overridden).

    Returns:
    - place_selected: List of the places selected by the user.

In [None]:
def multieselector_place(data, colum_name, default_place=None):
    """Show a multiselect of the places in ``data[colum_name]`` and return the selection.

    Parameters:
    - data: DataFrame containing the place data.
    - colum_name: Name of the column whose unique values are offered as options.
    - default_place: Places to pre-select. When omitted, 'New York' and
      'Los Angeles' are used.

    Returns:
    - place_selected: List of the selected places. If the user clears the
      selection, a warning is shown and the defaults that exist in the data
      are returned instead.
    """
    # A fresh list per call: a list literal in the signature would be shared
    # between calls and handed back to the caller.
    if default_place is None:
        default_place = ['New York', 'Los Angeles']

    options = data[colum_name].unique()

    # st.multiselect raises when a default is not among the options, so only
    # pre-select the defaults that are actually present in this dataset.
    available = set(options)
    valid_defaults = [place for place in default_place if place in available]

    place_selected = st.multiselect(
        "Seleziona le città",
        options=options,
        default=valid_defaults
    )

    # Fall back to the defaults when nothing is selected
    if len(place_selected) == 0:
        st.warning("Per favore, seleziona almeno una città.")
        place_selected = valid_defaults

    return place_selected

    Filters the data based on a selected year range.
    
    Parameters:
    - df: DataFrame containing the data.
    - date_column: The name of the column containing the dates.
    - min_year: The minimum year of the range.
    - max_year: The maximum year of the range.
    
    Returns:
    - df_filtered: The filtered DataFrame based on the selected year range.

In [None]:
def filter_data_by_year(df, date_column, min_year, max_year):
    """Return the rows of ``df`` whose year lies in ``[min_year, max_year]``.

    If ``date_column`` is not a datetime column yet it is converted in place
    on ``df`` first (unparseable values become NaT and are filtered out).
    """
    already_datetime = pd.api.types.is_datetime64_any_dtype(df[date_column])
    if not already_datetime:
        df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    years = df[date_column].dt.year
    in_range = (years >= min_year) & (years <= max_year)
    return df[in_range]

In [None]:
def descriptive_stats(data, temp_column):
    """
    Calculates descriptive statistics for a temperature column in a DataFrame.

    Parameters:
    - data: DataFrame containing the data.
    - temp_column: The name of the column containing the temperatures.

    Returns:
    - stats: Dictionary with the keys "Min", "Max", "Range", "Mean", "Q1",
      "Median", "Q3", "Std Dev", "IQR", "Number of Observations", "Variance"
      and "Coefficient of Variation" (in that order). Missing values are
      ignored. When there are no observations every statistic is NaN and the
      count is 0. The coefficient of variation is NaN when the mean is 0.
    """
    # Remove NaN values
    temperatures = data[temp_column].dropna()
    observations = len(temperatures)

    if observations == 0:
        # Nothing to summarise: report NaN instead of failing or emitting
        # empty-slice warnings.
        nan = float("nan")
        return {
            "Min": nan,
            "Max": nan,
            "Range": nan,
            "Mean": nan,
            "Q1": nan,
            "Median": nan,
            "Q3": nan,
            "Std Dev": nan,
            "IQR": nan,
            "Number of Observations": 0,
            "Variance": nan,
            "Coefficient of Variation": nan,
        }

    # Compute every quantity once and reuse it for the derived statistics.
    lowest = temperatures.min()
    highest = temperatures.max()
    mean = temperatures.mean()
    q1, q3 = np.percentile(temperatures, [25, 75])
    # Sample statistics (ddof=1), which is what pandas uses by default.
    std_dev = temperatures.std()
    variance = temperatures.var()

    # The coefficient of variation is undefined for a zero mean.
    coefficient_of_variation = std_dev / mean if mean != 0 else float("nan")

    stats = {
        "Min": lowest,
        "Max": highest,
        "Range": highest - lowest,
        "Mean": mean,
        "Q1": q1,
        "Median": np.median(temperatures),
        "Q3": q3,
        "Std Dev": std_dev,
        "IQR": q3 - q1,
        "Number of Observations": observations,
        "Variance": variance,
        "Coefficient of Variation": coefficient_of_variation,
    }

    return stats

In [None]:
def generate_stats_df(filtered_data, place_selected, column_name, temp_column='AverageTemperature'):
    """
    Generates a DataFrame with descriptive statistics for the selected places.

    Parameters:
    - filtered_data: The DataFrame containing the filtered data.
    - place_selected: The list of places (cities, countries, etc.) for which the statistics are generated.
    - column_name: The column name that identifies the places (e.g., 'City').
    - temp_column: The column name containing the temperature data (default 'AverageTemperature').

    Returns:
    - stats_df: One row of statistics per selected place, indexed by
      `column_name`. An empty selection yields an empty DataFrame whose
      index is named `column_name`.
    """
    stats_rows = []
    for place in place_selected:
        # Rows that belong to this place
        place_data = filtered_data[filtered_data[column_name] == place]

        # Statistics dictionary for the place, tagged with the place name so
        # that it can become the index below
        place_stats = descriptive_stats(place_data, temp_column)
        place_stats[column_name] = place
        stats_rows.append(place_stats)

    if not stats_rows:
        # With no rows the frame has no `column_name` column and set_index
        # would raise KeyError.
        return pd.DataFrame(index=pd.Index([], name=column_name))

    # Use the place name (e.g. 'City') as the index
    return pd.DataFrame(stats_rows).set_index(column_name)

In [None]:
def year_slider(min_year, max_year):
    """
    Shows a range slider over the years and returns the chosen range.

    Parameters:
    - min_year: Lower bound of the slider.
    - max_year: Upper bound of the slider.

    Returns:
    - The value returned by st.slider; the initial value is the full
      (min_year, max_year) range and the step is one year.
    """
    full_range = (min_year, max_year)
    return st.slider(
        "Select a range of years",
        min_value=min_year,
        max_value=max_year,
        value=full_range,
        step=1,
    )

In [None]:
def filter_data_by_year(df, date_column, min_year, max_year):
    """
    Keeps the rows whose year is between min_year and max_year (both inclusive).

    Parameters:
    - df: DataFrame containing the data.
    - date_column: The name of the column containing the dates. If it is not
      a datetime column it is converted in place on `df`; values that cannot
      be parsed become NaT.
    - min_year: First year of the range.
    - max_year: Last year of the range.

    Returns:
    - The filtered DataFrame.
    """
    if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
        df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    row_years = df[date_column].dt.year
    not_too_early = row_years >= min_year
    not_too_late = row_years <= max_year
    return df[not_too_early & not_too_late]

In [None]:
def display_chart(filtered_data, place_selected, place_column='City', temp_column='AverageTemperature'):
    """
    Lets the user choose between a line chart and a histogram and renders it.

    Parameters:
    - filtered_data: DataFrame containing the filtered data.
    - place_selected: List of selected places; used by the line chart only
      (the histogram is computed over every place in `filtered_data`).
    - place_column: The column containing the geographic entities (default 'City').
    - temp_column: The column containing the temperature data (default 'AverageTemperature').
    """
    # Radio button to select the type of chart, laid out horizontally
    chart_type = st.radio(
        "Select the chart type:",
        options=["Line Chart", "Histogram"],
        horizontal=True
    )

    if chart_type == "Line Chart":
        # A distinct local name keeps the `plt` pyplot alias unshadowed.
        line_chart = plot_temperature_trends(filtered_data, place_selected, place_column, temp_column)
        st.pyplot(line_chart)

    elif chart_type == "Histogram":
        # Pass by keyword: the third positional parameter of
        # plot_temperature_range_histogram is the period column, so passing
        # temp_column positionally would make it treat temperatures as dates.
        fig = plot_temperature_range_histogram(
            filtered_data,
            places_column=place_column,
            temp_column=temp_column,
        )
        st.pyplot(fig)


In [None]:

def plot_temperature_trends(data, places_selected, place_column='City', temp_column='AverageTemperature',
                            date_column='dt'):
    """
    Plots one line per selected place showing its yearly mean temperature.

    Parameters:
    - data: DataFrame containing the filtered data.
    - places_selected: The list of selected entities (cities, countries, states, etc.).
    - place_column: The name of the column containing the geographic entities (default 'City').
    - temp_column: The name of the column containing the temperatures (default 'AverageTemperature').
    - date_column: The name of the datetime column (default 'dt').

    Returns:
    - plt: The matplotlib.pyplot module, with the new figure as its current figure.
    """
    plt.figure(figsize=(12, 6))

    for place in places_selected:
        place_data = data[data[place_column] == place]

        # Group by the year directly instead of adding a 'Year' column to the
        # filtered slice, which triggers pandas' SettingWithCopyWarning.
        years = place_data[date_column].dt.year
        annual_temp = place_data[temp_column].groupby(years).mean()

        # One line per place, without markers
        plt.plot(annual_temp.index, annual_temp.values,
                 label=place, linewidth=2)

    # Title, labels and layout
    plt.title(f"Temperature Trends for the Selected {place_column.capitalize()}s", fontsize=16, fontweight='bold')
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Average Temperature (°C)', fontsize=12)
    plt.legend(title=place_column.capitalize(), fontsize=10)
    plt.grid(True, linestyle='--', alpha=0.7)  # Light grid for better visibility
    plt.xticks(rotation=45)
    plt.tight_layout()
    return plt




In [None]:
def plot_temperature_range_histogram(data, places_column='City', period_column='dt', temp_column='AverageTemperature'):
    """
    Draws a bar chart of the (up to) 10 places with the largest temperature range.

    For every place and year the range is max - min of `temp_column`; a
    place is ranked by the largest of its yearly ranges.

    Parameters:
    - data: DataFrame containing the data of entities with temperature information.
      It is not modified.
    - places_column: The column that represents the geographic entity (default 'City').
    - period_column: The column that represents the period (default 'dt').
      Non-datetime values are parsed with errors='coerce'.
    - temp_column: The column containing temperature data (default 'AverageTemperature').

    Returns:
    - fig: The figure of the bar chart.
    """
    # Work on a local datetime Series so the caller's frame stays untouched
    periods = data[period_column]
    if not pd.api.types.is_datetime64_any_dtype(periods):
        periods = pd.to_datetime(periods, errors='coerce')

    # One pass over the data: max and min per (place, year). Rows whose place
    # or year is missing are left out by groupby.
    yearly_extremes = (
        data[temp_column]
        .groupby([data[places_column], periods.dt.year])
        .agg(['max', 'min'])
    )
    yearly_range = yearly_extremes['max'] - yearly_extremes['min']

    # Largest yearly range of each place, biggest first
    largest_range = yearly_range.groupby(level=0).max()
    top_places = largest_range.sort_values(ascending=False).head(10)
    top_n = len(top_places)  # fewer than 10 when fewer places are available

    # Create the bar chart
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(top_places.index.tolist(), top_places.values, color='skyblue')

    # Add title and labels
    ax.set_title(f'Top {top_n} {places_column.capitalize()}s with the Largest Temperature Range', fontsize=16)
    ax.set_xlabel(places_column.capitalize(), fontsize=12)
    ax.set_ylabel('Temperature Range (°C)', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)

    return fig  # Returns the figure to be displayed in Streamlit


In [None]:
import streamlit as st
import pandas as pd
from modules.data_processing import *
from modules.visualization import *
from modules.operations import *

# Set the page title
st.set_page_config(page_title="Dataset Table", layout="wide")
st.title("TABLE OF GLOBAL TEMPERATURE DATA")

# Per-dataset settings: the column that identifies a place and the places
# selected by default. The city tables are grouped by 'City', the state and
# country tables by 'Country'.
PLACE_SETTINGS = {
    "major_city": ('City', ['New York', 'Los Angeles']),
    "city": ('City', ['New York', 'Los Angeles']),
    "state": ('Country', ['United States', 'Canada']),
    "country": ('Country', ['United States', 'Canada']),
}

# Sidebar selector for the dataset to display
dataset_option = st.sidebar.radio(
    "Seleziona un dataset da visualizzare",
    options=["major_city", "state", "country", "city"]
)

# Load the dataset
data = load_data(dataset_option)

if dataset_option in PLACE_SETTINGS and data is not None:
    place_column, default_places = PLACE_SETTINGS[dataset_option]
    st.write(f"Dataset: {dataset_option}")

    # Convert the 'dt' column to datetime
    data = convert_to_datetype(data, 'dt')

    # Range of years available in the dataset
    min_year, max_year = get_year_range(data, 'dt')

    # Places chosen by the user (falls back to the defaults)
    places_selected = multieselector_place(data, place_column, default_place=list(default_places))

    # The statistics table covers the full year range of the dataset; only
    # the chart below follows the year slider.
    stats_source = filter_data_by_year(data, 'dt', min_year, max_year)
    stats_source = stats_source[stats_source[place_column].isin(places_selected)]

    # Descriptive statistics for the selected places, indexed by place name
    stats_df = generate_stats_df(stats_source, places_selected, place_column, 'AverageTemperature')
    st.write("Statistiche Descrittive delle Temperature per le Città Selezionate:")
    st.dataframe(stats_df)

    # Range slider to select an interval of years
    selected_year_range = year_slider(min_year, max_year)

    # Filter the data by the selected interval of years
    filtered_data = filter_data_by_year(data, 'dt', selected_year_range[0], selected_year_range[1])

    # Draw the chart chosen by the user for the selected places
    display_chart(filtered_data, place_selected=places_selected, place_column=place_column, temp_column='AverageTemperature')

else:
    st.write("Dataset non valido o assente.")


MAP PART
The functions below are the ones used to build and display the map.

In [None]:
def load_data(table_name):
    """Load the CSV for ``table_name``; return None for an unknown name."""
    if table_name == "country":
        csv_path = './dataset/GlobalLandTemperaturesByCountry.csv'
    elif table_name == "city":
        csv_path = './dataset/GlobalLandTemperaturesByCity.csv'
    elif table_name == "major_city":
        csv_path = './dataset/GlobalLandTemperaturesByMajorCity.csv'
    elif table_name == "state":
        csv_path = './dataset/GlobalLandTemperaturesByState.csv'
    elif table_name == "global_temp_country":
        csv_path = './dataset/GlobalTemperatures.csv'
    else:
        return None
    # Single read site: every known table is a plain CSV file
    return pd.read_csv(csv_path)

In [None]:
def convert_to_datetype(df, column_name):
    """Replace ``df[column_name]`` with its datetime version and return ``df``.

    The conversion uses ``errors='coerce'``, so unparseable values become NaT.
    The frame is modified in place.
    """
    raw_values = df[column_name]
    df[column_name] = pd.to_datetime(raw_values, errors='coerce')
    return df

In [None]:
def get_year_range(df, date_column):
    """Return the years of the earliest and latest date in ``df[date_column]``."""
    earliest = df[date_column].min()
    latest = df[date_column].max()
    return (earliest.year, latest.year)

In [None]:
def year_monoslider(min_year=1760, max_year=2011):
    """Show a single-value year slider and return the selected year.

    Parameters:
    - min_year: Lower bound of the slider (default 1760).
    - max_year: Upper bound of the slider (default 2011).

    Returns:
    - selected_year: The year chosen by the user. The initial value is 2002,
      moved to the nearest bound when 2002 is outside the given range.
    """
    # st.slider rejects an initial value outside [min_value, max_value], so
    # clamp the preferred default into the requested range.
    default_year = min(max(2002, min_year), max_year)
    selected_year = st.slider(
        "Select a year",
        min_value=min_year,
        max_value=max_year,
        value=default_year
    )
    return selected_year

In [None]:
def filter_data_by_oneyear(df, date_column, year):
    """Return the rows of ``df`` whose ``date_column`` falls in ``year``."""
    year_matches = df[date_column].dt.year == year
    return df[year_matches]

In [None]:
def add_average_annual_temperature(df, date_column, temperature_column, temperature_column_name="Average_annual_temperature"):
    """Attach each city's mean yearly temperature to every one of its rows.

    Parameters:
    - df: DataFrame with a 'City' column, a datetime `date_column` and
      `temperature_column`. It is not modified.
    - date_column: Name of the datetime column the year is taken from.
    - temperature_column: Name of the column to average.
    - temperature_column_name: Name of the added column
      (default "Average_annual_temperature").

    Returns:
    - A new DataFrame with the rows of `df` ordered by (City, year) group,
      plus the `temperature_column_name` column.
    """
    # Work on a copy: writing the helper 'Year' column into `df` itself would
    # leak it into the caller's frame (and warns when `df` is a slice).
    with_year = df.copy()
    with_year['Year'] = with_year[date_column].dt.year

    # Mean temperature per city and year
    avg_temp_per_city = (
        with_year.groupby(['City', 'Year'])[temperature_column]
        .mean()
        .reset_index()
        .rename(columns={temperature_column: temperature_column_name})
    )

    # Merge the averages back onto the rows, then drop the helper column
    merged = with_year.merge(avg_temp_per_city, on=['City', 'Year'], how='right')
    return merged.drop(columns=['Year'])

In [None]:
def convert_coordinate(coord):
    """
    Converts a coordinate with a direction (N/S/E/W) to a numeric format.

    Parameters:
    - coord: The coordinate, either a string that may end with a direction
      letter in any case (e.g., '40.7128N', '74.0060W', '12.5') or a number.
      Surrounding whitespace is ignored.

    Returns:
    - A float value representing the numeric coordinate, with negative values for South and West.

    Raises:
    - ValueError: if the numeric part cannot be parsed.
    """
    # Already numeric: nothing to strip
    if isinstance(coord, (int, float)):
        return float(coord)

    text = str(coord).strip()
    # Only the last character is the direction; checking for the letter
    # anywhere in the string would misread values such as '1E5'.
    direction = text[-1:].upper()
    if direction in ('N', 'E'):
        return float(text[:-1])
    if direction in ('S', 'W'):
        return -float(text[:-1])
    return float(text)  # No direction suffix

In [None]:
def add_color_column_with_hex(df, temperature_column='Average_annual_temperature', colormap_name="coolwarm"):
    """
    Adds a 'Color_hex' column that maps each temperature to a HEX color.

    Parameters:
    - df: The DataFrame to which the color column is added (modified in place).
    - temperature_column: The column that contains the temperature data (default 'Average_annual_temperature').
    - colormap_name: Name of the matplotlib colormap to use (default "coolwarm").

    Returns:
    - df: The same DataFrame with the added 'Color_hex' column.
    """
    cmap = plt.get_cmap(colormap_name)
    temperatures = df[temperature_column]
    # Scale temperatures so the column minimum and maximum span the colormap
    scale = plt.Normalize(temperatures.min(), temperatures.max())

    def temperature_to_hex(temp):
        # The colormap yields RGBA components; keep RGB as 0-255 integers
        red, green, blue = (int(channel * 255) for channel in cmap(scale(temp))[:3])
        return "#{:02x}{:02x}{:02x}".format(red, green, blue)

    df["Color_hex"] = temperatures.apply(temperature_to_hex)
    return df

In [None]:
def get_unique_city_data(df):
    """Return ``df`` with only the first row kept for each 'City' value."""
    return df.drop_duplicates(subset=['City'])

In [None]:
def create_map_with_markers(df):
    """Build a folium map with one colored circle marker per row of ``df``.

    Rows without an 'Average_annual_temperature' are skipped. The map is
    centered on the mean 'Latitude' / 'Longitude' of the remaining rows, and
    each marker is colored with the row's 'Color_hex' value.
    """
    # Some rows have no temperature available; they cannot be shown
    cities = df.dropna(subset=['Average_annual_temperature'])

    map_center = [cities['Latitude'].mean(), cities['Longitude'].mean()]
    city_map = folium.Map(location=map_center, zoom_start=2)

    for _, city in cities.iterrows():
        position = [city['Latitude'], city['Longitude']]
        marker_color = city['Color_hex']
        popup_html = f"""
        <b>City:</b> {city['City']}<br>
        <b>Country:</b> {city['Country']}<br>
        <b>Temperature:</b> {city['Average_annual_temperature']}°C<br>
        <b>Coordinates:</b> ({city['Latitude']}, {city['Longitude']})
        """
        marker = folium.CircleMarker(
            location=position,
            radius=8,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.8,
            popup=folium.Popup(popup_html, max_width=200)
        )
        marker.add_to(city_map)

    return city_map

def display_map(df):
    """Render the marker map built from ``df`` in the Streamlit page."""
    st_folium(create_map_with_markers(df), width=1200, height=800)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
from streamlit_folium import st_folium
from modules.data_processing import *
from modules.operations import *
from modules.visualization import *

# Page configuration and heading
st.set_page_config(page_title="Map", layout="wide")
st.title("MAP OF GLOBAL TEMPERATURE DATA")

# Let the user pick which city-level table to plot
table_choices = ["major_city", "city"]
selected_table = st.selectbox("Select a table", table_choices)

st.write(f"Displaying data from {selected_table} table")

# Load the table and parse its 'dt' column as datetimes
temperatures = convert_to_datetype(load_data(selected_table), "dt")

# The year slider spans the years present in the table
first_year, last_year = get_year_range(temperatures, "dt")
selected_year = year_monoslider(first_year, last_year)

# Keep the selected year only and attach each city's annual mean temperature
year_data = filter_data_by_oneyear(temperatures, "dt", selected_year)
year_data = add_average_annual_temperature(year_data, "dt", "AverageTemperature")

# Rows without coordinates cannot be placed on the map; the remaining
# coordinates are converted to signed floats
year_data = year_data.dropna(subset=['Latitude', 'Longitude'])
for axis in ('Latitude', 'Longitude'):
    year_data[axis] = year_data[axis].apply(convert_coordinate)

# Color every row by temperature, then keep one row per city
year_data = add_color_column_with_hex(year_data)
year_data = get_unique_city_data(year_data)

# Draw the map with the markers
display_map(year_data)

# Disclaimer note
st.write(":red[DISCLAIMER: The coordinates are not accurate.]")


In [None]:
def load_data(table_name):
    """Return the DataFrame stored in the CSV file of ``table_name``.

    Unknown table names yield None instead of raising.
    """
    dataset_files = {
        "country": './dataset/GlobalLandTemperaturesByCountry.csv',
        "city": './dataset/GlobalLandTemperaturesByCity.csv',
        "major_city": './dataset/GlobalLandTemperaturesByMajorCity.csv',
        "state": './dataset/GlobalLandTemperaturesByState.csv',
        "global_temp_country": './dataset/GlobalTemperatures.csv',
    }
    # Compare with == (rather than a dict lookup) so that any value, hashable
    # or not, can be passed as table_name.
    for candidate, csv_path in dataset_files.items():
        if table_name == candidate:
            return pd.read_csv(csv_path)
    return None

In [None]:
def convert_to_datetype(df, column_name):
    """Convert one column of ``df`` to datetime in place and return ``df``.

    Unparseable entries are coerced to NaT.
    """
    as_datetime = pd.to_datetime(df[column_name], errors='coerce')
    df[column_name] = as_datetime
    return df

In [None]:
def year_monoslider(min_year=1760, max_year=2011):
    """Show a slider for picking one year and return the chosen year.

    Parameters:
    - min_year: Lower bound of the slider (default 1760).
    - max_year: Upper bound of the slider (default 2011).

    Returns:
    - selected_year: The selected year. The slider starts at 2002, or at the
      nearest bound when 2002 is not inside [min_year, max_year].
    """
    # An initial value outside the slider bounds is rejected by st.slider,
    # so the preferred default is clamped into the range first.
    default_year = min(max(2002, min_year), max_year)
    selected_year = st.slider(
        "Select a year",
        min_value=min_year,
        max_value=max_year,
        value=default_year
    )
    return selected_year

In [None]:
def month_slider(min_month, max_month):
    """Show a slider for picking one month and return the chosen month.

    Parameters:
    - min_month: Lower bound of the slider.
    - max_month: Upper bound of the slider.

    Returns:
    - selected_month: The selected month. The slider starts at 9, or at the
      nearest bound when 9 is not inside [min_month, max_month].
    """
    # Years with partial data may not include month 9; an initial value
    # outside the bounds is rejected by st.slider, so clamp it.
    default_month = min(max(9, min_month), max_month)
    selected_month = st.slider(
        "Select a month",
        min_value=min_month,
        max_value=max_month,
        value=default_month
    )
    return selected_month

In [None]:
def get_month_range(df, date_column, year):
    """Return ``(min_month, max_month)`` of the dates of ``df`` that fall in ``year``."""
    dates_in_year = df.loc[df[date_column].dt.year == year, date_column]
    return dates_in_year.min().month, dates_in_year.max().month

In [None]:
def filter_data_by_year_month(df, date_column, year, month, temperature_column='AverageTemperature'):
    """Return the rows of the given year and month that have a temperature.

    Rows whose ``temperature_column`` is missing are dropped from the result.
    """
    dates = df[date_column].dt
    wanted = (dates.year == year) & (dates.month == month)
    return df[wanted].dropna(subset=[temperature_column])

In [None]:
def convert_coordinate(coord):
    """Convert a coordinate such as '40.7128N' or '74.0060W' to a signed float.

    Parameters:
    - coord: A string that may end with a direction letter N/S/E/W in any
      case, or a plain number. Surrounding whitespace is ignored.

    Returns:
    - The coordinate as a float; South and West are negative.

    Raises:
    - ValueError: if the numeric part cannot be parsed.
    """
    # Numbers need no parsing
    if isinstance(coord, (int, float)):
        return float(coord)

    text = str(coord).strip()
    # The direction is the trailing character only; testing for the letter
    # anywhere in the string would misread values such as '1E5'.
    direction = text[-1:].upper()
    if direction in ('N', 'E'):
        return float(text[:-1])
    if direction in ('S', 'W'):
        return -float(text[:-1])
    return float(text)  # No direction suffix

In [None]:
# Build a GeoDataFrame with one point geometry per row
def create_geodf(df, lat_col='Latitude', lon_col='Longitude'):
    """Return a GeoDataFrame of ``df`` whose geometry is Point(longitude, latitude)."""
    points = [Point(lon, lat) for lon, lat in zip(df[lon_col], df[lat_col])]
    return gpd.GeoDataFrame(df, geometry=points)

In [None]:
def city_selector(df, column_name, phrase, exclude_city=None):
    """Show a select box labelled ``phrase`` with the unique values of ``df[column_name]``.

    When ``exclude_city`` is given (and truthy) it is left out of the
    options. Returns the value chosen in the select box.
    """
    options = df[column_name].unique()
    if exclude_city:
        # Offer every city except the excluded one
        options = [name for name in options if name != exclude_city]
    return st.selectbox(phrase, options)

In [None]:
def find_path(geo_data, start_city, end_city, w_T=0.4, w_D=0.6):
    """
    Finds a path between two cities with a greedy search that combines
    average temperature and distance to the destination.

    At every step the three unvisited cities closest to the current city are
    considered. If the destination is among them the path ends there;
    otherwise the candidate with the highest
    ``w_T * AverageTemperature - w_D * distance_to_destination`` is visited next.

    Parameters:
    - geo_data: GeoDataFrame with 'City' and 'AverageTemperature' columns and
      a point geometry. It is not modified. The row index is assumed to be
      unique.
    - start_city: Name of the city the path starts from.
    - end_city: Name of the destination city.
    - w_T: Weight of the temperature term (default 0.4).
    - w_D: Weight of the distance-to-destination term (default 0.6).

    Returns:
    - path: List of city names from start_city to end_city.

    Raises:
    - ValueError: if either city is not present in geo_data.
    """
    # Find the start and end points
    start = geo_data[geo_data['City'] == start_city]
    end = geo_data[geo_data['City'] == end_city]

    if start.empty or end.empty:
        raise ValueError("One of the specified cities is not in the dataset.")

    # Loop-invariant: geometry of the destination
    end_point = end.geometry.values[0]

    path = [start_city]
    current_city = start_city
    current_point = start.geometry.values[0]

    while current_city != end_city:
        # Cities that have not been visited yet
        remaining = geo_data[~geo_data['City'].isin(path)]

        # Distances are kept in local Series instead of being written into
        # the filtered frames, which triggers SettingWithCopyWarning.
        distance_from_current = remaining.geometry.distance(current_point)

        # The three closest unvisited cities
        closest = remaining.loc[distance_from_current.nsmallest(3).index]

        # Stop as soon as the destination is within reach
        if end_city in closest['City'].values:
            path.append(end_city)
            break

        # Warmer candidates and candidates nearer to the destination score higher
        distance_to_dest = closest.geometry.distance(end_point)
        score = w_T * closest['AverageTemperature'] - w_D * distance_to_dest

        # Move to the best-scoring candidate
        current_city = closest.loc[score.idxmax(), 'City']
        path.append(current_city)
        current_point = geo_data[geo_data['City'] == current_city].geometry.values[0]

    return path


In [None]:
def _add_city_marker(target_map, point, radius, color, fill_opacity, popup):
    """Add one filled circle marker at ``point`` (placed at [point.y, point.x]) to the map."""
    folium.CircleMarker(
        location=[point.y, point.x],
        radius=radius,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=fill_opacity,
        popup=popup,
    ).add_to(target_map)


def _city_point_and_temp(geo_data, city):
    """Return (geometry, AverageTemperature) of the first row whose 'City' equals ``city``."""
    rows = geo_data[geo_data['City'] == city]
    return rows.geometry.values[0], rows['AverageTemperature'].values[0]


def visualize_path(geo_data, path, start_city, end_city):
    """
    Visualizes the calculated path on a Folium map.
    - Every city is drawn as a small gray marker with its temperature in the popup.
    - The cities in the path are highlighted in blue and connected by a line.
    - The start city is drawn in green and the end city in red.

    Parameters:
    - geo_data: GeoDataFrame with 'City' and 'AverageTemperature' columns and point geometries.
    - path: List of city names, in visiting order.
    - start_city: Name of the first city of the path.
    - end_city: Name of the last city of the path.

    Returns:
    - m: The folium map, centered on the start city.
    """
    # Create a map centered on the starting city
    start_point, start_temp = _city_point_and_temp(geo_data, start_city)
    m = folium.Map(location=[start_point.y, start_point.x], zoom_start=3)

    # All cities, with low opacity
    for _, row in geo_data.iterrows():
        _add_city_marker(
            m, row.geometry, 4, "gray", 0.3,
            f"{row['City']}<br>Temp: {row['AverageTemperature']:.2f}°C",
        )

    # Cities of the path; their coordinates are collected in the same pass so
    # that each city is looked up only once
    path_coords = []
    for city in path:
        city_point, temperature = _city_point_and_temp(geo_data, city)
        _add_city_marker(
            m, city_point, 6, "blue", 0.6,
            f"{city}<br>Temp: {temperature:.2f}°C",
        )
        path_coords.append([city_point.y, city_point.x])

    # Connect the cities in the path with a line
    folium.PolyLine(path_coords, color="blue", weight=2.5, opacity=0.6).add_to(m)

    # Start city in green, end city in red (drawn last so they stay on top)
    _add_city_marker(
        m, start_point, 8, "green", 1,
        f"Start: {start_city}<br>Temp: {start_temp:.2f}°C",
    )
    end_point, end_temp = _city_point_and_temp(geo_data, end_city)
    _add_city_marker(
        m, end_point, 8, "red", 1,
        f"End: {end_city}<br>Temp: {end_temp:.2f}°C",
    )

    return m

def display_optimal_path_map(filtered_df, path, start, arrive):
    """Write the section header and render the map of ``path`` in the Streamlit page."""
    st.header("Map of the optimal path")
    route_map = visualize_path(filtered_df, path, start, arrive)
    st_folium(route_map, width=1200, height=800)

In [None]:
import streamlit as st
from streamlit_folium import st_folium
from modules.data_processing import *
from modules.visualization import *
from modules.operations import *

# Page configuration and heading
st.set_page_config(page_title="Route", layout="wide")
st.title("OPTIMAL ROUTE BETWEEN TWO CITIES")

# Table selection, followed by the two columns that hold the controls
table_name = st.selectbox("Select a table", ["major_city", "city"])
left_col, right_col = st.columns(2)

# Load the table and parse its 'dt' column as datetimes
temperatures = convert_to_datetype(load_data(table_name), "dt")

# Year on the left, month (limited to the months of that year) on the right
with left_col:
    selected_year = year_monoslider()

first_month, last_month = get_month_range(temperatures, "dt", selected_year)
with right_col:
    selected_month = month_slider(first_month, last_month)

# Rows of the chosen month that have a temperature and coordinates
monthly = filter_data_by_year_month(temperatures, "dt", selected_year, selected_month)
monthly = monthly.dropna(subset=['Latitude', 'Longitude'])
for axis in ('Latitude', 'Longitude'):
    monthly[axis] = monthly[axis].apply(convert_coordinate)

cities_geo = create_geodf(monthly, 'Latitude', 'Longitude')

# Start city on the left; the arrival selector excludes the chosen start
with left_col:
    start = city_selector(cities_geo, 'City', "select the start city")

with right_col:
    arrive = city_selector(cities_geo, 'City', "select the arrive city", start)

# Compute the path and draw it
path = find_path(cities_geo, start, arrive)
display_optimal_path_map(cities_geo, path, start, arrive)