In [21]:
from utils.data_loader import load_data_parquet
from functions.data_by_country import data_by_country
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose
import pandas as pd
import plotly.graph_objs as go
import numpy as np
import plotly.express as px
from functions.kmeans import KMeans
from functions.one_class_svm import OneClassSVM

In [22]:
# Load the raw frame, pass it through data_by_country, drop incomplete rows,
# then reshape to a wide table (one row per "dt", one column per country).
# The final dropna keeps only the dates for which every country has a value.
df = load_data_parquet()
df2 = (
    data_by_country(df)
    .dropna()
    .pivot(index="dt", columns="Country", values='AverageTemperature')
    .dropna()
)





In [24]:
# Display the raw frame returned by load_data_parquet() to inspect its columns.
df

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,Country_ISO
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E,DNK
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E,DNK
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E,DNK
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E,DNK
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E,DNK
...,...,...,...,...,...,...,...,...
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E,NLD
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E,NLD
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E,NLD
8599210,2013-08-01,18.025,0.298,Zwolle,Netherlands,52.24N,5.26E,NLD


In [3]:
# `slopes` is created empty here and filled in by a later cell.
slopes = {}

# For every country column, run an additive seasonal decomposition with a
# 12-step period and keep only its trend component (missing values dropped).
trend_components = {
    country: seasonal_decompose(series, model='additive', period=12).trend.dropna()
    for country, series in df2.items()
}

# One column of trend values per country, aligned on the shared index.
all_trends = pd.concat(trend_components, axis=1)

In [4]:
# Rescale every country's trend column with MinMaxScaler, then wrap the
# resulting array back into a frame carrying the original index and columns.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(all_trends)
all_normalized_trends = pd.DataFrame(
    scaled_values,
    index=all_trends.index,
    columns=all_trends.columns,
)

In [5]:
# Calculate the slope of a straight-line fit to each country's normalized trend.
# The dict is rebuilt here so that re-running this cell never keeps stale
# entries from an earlier run.
slopes = {}
for country in all_normalized_trends.columns:
    series = all_normalized_trends[country]

    # Regress on the positions of the non-missing points. Building x from the
    # full column length while y has its NaN values dropped would give arrays
    # of different lengths as soon as a column contains a missing value.
    observed = series.notna().to_numpy()
    x_data = np.flatnonzero(observed)
    y_data = series.to_numpy()[observed]

    slope, _ = np.polyfit(x_data, y_data, 1)
    slopes[country] = slope

# One row per country, steepest slope first.
slopes_df = pd.DataFrame(list(slopes.items()), columns=['Country', 'Slope'])
slopes_df = slopes_df.sort_values(by='Slope', ascending=False)

In [6]:
# Cluster the countries on their slope alone: the slopes are copied into a
# single-column matrix and handed to the project's KMeans implementation.
n_clusters = 3
slopes_arr = slopes_df['Slope'].to_numpy(copy=True).reshape(-1, 1)

kmeans = KMeans(k=n_clusters)
kmeans.fit(slopes_arr)

# Store each row's cluster label next to its slope, keeping the
# steepest-first ordering.
slopes_df = (
    slopes_df
    .assign(Cluster=kmeans.labels_)
    .sort_values(by='Slope', ascending=False)
)

In [7]:
# Attach each country's ISO code from the raw frame.
# Any Country_ISO column left over from a previous run of this cell is dropped
# first; otherwise a second merge would produce Country_ISO_x / Country_ISO_y
# and the plain "Country_ISO" column used for the map would disappear.
country_iso = df[['Country', 'Country_ISO']].drop_duplicates()
slopes_df = (
    slopes_df
    .drop(columns='Country_ISO', errors='ignore')
    .merge(country_iso, on='Country', how='left')
)

In [8]:
# Graph 1: Normalized trend with a least-squares line for selected countries
trend_graph = go.Figure()
selected_countries = ['Indonesia', 'Peru']
for country in selected_countries:
    series = all_normalized_trends[country]

    # Keep positions, values and dates aligned on the non-missing points.
    # Sizing x from the whole frame while y has its NaN values dropped would
    # make np.polyfit (and the traces below) fail on any missing value.
    observed = series.notna().to_numpy()
    x_data = np.flatnonzero(observed)
    y_data = series[observed]

    slope, intercept = np.polyfit(x_data, y_data.to_numpy(), 1)
    reg_line = slope * x_data + intercept

    trend_graph.add_trace(
        go.Scatter(x=y_data.index, y=y_data, mode='lines', name=country)
    )
    trend_graph.add_trace(
        go.Scatter(x=y_data.index, y=reg_line, mode='lines',
                   name=f"{country} (Linear Fit)", line=dict(dash='dash'))
    )

trend_graph.update_layout(title="Normalized Trend Component with Linear Regression for Selected Countries",
                          xaxis=dict(title='Year'), yaxis=dict(title='Normalized Temperature Trend'))

# Graph 2: Growth Rates for Each Country by Cluster

fig_slopes_colored = px.bar(
    slopes_df,
    x='Country',
    y='Slope',
    color='Cluster',
    title='Sorted Slopes of Countries with Cluster Colors'
)

fig_slopes_colored.show()

# Graph 3: World map colored by cluster label

fig = px.choropleth(
    slopes_df,
    locations="Country_ISO",
    color="Cluster",
    hover_name="Country",
    color_continuous_scale=px.colors.sequential.Viridis,
    title="Clusters by Country"
)

fig.show()

trend_graph.show()

In [16]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the RBF one-class SVM
param_grid = dict(
    gamma=[0.001, 0.01, 0.1, 0.5, 1],
    nu=[0.01, 0.05, 0.1, 0.2, 0.3],
)

# The search is fitted on Indonesia's normalized trend only, reshaped into a
# single-column matrix.
indonesia_trend = all_normalized_trends["Indonesia"].values.reshape(-1, 1)

# 5-fold search; refit=True refits the estimator with the best combination.
grid = GridSearchCV(OneClassSVM(kernel='rbf'), param_grid, refit=True, cv=5)
grid.fit(indonesia_trend)

# Report the winning combination
print("Best Parameters: ", grid.best_params_)


Best Parameters:  {'gamma': 0.5, 'nu': 0.01}


In [57]:
def detect_anomalies_ocsvm(data, gamma, nu):
    """Return the index labels of ``data`` that a one-class SVM predicts as -1.

    Args:
        data: Object exposing ``.values`` and ``.index`` (a pandas Series at
            the call sites in this notebook). Its values are reshaped into a
            single-column matrix.
        gamma: Passed to ``OneClassSVM`` as ``gamma``.
        nu: Passed to ``OneClassSVM`` as ``nu``.

    Returns:
        The entries of ``data.index`` where the model, fitted and evaluated on
        the same values, predicts -1.
    """
    model = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
    features = data.values.reshape(-1, 1)
    model.fit(features)
    is_anomaly = model.predict(features) == -1
    return data.index[is_anomaly]

def detect_anomalies_isolation_forest(data, n_estimators, contamination):
    """Return the index labels of ``data`` whose anomaly score exceeds 0.9.

    Args:
        data: Object exposing ``.values`` and ``.index`` (a pandas Series at
            the call sites in this notebook). Its values are reshaped into a
            single-column matrix.
        n_estimators: Accepted but never read by this function.
        contamination: Accepted but never read by this function.

    Returns:
        The entries of ``data.index`` whose score is greater than 0.9.

    NOTE(review): the forest is always built as ``IsolationForest(100, 256)``
    regardless of the arguments - confirm whether ``n_estimators`` should be
    forwarded.
    NOTE(review): ``IsolationForest`` is not imported in the visible cells -
    confirm where it is defined.
    """
    forest = IsolationForest(100, 256)
    features = data.values.reshape(-1, 1)
    forest.forest(features)

    # Only forest.trees[1] is handed to the scoring call.
    # NOTE(review): confirm that scoring against a single tree is intended.
    scores = forest.isolation_forest_anomaly_score(features, forest.trees[1])

    score_table = pd.DataFrame(scores, index=data.index, columns=['Anomaly Score'])
    # -1 marks a score above the fixed 0.9 threshold, 1 marks everything else.
    score_table['Label'] = np.where(score_table['Anomaly Score'] > 0.9, -1, 1)
    flagged = score_table['Label'] == -1
    return score_table.index[flagged]

def plot_anomalies(fig, country, data, anomalies, method_name):
    """Add two traces to ``fig``: the series as a line and its anomalies as markers.

    Args:
        fig: Figure that receives the traces through ``add_trace``.
        country: Label used as the prefix of both trace names.
        data: Series plotted against its own index.
        anomalies: Index labels of the anomalous points; used as the x values
            and to look up the y values with ``data.loc``.
        method_name: Detector name shown in the marker trace's legend entry.
    """
    trend_trace = go.Scatter(
        x=data.index,
        y=data,
        mode='lines',
        name=f'{country} Trend',
    )
    fig.add_trace(trend_trace)

    # Anomalies are drawn as red markers of size 10 on top of the line.
    marker_trace = go.Scatter(
        x=anomalies,
        y=data.loc[anomalies],
        mode='markers',
        name=f'{country} {method_name} Anomalies',
        marker=dict(color='red', size=10),
    )
    fig.add_trace(marker_trace)

def plot_clusters_anomalies(slopes_df, all_data, gamma, nu, n_estimators, contamination):
    """Show one figure per cluster with every member country's trend and anomalies.

    Args:
        slopes_df: Frame with a 'Country' and a 'Cluster' column.
        all_data: Frame with one column per country.
        gamma: Currently unused; this function only runs the Isolation Forest
            detector.
        nu: Currently unused, for the same reason as ``gamma``.
        n_estimators: Forwarded to ``detect_anomalies_isolation_forest``.
        contamination: Forwarded to ``detect_anomalies_isolation_forest``.
    """
    for cluster in slopes_df['Cluster'].unique():
        members = slopes_df.loc[slopes_df['Cluster'] == cluster, 'Country']
        fig = go.Figure()

        for country in members:
            # Each country's series is examined on its own, missing values removed.
            country_data = all_data[country].dropna()

            anomalies_isoforest = detect_anomalies_isolation_forest(
                country_data, n_estimators, contamination
            )
            plot_anomalies(fig, country, country_data, anomalies_isoforest, 'Isolation Forest')

        fig.update_layout(
            title=f"Anomalies in Normalized Trend Data for Countries in Cluster {cluster}",
            xaxis_title='Time',
            yaxis_title='Normalized Trend',
            showlegend=True,
        )
        fig.show()

# Example usage
#plot_clusters_anomalies(slopes_df, all_normalized_trends, gamma=0.5, nu=0.01, n_estimators=100, contamination='auto')

In [58]:
def calculate_cluster_means(slopes_df, all_data):
    """Average the country columns of ``all_data`` within each cluster.

    Args:
        slopes_df: Frame with a 'Country' and a 'Cluster' column.
        all_data: Frame whose columns are named after the countries.

    Returns:
        A DataFrame with one column per cluster label (in order of first
        appearance in ``slopes_df``), holding the row-wise mean of that
        cluster's country columns.
    """
    def _mean_for(cluster):
        # Row-wise mean over the columns of the countries assigned to `cluster`.
        members = slopes_df.loc[slopes_df['Cluster'] == cluster, 'Country']
        return all_data[members].mean(axis=1)

    return pd.DataFrame({
        cluster: _mean_for(cluster)
        for cluster in slopes_df['Cluster'].unique()
    })

def plot_cluster_anomalies(cluster_means, gamma, nu, n_estimators, contamination):
    """Show one figure with every cluster's mean series and its detected anomalies.

    Each cluster's line is drawn once, followed by one marker trace per
    detector (One-Class SVM and Isolation Forest).

    Args:
        cluster_means: Frame with one column per cluster.
        gamma: Forwarded to ``detect_anomalies_ocsvm``.
        nu: Forwarded to ``detect_anomalies_ocsvm``.
        n_estimators: Forwarded to ``detect_anomalies_isolation_forest``.
        contamination: Forwarded to ``detect_anomalies_isolation_forest``.
    """
    fig = go.Figure()

    for cluster, data in cluster_means.items():
        data = data.dropna()
        label = f'Cluster {cluster}'

        # Draw the line a single time. Going through plot_anomalies once per
        # detector would add the same line (and legend entry) twice.
        fig.add_trace(go.Scatter(x=data.index, y=data, mode='lines', name=f'{label} Trend'))

        # (detector name, marker symbol, detected index labels)
        detections = (
            ('One-Class SVM', 'circle',
             detect_anomalies_ocsvm(data, gamma, nu)),
            ('Isolation Forest', 'x',
             detect_anomalies_isolation_forest(data, n_estimators, contamination)),
        )
        for method_name, symbol, anomalies in detections:
            # Both detectors keep the red, size-10 markers; the symbol differs
            # so overlapping detections can still be told apart.
            fig.add_trace(go.Scatter(
                x=anomalies,
                y=data.loc[anomalies],
                mode='markers',
                name=f'{label} {method_name} Anomalies',
                marker=dict(color='red', size=10, symbol=symbol),
            ))

    fig.update_layout(title="Anomalies in Normalized Mean Temperature for Each Cluster",
                      xaxis_title='Time',
                      yaxis_title='Normalized Mean Temperature',
                      showlegend=True)
    fig.show()

# Calculating cluster means
# Mean normalized trend per cluster
cluster_means = calculate_cluster_means(
    slopes_df=slopes_df,
    all_data=all_normalized_trends,
)

# Run both detectors on every cluster mean and show the combined figure
plot_cluster_anomalies(
    cluster_means,
    gamma=0.5,
    nu=0.01,
    n_estimators=100,
    contamination='auto',
)