In [149]:
from functions.data_by_country import data_by_country
from utils.data_loader import load_data_parquet
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.express as px

In [150]:
df = load_data_parquet()
df = data_by_country(df)
df = df.dropna()





In [296]:
# To show how the Isolation model works, I've picked Sweden
df_sweden = df[df["Country"]=="Sweden"]
df_sweden3 = df_sweden.copy()
df_sweden3 = df_sweden3.set_index("dt")
df_sweden3

Unnamed: 0_level_0,Country,Country_ISO,AverageTemperature
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1743-11-01,Sweden,SWE,3.7652
1744-04-01,Sweden,SWE,3.3624
1744-05-01,Sweden,SWE,8.8490
1744-06-01,Sweden,SWE,13.0116
1744-07-01,Sweden,SWE,15.6892
...,...,...,...
2013-04-01,Sweden,SWE,3.3374
2013-05-01,Sweden,SWE,11.9138
2013-06-01,Sweden,SWE,14.8738
2013-07-01,Sweden,SWE,17.3276


In [236]:
df_sweden3.index

DatetimeIndex(['1743-11-01', '1744-04-01', '1744-05-01', '1744-06-01',
               '1744-07-01', '1744-09-01', '1744-10-01', '1744-11-01',
               '1744-12-01', '1745-01-01',
               ...
               '2012-11-01', '2012-12-01', '2013-01-01', '2013-02-01',
               '2013-03-01', '2013-04-01', '2013-05-01', '2013-06-01',
               '2013-07-01', '2013-08-01'],
              dtype='datetime64[ns]', name='dt', length=3166, freq=None)

In [292]:
average_by_year = df_sweden3.groupby(df_sweden3.index.year)['AverageTemperature'].mean()

In [293]:
# Reset the index to make 'Year' and 'Country' as regular columns
average_by_year

dt
1743    3.765200
1744    7.161400
1745   -3.072250
1750    6.770073
1751    6.014925
          ...   
2009    6.769717
2010    5.000933
2011    7.460750
2012    6.432900
2013    6.765275
Name: AverageTemperature, Length: 267, dtype: float64

# Let us look at the complete algorithm step by step:

1. When given a dataset, a random sub-sample of the data is selected and assigned to a binary tree.
2. Branching of the tree starts by selecting a random feature (from the set of all N features) first. And then branching is done on a random threshold ( any value in the range of minimum and maximum values of the selected feature).
3. If the value of a data point is less than the selected threshold, it goes to the left branch else to the right. And thus a node is split into left and right branches.
4. This process from step 2 is continued recursively till each data point is completely isolated or till max depth(if defined) is reached.
5. The above steps are repeated to construct random binary trees.

In [294]:
# Preparing the data
df_to_use = pd.DataFrame(average_by_year)
df_to_use.AverageTemperature.values.reshape(-1,1)

array([[ 3.7652    ],
       [ 7.1614    ],
       [-3.07225   ],
       [ 6.77007273],
       [ 6.014925  ],
       [ 1.6149    ],
       [ 5.41813333],
       [ 5.41725   ],
       [ 5.0583    ],
       [ 5.61021667],
       [ 5.80848333],
       [ 4.65318333],
       [ 5.9196    ],
       [ 5.07496667],
       [ 6.23558333],
       [ 5.38956667],
       [ 5.03073333],
       [ 5.75613333],
       [ 5.23873333],
       [ 6.143     ],
       [ 4.94206667],
       [ 4.9355    ],
       [ 5.26506667],
       [ 5.30828333],
       [ 4.52396667],
       [ 5.1634    ],
       [ 6.60021667],
       [ 4.8293    ],
       [ 6.95208333],
       [ 5.7814    ],
       [ 5.11296667],
       [ 5.62486667],
       [ 7.31575   ],
       [ 5.40716667],
       [ 6.30205   ],
       [ 4.70543333],
       [ 6.23911667],
       [ 4.08048333],
       [ 4.30881667],
       [ 4.28198333],
       [ 5.5651    ],
       [ 4.7558    ],
       [ 6.11526667],
       [ 5.94825   ],
       [ 6.47626667],
       [ 5

In [289]:
df_to_use

Unnamed: 0_level_0,AverageTemperature
dt,Unnamed: 1_level_1
1743,3.765200
1744,7.161400
1745,-3.072250
1750,6.770073
1751,6.014925
...,...
2009,6.769717
2010,5.000933
2011,7.460750
2012,6.432900


In [156]:
# Function to construct an isolation tree
def isolation_tree(data, height=0, max_height=None):
    """
    Construct an isolation tree for the given data.

    An isolation tree is a binary tree used in isolation forest, an anomaly detection
    algorithm. The tree is built by recursively partitioning the data into two
    sub-samples until a predefined maximum height is reached or there is only one
    data point left.

    Parameters:
    -----------
    data : numpy.ndarray
        The input data to be used for constructing the isolation tree.

    height : int, optional
        The current height of the tree. This parameter is used internally for recursion
        and should not be set manually. By default, it is set to 0.

    max_height : int, optional
        The maximum height of the isolation tree. Once the tree reaches this height or
        there is only one data point left, a leaf node is created. If not specified,
        it is set to log2(len(data)), effectively limiting the tree's depth based on
        the data size.

    Returns:
    --------
    dict
        A dictionary representing a node in the isolation tree. The structure of the
        dictionary varies depending on whether it's a non-leaf or leaf node. Non-leaf
        nodes include information about the splitting attribute, split value, and
        references to their left and right child nodes. Leaf nodes contain the data
        points and their height in the tree.
    """
    if max_height is None:
        # Using a base-2 logarithm allows us to control the depth of the tree in a way that aligns with the binary tree structure.
        max_height = int(np.log2(len(data)))

    # If the maximum height is reached or there's only one data point, create a leaf node
    if max_height <= 0 or len(data) <= 1:
        return {
            "data": data,
            "height": height
        }
    else:
        # Randomly select an attribute and split value
        split_attribute = np.random.randint(0, data.shape[1])
        split_value = np.random.uniform(data[:, split_attribute].min(), data[:, split_attribute].max())
        
        # Partition data into left and right sub-samples and create non-leaf nodes
        left_data = data[data[:, split_attribute] < split_value]
        right_data = data[data[:, split_attribute] >= split_value]
        
        return {
            "split_attribute": split_attribute,
            "split_value": split_value,
            "left": isolation_tree(left_data, height + 1, max_height),
            "right": isolation_tree(right_data, height + 1, max_height)
        }

In [157]:
result = isolation_tree(df_to_use.AverageTemperature.values.reshape(-1,1))
result

{'split_attribute': 0,
 'split_value': 3.6197965415107416,
 'left': {'split_attribute': 0,
  'split_value': -2.089342018333193,
  'left': {'data': array([[-3.07225]]), 'height': 2},
  'right': {'data': array([[1.6149]]), 'height': 2}},
 'right': {'split_attribute': 0,
  'split_value': 7.458973249644989,
  'left': {'split_attribute': 0,
   'split_value': 4.787604048200894,
   'left': {'split_attribute': 0,
    'split_value': 3.7425672016035767,
    'left': {'split_attribute': 0,
     'split_value': 3.6277299954952307,
     'left': {'data': array([[3.62098333]]), 'height': 5},
     'right': {'data': array([[3.62936667]]), 'height': 5}},
    'right': {'split_attribute': 0,
     'split_value': 4.5609414481389265,
     'left': {'split_attribute': 0,
      'split_value': 3.979613394901123,
      'left': {'split_attribute': 0,
       'split_value': 3.7920476441901676,
       'left': {'split_attribute': 0,
        'split_value': 3.768975273507219,
        'left': {'split_attribute': 0,
       

In [158]:
# Function to build an isolation forest
def isolation_forest(data, n_trees=100, subsample_size=256):
    """
    Build an Isolation Forest for anomaly detection.

    Isolation Forest is an ensemble-based anomaly detection algorithm that constructs
    a forest of isolation trees. It measures the ease with which a data point can be
    separated from the rest of the data, with anomalies being isolated more quickly.

    Parameters:
    -----------
    data : numpy.ndarray
        The input data to be used for constructing the Isolation Forest. It should be
        a two-dimensional numpy array where rows represent data points and columns
        represent features.

    n_trees : int, optional
        The number of isolation trees to create in the forest. More trees may improve
        accuracy but also increase computation time. Default is 100.

    subsample_size : int, optional
        The size of the random subsamples to be used for constructing each isolation
        tree. A smaller subsample size can result in more robust trees but may require
        more trees in the forest. Default is 256.

    Returns:
    --------
    list
        A list of isolation trees that collectively form the Isolation Forest. Each
        tree is represented as a nested dictionary structure and can be used for
        anomaly score calculation.
    """
    trees = []

    for i in range(n_trees):
        # Create random subsamples and construct isolation trees
        subsample = data[np.random.choice(data.shape[0], subsample_size, replace=False)]
        itree = isolation_tree(subsample)
        trees.append(itree)

    return trees

In [159]:
result2 = isolation_forest(df_to_use.AverageTemperature.values.reshape(-1,1))
result2[1]

{'split_attribute': 0,
 'split_value': -2.662834793555702,
 'left': {'data': array([[-3.07225]]), 'height': 1},
 'right': {'split_attribute': 0,
  'split_value': 2.85130581221123,
  'left': {'data': array([[1.6149]]), 'height': 2},
  'right': {'split_attribute': 0,
   'split_value': 4.128131759155943,
   'left': {'split_attribute': 0,
    'split_value': 3.9096892969084758,
    'left': {'split_attribute': 0,
     'split_value': 3.66284395247444,
     'left': {'split_attribute': 0,
      'split_value': 3.622921049357996,
      'left': {'data': array([[3.62098333]]), 'height': 6},
      'right': {'data': array([[3.62936667]]), 'height': 6}},
     'right': {'split_attribute': 0,
      'split_value': 3.8156703338737215,
      'left': {'split_attribute': 0,
       'split_value': 3.7651783412180855,
       'left': {'data': array([[3.76505]]), 'height': 7},
       'right': {'data': array([[3.7652]]), 'height': 7}},
      'right': {'split_attribute': 0,
       'split_value': 3.8672408514498264,

In [160]:
# Function to calculate anomaly score for a single data point
def anomaly_score(tree, point, current_height=0):
    """
    Calculate the anomaly score for a single data point using an isolation tree.

    Anomaly scores are used to measure the degree of isolation of a data point within
    the isolation tree. Higher scores indicate anomalies, while lower scores suggest
    that the data point is normal.

    Parameters:
    -----------
    tree : dict
        A dictionary representing an isolation tree. The tree can be constructed using
        the 'isolation_tree' function. The structure includes non-leaf and leaf nodes
        with information about splitting attributes, split values, and references to
        left and right child nodes.

    point : numpy.ndarray
        The data point for which the anomaly score is calculated. It should be a
        one-dimensional numpy array with the same number of attributes as the data
        used to construct the isolation tree.

    current_height : int, optional
        The current height in the tree. This parameter is used internally for recursion
        and should not be set manually. By default, it is set to 0.

    Returns:
    --------
    float
        The anomaly score for the given data point. Higher scores indicate anomalies, while
        lower scores suggest normal data.
    """
    if "split_attribute" not in tree:
        # If it's a leaf node, return the anomaly score
        return current_height + 2 * (np.log(2 ** tree["height"] - 1) + np.euler_gamma) - (2 * (tree["height"] - 1) / (2 ** tree["height"] - 1))
    else:
        # Randomly select an attribute for splitting
        split_attribute = tree["split_attribute"]
        
        # Determine which branch to traverse based on the split attribute and value
        if point[split_attribute] < tree["split_value"]:
            return anomaly_score(tree["left"], point, current_height + 1)
        else:
            return anomaly_score(tree["right"], point, current_height + 1)


In [161]:
# Function to compute anomaly scores for the entire dataset
def isolation_forest_anomaly_score(data, trees):
    """
    Compute anomaly scores for the entire dataset using an Isolation Forest.

    Anomaly scores measure the degree of isolation of data points within the Isolation
    Forest. In this context, an anomaly is considered when the anomaly score is higher
    than some threshold, with lower scores indicating normal data points.

    Parameters:
    -----------
    data : numpy.ndarray
        The input data for which anomaly scores are calculated. It should be a
        two-dimensional numpy array where rows represent data points and columns
        represent features.

    trees : list
        A list of isolation trees that collectively form the Isolation Forest. Each tree
        is represented as a nested dictionary structure and can be used for anomaly score
        calculation.

    Returns:
    --------
    numpy.ndarray
        An array of anomaly scores for each data point in the input dataset. An anomaly is
        considered when the anomaly score is higher than some threshold, with lower scores indicating
        normal data points.
    """
    scores = np.array([anomaly_score(trees, point) for point in data])
    # normalize the anomaly scores and map them to a scale
    return 2 ** (-scores / len(trees))

In [162]:
# Build an isolation forest
forest = isolation_forest(df_to_use.AverageTemperature.values.reshape(-1,1), n_trees=10, subsample_size=200)
forest

[{'split_attribute': 0,
  'split_value': 3.275333838773157,
  'left': {'split_attribute': 0,
   'split_value': -1.490636985249517,
   'left': {'data': array([[-3.07225]]), 'height': 2},
   'right': {'data': array([[1.6149]]), 'height': 2}},
  'right': {'split_attribute': 0,
   'split_value': 7.089804263195729,
   'left': {'split_attribute': 0,
    'split_value': 5.27836817849203,
    'left': {'split_attribute': 0,
     'split_value': 4.013016455526477,
     'left': {'split_attribute': 0,
      'split_value': 3.813579359088192,
      'left': {'split_attribute': 0,
       'split_value': 3.6895181197731945,
       'left': {'split_attribute': 0,
        'split_value': 3.6218575893241898,
        'left': {'data': array([[3.62098333]]), 'height': 7},
        'right': {'data': array([[3.62936667]]), 'height': 7}},
       'right': {'split_attribute': 0,
        'split_value': 3.7724969626413754,
        'left': {'split_attribute': 0,
         'split_value': 3.7650714236019396,
         'left':

In [163]:
# Calculate anomaly scores for the data
anomaly_scores = isolation_forest_anomaly_score(df_to_use.AverageTemperature.values.reshape(-1,1), forest[1])

In [181]:
anomaly_scores > 0.5

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [223]:
# Function to create an interactive Plotly plot for visualizing anomalies
def visualize_anomalies_with_years_plotly(data, anomaly_scores, threshold, years, countries):
    # Flatten the anomaly_scores array
    anomaly_scores = anomaly_scores.ravel()

    # Create a DataFrame with years, data, anomaly scores, and country information
    df = pd.DataFrame({'Year': years, 'Data': data.ravel(), 'Anomaly Score': anomaly_scores, 'Country': countries, "Threshold": threshold})

    # Create a new column for color based on the anomaly scores and the threshold
    df['Color'] = df.apply(lambda row: 'Anomaly' if row['Anomaly Score'] > row['Threshold'] else 'Not Anomaly', axis=1)

    # Create a text label to display the anomaly score and country when hovering over a point
    df['Hover Label'] = df.apply(lambda row: f"Country: {row['Country']}<br>Anomaly Score: {row['Anomaly Score']:.2f}", axis=1)

    # Create an interactive scatter plot
    fig = px.scatter(
        df, x='Year', y='Data',
        color='Color',
        title='Anomaly Detection',
        labels={'Data': 'Data Value'},
        hover_name='Hover Label',  # Show 'Hover Label' when hovering over points
    )

    # Show the interactive plot
    fig.show()


In [297]:
number_of_countries = len(df_to_use.AverageTemperature.values.reshape(-1,1))

In [298]:
visualize_anomalies_with_years_plotly(df_to_use.AverageTemperature.values.reshape(-1,1), anomaly_scores, 0.4, df_to_use.index, df_sweden3.Country[:number_of_countries])

# What if we remove the anamolies and try again?

In [210]:
def remove_outliers_and_recompute(data, trees, threshold, years):
    # Calculate the anomaly scores for the entire dataset
    anomaly_scores = isolation_forest_anomaly_score(data, trees)
    
    # Identify the indices of data points with anomaly scores above the threshold
    outliers_indices = np.where(anomaly_scores > threshold)[0]
    
    if len(outliers_indices) > 0:
        # Remove outliers from the data
        data_without_outliers = np.delete(data, outliers_indices, axis=0)
        # For visualization purposes
        cleaned_years  = np.delete(years, outliers_indices)
        
        # Compute new anomaly scores for the remaining data using one of the new trees
        new_tree = isolation_tree(data_without_outliers)
        new_anomaly_scores = isolation_forest_anomaly_score(data_without_outliers, new_tree)
    else:
        # If there are no outliers, return the original data and scores
        data_without_outliers = data
        new_anomaly_scores = anomaly_scores

    return data_without_outliers, new_anomaly_scores, cleaned_years

In [211]:
data_without_outliers, new_anomaly_scores, cleaned_years = remove_outliers_and_recompute(df_to_use.AverageTemperature.values.reshape(-1,1), forest[1],0.5, df_to_use.index)

In [214]:
print(cleaned_years)
df_to_use.index

Index([1743, 1744, 1750, 1751, 1752, 1753, 1754, 1755, 1756, 1757,
       ...
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013],
      dtype='int32', name='dt', length=266)


Index([1743, 1744, 1745, 1750, 1751, 1752, 1753, 1754, 1755, 1756,
       ...
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013],
      dtype='int32', name='dt', length=267)

In [208]:
data_without_outliers.shape

(266, 1)

In [215]:
new_anomaly_scores.shape

array([False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [213]:
visualize_anomalies_with_years_plotly(data_without_outliers, new_anomaly_scores, 0.4, cleaned_years)