<a href="https://colab.research.google.com/github/BeverlyAb/concept-drift/blob/drift_induction/Covertype_induction_and_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive and import dependencies


In [21]:
from google.colab import drive

# Mount point for Google Drive inside the Colab VM; the pickled dataset loaded
# later in this notebook is read from a path under this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
!pip install evidently

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
import pandas as pd
import pickle
from sklearn import datasets

from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab

from evidently.model_profile import Profile
from evidently.model_profile.sections import DataDriftProfileSection, CatTargetDriftProfileSection 

In [24]:
import warnings

# Silence every warning category for the rest of the session so library
# warnings do not clutter the cell outputs. Both calls install a blanket
# "ignore" filter.
warnings.filterwarnings(action='ignore')
warnings.simplefilter(action='ignore')

# Covertype Data

## Drift induction
Drift is induced following the procedure described in [On the Reliable Detection of Concept Drift from Streaming Unlabeled Data](https://arxiv.org/pdf/1704.00023.pdf) (p. 16):

1. Shuffle all data to remove unwanted concept drift.
2. Determine which features are most/least important to classification using k-fold cross-validation.
3. Split the dataset at the specified change points into chunks that represent differing concepts. In each chunk, select 25% of the features (based on highest/lowest feature importance) per class and rotate them to induce drift.
4. Rotating features of high importance should induce "real drift", while rotating the lowest-importance features should induce "virtual drift".


## Dataset Class

In [27]:
from typing import Dict 
class Dataset:
    """Windowed access to a DataFrame that holds features and a target column.

    The frame is divided into consecutive windows of ``window_size`` rows. The
    end offset of each window is stored in ``self.splits``; only offsets that are
    strictly smaller than ``len(full_df)`` are recorded, so any rows after the
    last recorded offset are not reachable through ``get_window_data``.

    Args:
        full_df (pd.DataFrame) - complete dataset, features and target together
        column_mapping - describes the columns; only its ``target`` entry is read
            here. May be a dict-like object (``column_mapping["target"]``) or an
            object exposing a ``target`` attribute.
        window_size (int) - number of rows per window, must be positive
    """

    def __init__(self, full_df: pd.DataFrame, column_mapping: Dict, window_size: int):
        self.full_df = full_df
        self.column_mapping = column_mapping
        self.window_size = window_size
        self.set_splits()

    def set_splits(self):
        """Use the specified window_size to set an attribute that holds corresponding index splits

        Raises:
            ValueError - if window_size is not positive (the offsets would never
                advance past the end of the data)
        """
        if self.window_size <= 0:
            raise ValueError(
                f"window_size must be a positive integer, got {self.window_size!r}"
            )

        # window_size, 2 * window_size, ... for every multiple below len(full_df)
        self.splits = list(
            range(self.window_size, len(self.full_df), self.window_size)
        )

    def get_split_idx(self, window_idx):
        """Given a window_idx from an experiment, lookup the split_idx"""
        return self.splits[window_idx]

    def get_window_data(self, window_idx, split_labels=True):
        """
        Return the rows of the window that ends at ``self.splits[window_idx]``.

        The window starts at the previous split offset, or at the beginning of
        the frame when ``window_idx`` is 0.
        Args:
            window_idx (int) - index corresponding to the end point of the desired data window
            split_labels (bool) - return features and labels separately vs. as one dataframe
        Returns:
            features (pd.DataFrame), labels (pd.Series) when split_labels is True,
            otherwise the window as a single pd.DataFrame
        TO-DO: add test to make sure this function gets the expected window data
        """
        end_idx = self.splits[window_idx]
        # The first window has no preceding split, so it starts at the top of the frame.
        start_idx = None if window_idx == 0 else self.splits[window_idx - 1]

        return self._format_output(self.full_df[start_idx:end_idx], split_labels)

    def get_data_by_idx(self, start_idx, end_idx, split_labels=True):
        """
        Return the row slice ``full_df[start_idx:end_idx]``.
        Args:
            start_idx (int) - first row of the slice
            end_idx (int) - end of the slice
            split_labels (bool) - return features and labels separately vs. as one dataframe
        Returns:
            features (pd.DataFrame), labels (pd.Series) when split_labels is True,
            otherwise the slice as a single pd.DataFrame
        TO-DO: should this skip over the first full window that was trained on.. meaning eval data only?
        """
        return self._format_output(self.full_df[start_idx:end_idx], split_labels)

    def _target_column(self):
        """Look up the target column name on either a dict-like or attribute-style mapping."""
        mapping = self.column_mapping
        if hasattr(mapping, "__getitem__"):
            return mapping["target"]
        # Objects that are not subscriptable are expected to expose `.target` instead.
        return mapping.target

    def _format_output(self, window_data, split_labels):
        """Return (features, labels) when split_labels is truthy, else the frame unchanged."""
        if split_labels:
            return self.split_df(window_data, self._target_column())
        return window_data

    @staticmethod
    def split_df(df, label_col):
        """Splits the features from labels in a dataframe, returns both"""
        return df.drop(label_col, axis=1), df[label_col]

In [25]:
# Location of the pickled dataset on the mounted Google Drive.
DRIFT_PICKLE_PATH = "/content/drive/MyDrive/Grad_School/Research/Concept_drift/Data/covtype_induced_drift_forward_norm.pkl"

with open(DRIFT_PICKLE_PATH, "rb") as pickle_file:
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The pickle holds a pair that is unpacked into the data and its change points.
    drift_df, change_points = pickle.load(pickle_file)

In [26]:
# Column schema as (column name, is_categorical) pairs.
numeric_feature_names = [
    "elevation",
    "aspect",
    "slope",
    "horizontal_dist_to_hydrology",
    "vertical_dist_to_hydrology",
    "horizontal_dist_to_roadways",
    "hillshade_9am",
    "hillshade_noon",
    "hillshade_3pm",
    "horizontal_dist_to_fire_points",
]
# wilderness_area_1 .. wilderness_area_4, all flagged categorical
wilderness_area_cols = [(f"wilderness_area_{area}", True) for area in range(1, 5)]

# Order: numerical features, wilderness-area flags, then the target last.
col_names = [
    *((name, False) for name in numeric_feature_names),
    *wilderness_area_cols,
    ("cover_type", True),
]

# Target drift

In [39]:
# Describe the dataset columns for evidently: which column is the target and
# which feature columns are numerical vs. categorical.
column_mapping = ColumnMapping()
column_mapping.target = 'cover_type'
column_mapping.prediction = None  # no prediction column is supplied
column_mapping.numerical_features = [col for col, iscat in col_names if not iscat]
# The target is flagged categorical in col_names but is not a feature. Exclude it
# by name instead of slicing off the last element, so the result stays correct
# even if the order of col_names changes.
column_mapping.categorical_features = [
    col for col, iscat in col_names if iscat and col != column_mapping.target
]

In [40]:
# Rows per window handed to Dataset.
WINDOW_SIZE = 17500 * 2

CT_dataset = Dataset(full_df=drift_df, column_mapping=column_mapping, window_size=WINDOW_SIZE)
# changepoints [[0, 123785, 247570, 371355]]

In [60]:
# Dashboard containing a single categorical-target drift tab.
target_drift_tabs = [CatTargetDriftTab(verbose_level=1)]
covtype_target_drift_dashboard = Dashboard(tabs=target_drift_tabs)

In [61]:
# Call shape: dashboard.calculate(ref_data_sample, prod_data_sample, column_mapping=column_mapping)
# Reference data: rows [0, change_points[1]) with features and target in one frame.
reference_df = CT_dataset.get_data_by_idx(
    start_idx=0, end_idx=change_points[1], split_labels=False
)
# Current data: rows [change_points[1], change_points[2]).
current_df = CT_dataset.get_data_by_idx(
    start_idx=change_points[1], end_idx=change_points[2], split_labels=False
)
covtype_target_drift_dashboard.calculate(
    reference_df, current_df, column_mapping=column_mapping
)

In [55]:
# Display the target-drift dashboard as this cell's output.
covtype_target_drift_dashboard.show()

In [56]:
# covtype_target_drift_dashboard.save('covtype_target_drift_dashboard.html')

## Target Profile

In [69]:
# Profile containing a single categorical-target drift section.
target_drift_sections = [CatTargetDriftProfileSection()]
covtype_target_drift_profile = Profile(sections=target_drift_sections)

In [70]:
# Reference data: rows [0, change_points[1]) with features and target in one frame.
reference_df = CT_dataset.get_data_by_idx(
    start_idx=0, end_idx=change_points[1], split_labels=False
)
# Current data: rows [change_points[1], change_points[2]).
current_df = CT_dataset.get_data_by_idx(
    start_idx=change_points[1], end_idx=change_points[2], split_labels=False
)
covtype_target_drift_profile.calculate(
    reference_df, current_df, column_mapping=column_mapping
)

In [71]:
# Show the computed target-drift profile as a JSON string (the cell output);
# the drift score is reported under metrics -> target_drift.
covtype_target_drift_profile.json()

'{"cat_target_drift": {"name": "cat_target_drift", "datetime": "2022-08-17 06:04:42.997705", "data": {"utility_columns": {"date": null, "id": null, "target": "cover_type", "prediction": null}, "cat_feature_names": ["wilderness_area_1", "wilderness_area_2", "wilderness_area_3", "wilderness_area_4"], "num_feature_names": ["aspect", "elevation", "hillshade_3pm", "hillshade_9am", "hillshade_noon", "horizontal_dist_to_fire_points", "horizontal_dist_to_hydrology", "horizontal_dist_to_roadways", "slope", "vertical_dist_to_hydrology"], "datetime_feature_names": [], "target_names": null, "metrics": {"target_name": "cover_type", "target_type": "cat", "target_drift": 0.0020902063696361713}}}, "timestamp": "2022-08-17 06:04:55.061731"}'

# Data Drift

In [79]:
# Dashboard containing a single data (feature) drift tab.
data_drift_tabs = [DataDriftTab(verbose_level=0)]
covtype_data_drift_dashboard = Dashboard(tabs=data_drift_tabs)

In [80]:
# Reference data: rows [0, change_points[1]) with features and target in one frame.
reference_df = CT_dataset.get_data_by_idx(
    start_idx=0, end_idx=change_points[1], split_labels=False
)
# Current data: rows [change_points[1], change_points[2]).
current_df = CT_dataset.get_data_by_idx(
    start_idx=change_points[1], end_idx=change_points[2], split_labels=False
)
covtype_data_drift_dashboard.calculate(
    reference_df, current_df, column_mapping=column_mapping
)

In [82]:
# Display the data-drift dashboard as this cell's output.
covtype_data_drift_dashboard.show()

In [83]:
# covtype_data_drift_dashboard.save('covtype_data_drift_dashboard.html')

## Data Profile

In [72]:
# Profile containing a single data (feature) drift section.
data_drift_sections = [DataDriftProfileSection()]
covtype_data_drift_profile = Profile(sections=data_drift_sections)

In [73]:
# Reference data: rows [0, change_points[1]) with features and target in one frame.
reference_df = CT_dataset.get_data_by_idx(
    start_idx=0, end_idx=change_points[1], split_labels=False
)
# Current data: rows [change_points[1], change_points[2]).
current_df = CT_dataset.get_data_by_idx(
    start_idx=change_points[1], end_idx=change_points[2], split_labels=False
)
covtype_data_drift_profile.calculate(
    reference_df, current_df, column_mapping=column_mapping
)

In [75]:
# Show the computed data-drift profile as a JSON string (the cell output).
# The output is long: it carries per-feature histograms in addition to the
# summary metrics (n_features, n_drifted_features, dataset_drift).
covtype_data_drift_profile.json()

'{"data_drift": {"name": "data_drift", "datetime": "2022-08-17 06:06:24.266386", "data": {"utility_columns": {"date": null, "id": null, "target": "cover_type", "prediction": null}, "cat_feature_names": ["wilderness_area_1", "wilderness_area_2", "wilderness_area_3", "wilderness_area_4", "cover_type"], "num_feature_names": ["aspect", "elevation", "hillshade_3pm", "hillshade_9am", "hillshade_noon", "horizontal_dist_to_fire_points", "horizontal_dist_to_hydrology", "horizontal_dist_to_roadways", "slope", "vertical_dist_to_hydrology"], "datetime_feature_names": [], "target_names": null, "options": {"confidence": null, "drift_share": 0.5, "nbinsx": 10, "xbins": null}, "metrics": {"n_features": 15, "n_drifted_features": 4, "share_drifted_features": 0.26666666666666666, "dataset_drift": false, "aspect": {"current_small_hist": [[1.5223977056993983, 1.6033445086238236, 1.3869208708648055, 0.9725734135799978, 0.7299753605041, 0.6735872682473638, 0.527850708890415, 0.6119481358807611, 0.85091085349

# Target and Data Drift

In [66]:
# Dashboard combining the data-drift tab and the categorical-target drift tab.
combined_drift_tabs = [
    DataDriftTab(verbose_level=0),
    CatTargetDriftTab(verbose_level=0),
]
covtype_target_and_data_drift_dashboard = Dashboard(tabs=combined_drift_tabs)

In [67]:
# Reference data: rows [0, change_points[1]) with features and target in one frame.
reference_df = CT_dataset.get_data_by_idx(
    start_idx=0, end_idx=change_points[1], split_labels=False
)
# Current data: rows [change_points[1], change_points[2]).
current_df = CT_dataset.get_data_by_idx(
    start_idx=change_points[1], end_idx=change_points[2], split_labels=False
)
covtype_target_and_data_drift_dashboard.calculate(
    reference_df, current_df, column_mapping=column_mapping
)

In [68]:
# Display the combined data-drift and target-drift dashboard as this cell's output.
covtype_target_and_data_drift_dashboard.show()

In [None]:
# covtype_target_and_data_drift_dashboard.save('covtype_target_and_data_drift_dashboard.html')

## Target and Data Drift Profile

In [76]:
# Profile combining the data-drift section and the categorical-target drift section.
combined_drift_sections = [
    DataDriftProfileSection(),
    CatTargetDriftProfileSection(),
]
covtype_target_and_data_drift_profile = Profile(sections=combined_drift_sections)

In [77]:
# Reference data: rows [0, change_points[1]) with features and target in one frame.
reference_df = CT_dataset.get_data_by_idx(
    start_idx=0, end_idx=change_points[1], split_labels=False
)
# Current data: rows [change_points[1], change_points[2]).
current_df = CT_dataset.get_data_by_idx(
    start_idx=change_points[1], end_idx=change_points[2], split_labels=False
)
covtype_target_and_data_drift_profile.calculate(
    reference_df, current_df, column_mapping=column_mapping
)

In [78]:
# Show the combined data-drift and target-drift profile as a JSON string (the
# cell output). The output is long because it includes per-feature histograms.
covtype_target_and_data_drift_profile.json()

'{"data_drift": {"name": "data_drift", "datetime": "2022-08-17 06:07:37.147134", "data": {"utility_columns": {"date": null, "id": null, "target": "cover_type", "prediction": null}, "cat_feature_names": ["wilderness_area_1", "wilderness_area_2", "wilderness_area_3", "wilderness_area_4", "cover_type"], "num_feature_names": ["aspect", "elevation", "hillshade_3pm", "hillshade_9am", "hillshade_noon", "horizontal_dist_to_fire_points", "horizontal_dist_to_hydrology", "horizontal_dist_to_roadways", "slope", "vertical_dist_to_hydrology"], "datetime_feature_names": [], "target_names": null, "options": {"confidence": null, "drift_share": 0.5, "nbinsx": 10, "xbins": null}, "metrics": {"n_features": 15, "n_drifted_features": 4, "share_drifted_features": 0.26666666666666666, "dataset_drift": false, "aspect": {"current_small_hist": [[1.5223977056993983, 1.6033445086238236, 1.3869208708648055, 0.9725734135799978, 0.7299753605041, 0.6735872682473638, 0.527850708890415, 0.6119481358807611, 0.85091085349

In [None]:
# https://jsonformatter.org