In [14]:
from river import datasets
from river import linear_model, tree
from river import drift, metrics
from river.datasets import synth
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Define the datasets
'lea': Local Expanding Abrupt drift.
The concept drift appears in two distinct regions of the instance space, while the remaining regions are left unaltered. There are three points of abrupt change in the training dataset. At every consecutive change the regions of drift are expanded.



In [15]:
dataset = synth.FriedmanDrift(
    drift_type='lea',
    position=(1, 2, 3),
    seed=42
)

In [16]:
# Initialize the data containers
data = []
for i, (x, y) in enumerate(dataset):
    x_values = list(x.values())
    data.append(x_values + [y])
    if i >= 5000:  # Limiting to 400 samples for simplicity
        break

In [17]:
# Define the column names
column_names = [f'x{i}' for i in range(1, len(x_values) + 1)] + ['y']

# Create the DataFrame
df = pd.DataFrame(data, columns=column_names)

In [18]:
# Split the DataFrame into train and test sets
df_train = df.iloc[:2500]
df_test = df.iloc[2500:]

print("Train DataFrame:")
print(df_train.head())

print("\nTest DataFrame:")
print(df_test.head())

Train DataFrame:
         x1        x2        x3        x4        x5        x6        x7  \
0  0.639427  0.025011  0.275029  0.223211  0.736471  0.676699  0.892180   
1  0.026536  0.198838  0.649884  0.544941  0.220441  0.589266  0.809430   
2  0.340251  0.155479  0.957213  0.336595  0.092746  0.096716  0.847494   
3  0.378534  0.552041  0.829405  0.618520  0.861707  0.577352  0.704572   
4  0.079792  0.232791  0.101001  0.277974  0.635684  0.364832  0.370181   

         x8        x9       x10          y  
0  0.086939  0.421922  0.029797   7.661207  
1  0.006499  0.805819  0.698139   8.330237  
2  0.603726  0.807128  0.729732   7.044929  
3  0.045824  0.227898  0.289388  18.160370  
4  0.209507  0.266978  0.936655  -2.655357  

Test DataFrame:
            x1        x2        x3        x4        x5        x6        x7  \
2500  0.364768  0.555611  0.730283  0.160368  0.207994  0.796483  0.543040   
2501  0.578037  0.166128  0.252746  0.637494  0.139315  0.466338  0.961549   
2502  0.887

In [19]:
# Split features and target
X_train = df_train.drop(columns='y').values
y_train = df_train['y'].values
X_test = df_test.drop(columns='y').values
y_test = df_test['y'].values

# Fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

In [20]:
# Calculate the initial error
initial_mse = mean_squared_error(y_test, y_pred)

In [21]:
print("Initial Mean Squared Error:", initial_mse)

Initial Mean Squared Error: 14.096163441654177


In [22]:
# ADWIN drift detector - river
# adwin = drift.ADWIN()

In [29]:
pip install scikit-multiflow

Collecting scikit-multiflow
  Downloading scikit-multiflow-0.5.3.tar.gz (450 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/450.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/450.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.6/450.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-multiflow
  Building wheel for scikit-multiflow (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-multiflow: filename=scikit_multiflow-0.5.3-cp310-cp310-linux_x86_64.whl size=1254708 sha256=375575e302fa7da15c9fe7af9838d4789e82240842ae39bfe1c61f58140eb4ca
  Stored in directory: /root/.cache/pip/wheels/6e/1b/56/45b17a6cf203d98000a45976cb0dd0c4c3f11960e6a505f231
Successfully built scikit-multiflow
Installing collected packages

In [31]:
from skmultiflow.drift_detection.adwin import ADWIN
# ADWIN drift detector - skmultiflow
adwinMUL = ADWIN()

In [39]:
from skmultiflow.drift_detection import DDM
# DDM drift detector - skmultiflow
ddmMUL = DDM()

In [42]:
from skmultiflow.drift_detection.eddm import EDDM
# EDDM drift detector - skmultiflow
eddmMUL = EDDM()

In [45]:
from skmultiflow.drift_detection import PageHinkley
# PageHinkley drift detector - skmultiflow
ph = PageHinkley()

In [46]:
# Function to check for drift using ADWIN
def check_drift(y_true, y_pred):
    drift_detected = False
    for yt, yp in zip(y_true, y_pred):
        error = (yt - yp) ** 2
        ph.add_element(error)
        if ph.detected_change():
            drift_detected = True
    return drift_detected


In [47]:
# Check for drift
drift_detected = check_drift(y_test, y_pred)

print(f"Initial MSE: {initial_mse}")
print(f"Drift Detected: {drift_detected}")

if drift_detected:
    print("Concept drift detected.")
else:
    print("No concept drift detected.")

Initial MSE: 14.096163441654177
Drift Detected: True
Concept drift detected.


# Linear Regression - River

*   ADWIN: not detected
*   Page Hinkley: detected

# Linear Regression - skmultiflow

*   ADWIN: not detected
*   DDM: not detected
*   EDDM: not detected
*   Page Hinkley: detected
