## Importing Requirements

In [29]:
import pandas as pd
from scipy.io import arff
from fastdtw import fastdtw
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


## Reading the data

In [30]:
# Parse the ARFF training file; the loaded object is indexed with [0] further
# down to obtain the records that go into the DataFrame.
train_arff_path = 'ElectricDevices_TRAIN.arff'
arff_file = arff.loadarff(train_arff_path)


In [31]:
# Build the working DataFrame from the first element of the loadarff result
# and preview its first rows.
records = arff_file[0]
df = pd.DataFrame(records)
df.head()


Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att88,att89,att90,att91,att92,att93,att94,att95,att96,target
0,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,...,2.91274,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,-0.186109,b'1'
1,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,...,6.47667,1.645079,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,-0.177005,b'1'
2,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,...,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,-0.213535,b'1'
3,-0.171472,-0.171472,-0.171472,-0.171472,-0.171472,-0.171472,-0.171472,-0.171472,-0.171472,-0.171472,...,0.180433,0.160882,7.355375,1.177495,-0.171472,-0.171472,-0.171472,-0.171472,-0.171472,b'1'
4,-0.169266,-0.169266,-0.169266,-0.169266,-0.169266,-0.169266,-0.169266,-0.169266,-0.169266,-0.169266,...,6.074392,0.170933,0.190945,0.190945,7.395165,0.851332,-0.169266,-0.169266,-0.169266,b'1'


In [32]:
# Preview the last rows of the frame (the recorded output shows target b'5' there).
df.tail()


Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att88,att89,att90,att91,att92,att93,att94,att95,att96,target
8921,-0.611983,-0.611983,-0.611983,-0.611983,-0.611983,-0.611983,-0.611983,-0.611983,-0.611983,-0.611983,...,1.628497,1.711478,1.711478,1.711478,1.628497,-0.611983,-0.611983,-0.611983,-0.611983,b'5'
8922,-0.498369,-0.498369,-0.498369,-0.498369,-0.498369,-0.498369,-0.498369,-0.498369,-0.498369,-0.498369,...,2.315952,2.315952,2.218906,2.218906,1.733678,-0.498369,-0.498369,-0.498369,-0.498369,b'5'
8923,-0.399394,-0.399394,-0.399394,-0.399394,-0.399394,-0.399394,-0.399394,-0.399394,-0.399394,-0.399394,...,2.64694,2.64694,2.436848,2.64694,2.541894,0.440974,-0.399394,-0.399394,-0.399394,b'5'
8924,-0.595599,-0.595599,-0.595599,-0.595599,-0.595599,-0.595599,-0.595599,-0.595599,-0.595599,-0.595599,...,1.783125,1.783125,1.871226,1.783125,1.783125,1.783125,1.871226,1.871226,1.783125,b'5'
8925,-0.545183,-0.545183,-0.545183,-0.545183,-0.545183,-0.545183,-0.545183,-0.545183,-0.545183,-0.545183,...,2.003086,2.199107,2.101097,2.199107,2.101097,0.04288,-0.545183,-0.545183,-0.545183,b'5'


In [33]:
# Inspect the last 50 labels. The recorded output shows that they are byte
# strings (e.g. b'5'), which is why they are decoded before label encoding.
df['target'].tail(50)


8876    b'5'
8877    b'5'
8878    b'5'
8879    b'5'
8880    b'5'
8881    b'5'
8882    b'5'
8883    b'5'
8884    b'5'
8885    b'5'
8886    b'5'
8887    b'5'
8888    b'5'
8889    b'5'
8890    b'5'
8891    b'5'
8892    b'5'
8893    b'5'
8894    b'5'
8895    b'5'
8896    b'5'
8897    b'5'
8898    b'5'
8899    b'5'
8900    b'5'
8901    b'5'
8902    b'5'
8903    b'5'
8904    b'5'
8905    b'5'
8906    b'5'
8907    b'5'
8908    b'5'
8909    b'5'
8910    b'5'
8911    b'5'
8912    b'5'
8913    b'5'
8914    b'5'
8915    b'5'
8916    b'5'
8917    b'5'
8918    b'5'
8919    b'5'
8920    b'5'
8921    b'5'
8922    b'5'
8923    b'5'
8924    b'5'
8925    b'5'
Name: target, dtype: object

## Pre-processing

In [34]:
# Forward-fill missing values. `fillna(method='ffill')` is deprecated in current
# pandas in favour of `DataFrame.ffill()`; rebinding instead of `inplace=True`
# keeps the cell free of in-place mutation.
# NOTE(review): the fill runs down the rows (axis=0), so a gap is filled from the
# previous *sample* rather than from the previous time step of the same series.
# Confirm this is intended; filling along the att* columns would be axis=1.
df = df.ffill()

In [35]:
# Split data into train and test sets.
# A fixed random_state makes the split (and everything computed from it)
# reproducible across kernel restarts; stratifying on the label keeps the class
# proportions the same in the train and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('target', axis=1),
    df['target'],
    test_size=0.2,
    random_state=42,
    stratify=df['target'],
)

In [36]:
# Sanity-check the split: feature-matrix shape, label container type and the
# distinct label values present in the training labels.
split_summary = (
    ("X_train shape:", X_train.shape),
    ("y_train type:", type(y_train)),
    ("Unique labels in y_train:", y_train.unique()),
)
for caption, value in split_summary:
    print(caption, value)

X_train shape: (7140, 96)
y_train type: <class 'pandas.core.series.Series'>
Unique labels in y_train: [b'4' b'1' b'2' b'3' b'5' b'7' b'6']


In [37]:
from sklearn.preprocessing import LabelEncoder

# The labels are bytes objects; decode them to plain strings first.
y_train_str = y_train.str.decode('utf-8')

# Fit the encoder on the training labels, then map those labels to numeric codes.
label_encoder = LabelEncoder().fit(y_train_str)
y_train_encoded = label_encoder.transform(y_train_str)


def dtw_distance(ts1, ts2):
    """Return the fastdtw distance between two sequences.

    fastdtw yields a pair; only its first element (the distance) is returned,
    the second element is discarded.
    """
    return fastdtw(ts1, ts2)[0]

# k-nearest-neighbours classifier (k=5) that compares samples with the
# dtw_distance callable instead of a built-in metric.
knn_dtw = KNeighborsClassifier(metric=dtw_distance, n_neighbors=5)

# Train on the integer-encoded training labels.
knn_dtw.fit(X_train, y_train_encoded)

In [38]:
from sklearn.metrics import accuracy_score

# Predict the test set in slices of `batch_size` rows and collect the results.
batch_size = 1000
y_pred = []
for batch_start in range(0, len(X_test), batch_size):
    X_batch = X_test.iloc[batch_start:batch_start + batch_size]
    y_pred.extend(knn_dtw.predict(X_batch))

# Test labels get the same bytes -> str -> numeric treatment as the training labels.
y_test_str = y_test.str.decode('utf-8')
y_test_encoded = label_encoder.transform(y_test_str)

# Score the predictions against the encoded ground truth.
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)


KeyboardInterrupt: 

In [None]:


batch_size = 1000
y_pred = []

# Predict the test set in slices of `batch_size` rows.
for i in range(0, len(X_test), batch_size):
    X_batch = X_test.iloc[i:i+batch_size]
    y_batch_pred = knn_dtw.predict(X_batch)
    y_pred.extend(y_batch_pred)

# knn_dtw was fitted on the encoded labels, so its predictions are encoder
# codes, while y_test still holds the raw byte-string labels. Encode y_test with
# the same encoder before scoring; comparing the two directly can never match.
y_test_encoded = label_encoder.transform(y_test.str.decode('utf-8'))
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)


## Training Model & Parameter Search

In [22]:
# Predict and evaluate
y_pred = knn_dtw.predict(X_test)

# The classifier was fitted on encoded labels, so y_pred holds encoder codes;
# y_test holds raw byte strings and has to go through the same decoding and
# encoding before the two can be compared.
y_test_encoded = label_encoder.transform(y_test.str.decode('utf-8'))
accuracy = accuracy_score(y_test_encoded, y_pred)
print("Accuracy:", accuracy)

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for grid search
param_grid = {'n_neighbors': [3, 5, 7, 10]}

# The base estimator is passed inline rather than bound to `knn_dtw`, so the
# classifier fitted earlier is not replaced by an unfitted one.
# n_jobs=-1 lets the candidate/fold fits run in parallel; with a Python-level
# DTW metric every fit/score round is expensive, so serial execution is slow.
grid_search = GridSearchCV(
    KNeighborsClassifier(metric=dtw_distance),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train_encoded)


print("Best Parameters:", grid_search.best_params_)
print("Best CV Score (Accuracy):", grid_search.best_score_)


## Visualisations

In [None]:
import matplotlib.pyplot as plt

# Plot some sample time-series
num_samples = 5

# Seeded generator: the same rows are drawn on every run, so the figure is
# reproducible after a kernel restart.
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(X_train), num_samples, replace=False)

# squeeze=False always returns a 2-D array of axes, so num_samples == 1 works too.
fig, axes = plt.subplots(num_samples, 1, figsize=(12, 6), squeeze=False)
for ax, idx in zip(axes[:, 0], sample_indices):
    ax.plot(X_train.iloc[idx])
    ax.set_title(f"Time-Series Sample {idx}")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Count plot of class distribution, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=y_train_str, ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Summary statistics of every feature column, as produced by DataFrame.describe().
feature_stats = X_train.describe()

# One box per statistic: transposing puts the statistics in the columns.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=feature_stats.T, ax=ax)
ax.set_title("Feature Statistics")
ax.set_xlabel("Statistic")
ax.set_ylabel("Value")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Compute correlation matrix
corr_matrix = X_train.corr()

# Visualize correlation matrix.
# X_train has 96 feature columns, so the matrix is 96x96: per-cell annotations
# would be thousands of overlapping, unreadable labels, hence they are left off.
# vmin/vmax pin the diverging colour map to the correlation range so that zero
# correlation maps to the neutral colour.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix of Time-Series Features")
plt.show()


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Perform seasonal decomposition
decomposition = seasonal_decompose(X_train.iloc[0], model='additive', period=24)  # Assuming hourly data

# Plot decomposition components.
# decomposition.plot() creates and returns its own figure, so calling
# plt.figure(figsize=...) beforehand only leaves an empty extra figure and the
# requested size never reaches the real plot. Size and title the returned
# figure instead, then re-run the layout for the new size.
fig = decomposition.plot()
fig.set_size_inches(12, 8)
fig.suptitle("Seasonal Decomposition of a Time-Series")
fig.tight_layout()
plt.show()


## Improving Results

In [None]:
from tsfresh import extract_features

# X_train is in wide format (one row per series, one column per time step) and
# has no 'time_series_id' column, so it is reshaped to long format first:
# one row per (series, time step) with explicit id / time / value columns.
time_position = {column: position for position, column in enumerate(X_train.columns)}
X_train_long = (
    X_train
    .rename_axis(index='time_series_id')  # the row index identifies the series
    .reset_index()
    .melt(id_vars='time_series_id', var_name='time', value_name='value')
)
# Replace each column label by its position so sorting follows the column order.
X_train_long['time'] = X_train_long['time'].map(time_position)

# Extract features from time-series data
X_train_features = extract_features(
    X_train_long,
    column_id='time_series_id',
    column_sort='time',
    column_value='value',
    default_fc_parameters=None,
)


In [None]:
from sklearn.decomposition import PCA

# Dimensionality reduction: project the training features onto a fixed number
# of principal components.
n_pca_components = 10
pca = PCA(n_components=n_pca_components)
X_train_pca = pca.fit_transform(X_train)


In [None]:
import time

#Checking the time based on performance improvements

for 