# Data Import

In [1]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from geopy.distance import geodesic

from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
dataset_path = '/kaggle/input/sncb-data-augumentation/enriched_cleaned_ar41_for_ulb.csv'

# Load the dataset only when the file is actually present at the expected path.
if os.path.exists(dataset_path):
    data = pd.read_csv(dataset_path)

    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so it is called directly; printing its return value would only add a
    # stray "None" line to the output.
    data.info()

    # Preview the first few rows of the dataframe
    data_head = data.head()
    print(data_head)
else:
    # NOTE: `data` is not defined on this path, so the cells below that use
    # `data` cannot run until the path is corrected.
    print(f"The file {dataset_path} does not exist.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17677337 entries, 0 to 17677336
Data columns (total 30 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Unnamed: 0          int64  
 1   timestamps_UTC      object 
 2   mapped_veh_id       float64
 3   lat                 float64
 4   lon                 float64
 5   RS_E_InAirTemp_PC1  float64
 6   RS_E_InAirTemp_PC2  float64
 7   RS_E_OilPress_PC1   float64
 8   RS_E_OilPress_PC2   float64
 9   RS_E_RPM_PC1        float64
 10  RS_E_RPM_PC2        float64
 11  RS_E_WatTemp_PC1    float64
 12  RS_E_WatTemp_PC2    float64
 13  RS_T_OilTemp_PC1    float64
 14  RS_T_OilTemp_PC2    float64
 15  date                object 
 16  hour                float64
 17  dayofweek           float64
 18  weekday             object 
 19  Distance            float64
 20  Speed               float64
 21  date_hour           object 
 22  datetime            object 
 23  weather_main        object 
 24  temp                fl

# Naive Labeling -- KMeans

In [3]:
# Columns fed to the clustering step, split by how they are preprocessed.
# NOTE(review): 'mapped_veh_id' looks like an identifier rather than a
# measurement; scaling it as a numeric feature lets the vehicle id influence
# cluster distances -- confirm this is intended.
numeric_columns = [
    'mapped_veh_id', 'lat', 'lon',
    'RS_E_InAirTemp_PC1', 'RS_E_InAirTemp_PC2',
    'RS_E_OilPress_PC1', 'RS_E_OilPress_PC2',
    'RS_E_RPM_PC1', 'RS_E_RPM_PC2',
    'RS_E_WatTemp_PC1', 'RS_E_WatTemp_PC2',
    'RS_T_OilTemp_PC1', 'RS_T_OilTemp_PC2',
    'hour', 'Distance', 'Speed',
    'temp', 'feels_like', 'pressure', 'humidity', 'wind', 'clouds',
]
categorical_columns = ['weather_main', 'weekday']

# Numeric features: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the literal 'missing', then
# one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ])

# Apply preprocessing
data_processed = preprocessor.fit_transform(data)

# K-Means clustering into two groups, intended as a naive normal/abnormal split.
# random_state is pinned so that the labels stored below are reproducible across
# runs; without it the centroid initialisation (and therefore the assignment)
# can differ on every execution.
# Note that cluster ids 0/1 are arbitrary: K-Means itself does not say which
# group is the "abnormal" one.
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(data_processed)
data['KMeans_cluster'] = clusters

# View clustering results (number of rows assigned to each cluster)
print(data['KMeans_cluster'].value_counts())



KMeans_cluster
1    11710210
0     5967127
Name: count, dtype: int64


In [4]:
# Display the frame, now including the new 'KMeans_cluster' column
# (the notebook renders a truncated preview of the first and last rows).
data

Unnamed: 0.1,Unnamed: 0,timestamps_UTC,mapped_veh_id,lat,lon,RS_E_InAirTemp_PC1,RS_E_InAirTemp_PC2,RS_E_OilPress_PC1,RS_E_OilPress_PC2,RS_E_RPM_PC1,...,date_hour,datetime,weather_main,temp,feels_like,pressure,humidity,wind,clouds,KMeans_cluster
0,0,2023-01-23 07:25:08,102.0,51.02,3.77,17.0,18.0,210.0,210.0,858.0,...,2023-01-23 07:00:00,2023-01-23 07:00:00,Clouds,274.54,271.75,1035.0,92.0,2.52,100.0,0
1,1,2023-01-23 07:25:16,102.0,51.02,3.77,17.0,20.0,200.0,200.0,801.0,...,2023-01-23 07:00:00,2023-01-23 07:00:00,Clouds,274.54,271.75,1035.0,92.0,2.52,100.0,0
2,2,2023-01-23 07:25:37,102.0,51.02,3.77,19.0,20.0,193.0,207.0,803.0,...,2023-01-23 07:00:00,2023-01-23 07:00:00,Clouds,274.54,271.75,1035.0,92.0,2.52,100.0,0
3,3,2023-01-23 07:25:41,102.0,51.02,3.77,19.0,20.0,196.0,203.0,801.0,...,2023-01-23 07:00:00,2023-01-23 07:00:00,Clouds,274.54,271.75,1035.0,92.0,2.52,100.0,0
4,4,2023-01-23 07:26:10,102.0,51.02,3.77,19.0,21.0,200.0,203.0,795.0,...,2023-01-23 07:00:00,2023-01-23 07:00:00,Clouds,274.54,271.75,1035.0,92.0,2.52,100.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17677332,17677332,2023-09-13 17:33:03,197.0,50.40,4.45,37.0,40.0,220.0,258.0,803.0,...,2023-09-13 18:00:00,2023-09-13 18:00:00,Clouds,290.38,290.22,1021.0,79.0,3.60,20.0,1
17677333,17677333,2023-09-13 17:33:58,197.0,50.40,4.45,37.0,38.0,224.0,307.0,843.0,...,2023-09-13 18:00:00,2023-09-13 18:00:00,Clouds,290.38,290.22,1021.0,79.0,3.60,20.0,1
17677334,17677334,2023-09-13 17:34:03,197.0,50.40,4.45,37.0,38.0,224.0,307.0,841.0,...,2023-09-13 18:00:00,2023-09-13 18:00:00,Clouds,290.38,290.22,1021.0,79.0,3.60,20.0,1
17677335,17677335,2023-09-13 17:34:58,197.0,50.40,4.46,36.0,38.0,207.0,244.0,800.0,...,2023-09-13 18:00:00,2023-09-13 18:00:00,Clouds,290.38,290.22,1021.0,79.0,3.60,20.0,1


# Save to CSV

In [5]:
# The frame uses a default RangeIndex and already carries the row number in its
# 'Unnamed: 0' column, so the index is not written again: with index=True every
# load/save round-trip adds one more redundant 'Unnamed: 0.N' column.
data.to_csv('enriched_cleaned_ar41_for_ulb.csv', index=False)