In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Column names used throughout the notebook: identifier, timestamp and position
# columns first, then every sensor prefix expanded into its _PC1 / _PC2 pair.
all_columns = [
    "mapped_veh_id",
    "timestamps_UTC",
    "lat",
    "lon",
    *(
        f"{sensor}_PC{unit}"
        for sensor in (
            "RS_E_OilPress",
            "RS_E_RPM",
            "RS_E_InAirTemp",
            "RS_E_WatTemp",
            "RS_T_OilTemp",
        )
        for unit in (1, 2)
    ),
]



In [36]:
# Only the vehicle id gets an explicit dtype; every other column keeps the
# dtype pandas infers while parsing.
col_types = {"mapped_veh_id": np.int32}

# The file is ';'-separated; index_col=False stops pandas from using the
# first column as the index.
data = pd.read_csv(
    "../data/cleaned_sorted_full_data.csv",
    sep=";",
    index_col=False,
    dtype=col_types,
)
data.columns

Index(['mapped_veh_id', 'timestamps_UTC', 'lat', 'lon', 'RS_E_InAirTemp_PC1',
       'RS_E_InAirTemp_PC2', 'RS_E_OilPress_PC1', 'RS_E_OilPress_PC2',
       'RS_E_RPM_PC1', 'RS_E_RPM_PC2', 'RS_E_WatTemp_PC1', 'RS_E_WatTemp_PC2',
       'RS_T_OilTemp_PC1', 'RS_T_OilTemp_PC2', 'z_score'],
      dtype='object')

In [37]:
#____________________CLEANING THE CSV FILE

def clean_csv(data, output_file):
    """
    Drop rows with out-of-range sensor readings and write the rest to a CSV file.

    A row is kept only when ALL of the following hold:
        - 'RS_E_InAirTemp_PC1' < 30000
        - 'RS_E_InAirTemp_PC2' < 1000
        - 'RS_E_OilPress_PC1'  < 689

    Rows where one of these three values is NaN are dropped as well, because a
    comparison against NaN evaluates to False.

    Parameters:
        data: DataFrame containing the three columns listed above. It is not modified.
        output_file: Path of the CSV file to write (';'-separated, no index column).

    Returns:
        None. The filtered rows are only written to `output_file`.
    """
    # Combine the three conditions into one mask so that every filter applies.
    # Filtering `data` again for each condition would discard the previous
    # filters and keep only the effect of the last one.
    keep_row = (
        (data['RS_E_InAirTemp_PC1'] < 30000)
        & (data['RS_E_InAirTemp_PC2'] < 1000)
        & (data['RS_E_OilPress_PC1'] < 689)
    )
    cleaned_data = data[keep_row]

    # Write the cleaned data to a new CSV file
    cleaned_data.to_csv(output_file, index=False, sep=';')

# Write the filtered rows to a separate file so the input CSV stays untouched.
output_file_path = "../data/cleaned_sorted_full_data_bis.csv"
clean_csv(data, output_file=output_file_path)

In [33]:
#_______________UAD (Univariate Anomaly Detection):
"""
    This method looks at one feature at a time to identify anomalies. If the anomalies can be detected by examining 
    individual metrics, then we can apply statistical methods to do this. Methods like:
        -Z-Score: Measures how many standard deviations a data point is from the mean. Data points with an absolute z-score above a threshold (e.g., 3) 
            are considered outliers.
        -IQR (Interquartile Range): It's the range between the first quartile and the third quartile. Data points below (1st quartile - 1.5 x IQR) or 
            above (3rd quartile + 1.5 x IQR) can be considered outliers.
"""

def z_score_anomaly_detection(chunk, column_name, threshold=2):
    """
    Detect anomalies in one column of `chunk` using the z-score.

    The z-score of a value is its distance from the column mean expressed in
    standard deviations: z = (value - mean) / std_dev. A row is reported as an
    anomaly when the absolute z-score of its value is strictly greater than
    `threshold`. This is useful when errors show up within a single column.

    Parameters:
        chunk: DataFrame to analyse. It is NOT modified.
        column_name: Name of the numeric column to analyse.
        threshold: Absolute z-score above which a row is an anomaly (default 2).

    Returns:
        (anomalies, mean, std_dev) where `anomalies` is a copy of the anomalous
        rows with an extra (or overwritten) 'z_score' column, and `mean` /
        `std_dev` are the statistics of `chunk[column_name]`.

    Note:
        When the standard deviation is 0 or NaN (constant, empty or single-row
        column) the z-scores are NaN/inf; NaN never exceeds the threshold, so a
        constant column yields no anomalies.
    """
    values = chunk[column_name]
    mean = values.mean()
    std_dev = values.std()

    # Keep the z-scores in a separate Series instead of writing them into
    # `chunk`: mutating the caller's frame leaks a 'z_score' column into later
    # cells and triggers SettingWithCopy warnings when `chunk` is a slice.
    z_scores = (values - mean) / std_dev
    is_anomaly = z_scores.abs() > threshold

    # Copy the selected rows so adding the column cannot touch `chunk`.
    anomalies = chunk.loc[is_anomaly].copy()
    anomalies['z_score'] = z_scores[is_anomaly]

    return anomalies, mean, std_dev

def write_correct_informations(output_file,name,mean,std_dev,len_data,len_anomalies,ratio_of_issues,treshold_min,treshold_max,veh_id,probabilty_to_appears,threshold=2,output_dir='../Adel_temp'):
    """
    Append one report entry (column statistics + one vehicle's share) to a text file.

    Parameters:
        output_file: Name of the report file inside `output_dir`.
        name: Name of the analysed column.
        mean, std_dev: Statistics of the analysed column.
        len_data: Number of rows analysed.
        len_anomalies: Number of rows flagged as anomalies.
        ratio_of_issues: Anomalies / rows, as a FRACTION in [0, 1].
        treshold_min, treshold_max: Cut-offs used to pick the report file; written as-is.
        veh_id: Vehicle identifier this entry is about.
        probabilty_to_appears: Share of the anomalies belonging to `veh_id`,
            as a FRACTION in [0, 1].
        threshold: z-score threshold reported in the header line (default 2).
        output_dir: Directory holding the report files (default '../Adel_temp').

    Returns:
        None. The entry is appended to the file; existing content is kept.
    """
    # Mode 'a': every call adds one entry below the previous ones.
    with open(f'{output_dir}/{output_file}', 'a') as file:
        file.write(f"column_name ({name}) with a threshold ({threshold}): \n\t-The mean value: {mean} \n\t-The standard deviation: {std_dev}\n")
        # The '%' presentation type multiplies by 100, so a fraction of 0.05
        # is rendered as "5.00%" rather than "0.05%".
        file.write(f"Here's the setup: \n\t-data: {len_data} \n\t-anomalies: {len_anomalies} \n\t-ratio: {ratio_of_issues:.2%} \n\t-treshold min: {treshold_min} \n\t-treshold max: {treshold_max}\n")

        file.write(f"\t\t-Vehicle ID {veh_id} appeared {probabilty_to_appears:.2%} of times in the anomalie {name}.\n")
        file.write('______'*10 + "\n\n")

def clean_files(file_name_list):
    """
    Empty (or create) every report file named in `file_name_list`.

    Each name is resolved inside the '../Adel_temp/' directory.
    """
    for report_name in file_name_list:
        report_path = f'../Adel_temp/{report_name}'
        # Opening in 'w' mode truncates the file; the empty write changes nothing.
        with open(report_path, 'w') as handle:
            handle.write("")

def interprete_data_and_write(data,treshold_min,treshold_max):
    """
    Run the z-score detection on every analysed column and write per-vehicle reports.

    For each column, the anomalous rows are grouped by 'mapped_veh_id'. For each
    vehicle, its share of that column's anomalies decides which report file
    receives the entry:
        - share <  treshold_min                -> '0_under_treshold.txt'
        - treshold_min <= share < treshold_max -> '1_between_treshold.txt'
        - otherwise                            -> '2_over_treshold.txt'

    Parameters:
        data: DataFrame holding 'mapped_veh_id' and every analysed column.
        treshold_min: Lower cut-off on a vehicle's share of the anomalies (fraction).
        treshold_max: Upper cut-off on a vehicle's share of the anomalies (fraction).

    Returns:
        None. The three report files are emptied first, then entries are appended.
    """
    # NOTE(review): 'mapped_veh_id', 'lat' and 'lon' are analysed like the
    # sensor columns - confirm that a z-score on them is intended.
    current_all_columns = [
        "mapped_veh_id",
        "lat", "lon",
        "RS_E_OilPress_PC1", "RS_E_OilPress_PC2",
        "RS_E_RPM_PC1", "RS_E_RPM_PC2",
        "RS_E_InAirTemp_PC1", "RS_E_InAirTemp_PC2",
        "RS_E_WatTemp_PC1", "RS_E_WatTemp_PC2",
        "RS_T_OilTemp_PC1", "RS_T_OilTemp_PC2"
    ]

    file_name_list = ['0_under_treshold.txt','1_between_treshold.txt','2_over_treshold.txt']
    # Start from empty report files so entries from a previous run do not pile up.
    clean_files(file_name_list)

    total_rows = len(data)

    for name in current_all_columns:
        anomalies, mean, std_dev = z_score_anomaly_detection(data, name)
        anomaly_count = len(anomalies)
        # Guard against an empty frame: report a ratio of 0 instead of raising
        # ZeroDivisionError.
        ratio_of_issues = anomaly_count / total_rows if total_rows else 0.0

        # Count occurrences of each unique mapped_veh_id among the anomalies
        veh_id_counts = anomalies['mapped_veh_id'].value_counts()

        # One report entry per vehicle. With no anomalies the counts are empty
        # and the loop body (including the division below) never runs.
        for veh_id, count in veh_id_counts.items():
            probabilty_to_appears = count / anomaly_count

            if probabilty_to_appears < treshold_min:
                output_file = file_name_list[0]
            elif probabilty_to_appears < treshold_max:
                output_file = file_name_list[1]
            else:
                output_file = file_name_list[2]

            write_correct_informations(output_file,name,mean,std_dev,total_rows,anomaly_count,ratio_of_issues,treshold_min,treshold_max,veh_id,probabilty_to_appears)

# Cut-offs on a vehicle's share of a column's anomalies; they decide which of
# the three report files receives the vehicle's entry.
treshold_min, treshold_max = 0.05, 0.1
interprete_data_and_write(data, treshold_min=treshold_min, treshold_max=treshold_max)

In [None]:
#!pip install plotly
#!pip install nbformat>=4.2.0

In [None]:


#_______________MAD (Multivariate Anomaly Detection):
"""
This looks at combinations of features to detect anomalies.

    -PCA (Principal Component Analysis): Transform data into principal components and then analyze residuals (difference between original and reconstructed data) to detect 
        anomalies.
    -Clustering (e.g., K-means): Data points that are far from cluster centers can be considered anomalies.


"""

#_______________TSAD (Time-Series Anomaly Detection):
"""
In the context of trains/metros, we might want to consider methods designed for such data.
    -Exponential Smoothing, ARIMA: Model the time-series and then check for data points that deviate significantly from the predicted values.
    -Moving Average: Calculate the moving average and detect points that deviate from this average by more than a predefined threshold.
"""




'\nIn the context of trains/metros, we might want to consider methods designed for such data.\n    -Exponential Smoothing, ARIMA: Model the time-series and then check for data points that deviate significantly from the predicted values.\n    -Moving Average: Calculate the moving average and detect points that deviate from this average by more than a predefined threshold.\n'