# IndustriALL AI Challenge

---


### Reading Data


In [26]:
import os
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from multiprocessing import cpu_count

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# Directory that holds the input CSV files. It is joined to file names by plain
# string concatenation (DATAFOLDER + file), so the trailing slash is required.
DATAFOLDER = '../data/'
# Prefix shared by the per-tag file names, e.g. 'TAG_iALL_PS_00.csv'.
BASE_NAME = 'TAG_iALL_PS_'

In [15]:
# True when a previously merged 'full.csv' cache file is present in DATAFOLDER.
# isfile() tests that single path instead of listing the whole folder, returns
# False (rather than raising) when DATAFOLDER is missing, and does not mistake
# a directory named 'full.csv' for the cache.
full_csv_exists = os.path.isfile(os.path.join(DATAFOLDER, 'full.csv'))

In [16]:
def read(file):
    """Load one per-tag CSV from DATAFOLDER and return its value column.

    Parameters
    ----------
    file : str
        File name (not a path) inside DATAFOLDER.

    Returns
    -------
    pandas.Series or None
        The column named after the part of ``file`` before its first dot,
        indexed by the file's 'timestamp' column. None for names ending in
        '00.csv', because the caller loads the base file itself.

    Raises
    ------
    KeyError
        If the file has no column named after its stem.
    """
    if file.endswith('00.csv'):
        return None

    print('Reading file: ' + file)
    df = pd.read_csv(DATAFOLDER + file, index_col='timestamp')

    return df[file.split('.')[0]]


start_time = time.perf_counter()  # monotonic clock, meant for measuring intervals

if full_csv_exists:
    # Fast path: reuse the merged file written by a previous run.
    full_df = pd.read_csv(DATAFOLDER + 'full.csv')
else:
    # Only real input CSVs, in a stable order: skip stray non-CSV files, the
    # base file (loaded separately below) and any leftover merged output.
    files_to_process = sorted(
        file for file in os.listdir(DATAFOLDER)
        if file.endswith('.csv')
        and not file.endswith('00.csv')
        and file != 'full.csv'
    )

    # Threads instead of processes: worker processes must be able to import
    # __main__, which is not possible for a function defined in a notebook
    # cell on platforms that do not fork.
    with ThreadPoolExecutor(cpu_count()) as executor:
        dfs = list(executor.map(read, files_to_process))

    # Align every tag column on the shared 'timestamp' index.
    base_df = pd.read_csv(DATAFOLDER + BASE_NAME + '00.csv', index_col='timestamp')
    full_df = pd.concat([base_df] + dfs, axis=1)

    # Rename the target column to 'status'; a no-op when the column is absent.
    full_df = full_df.rename(columns={'target_iALL_PS': 'status'})

    # Save the merged result so later runs can take the fast path.
    full_df.to_csv(DATAFOLDER + 'full.csv')

    # The cached file is read back with 'timestamp' as an ordinary column;
    # expose the same layout here so downstream cells see one consistent frame
    # regardless of which branch ran.
    full_df = full_df.reset_index()

end_time = time.perf_counter()
print('Elapsed time: ' + str(end_time - start_time) + ' seconds')
    

Elapsed time: 2.292546272277832 seconds


---

### Processing Data

In [34]:
# Discard rows that exactly repeat an earlier row.
full_df = full_df.drop_duplicates()

# Subset containing only the rows whose status is 'ANORMAL'.
is_anormal = full_df['status'].eq('ANORMAL')
anormal_df = full_df.loc[is_anormal]
print(anormal_df.shape)
print(anormal_df.describe())

# Distribution of the status column: absolute counts first, then proportions.
for as_fraction in (False, True):
    print(full_df['status'].value_counts(normalize=as_fraction))

(14484, 54)
       TAG_iALL_PS_00  TAG_iALL_PS_01  TAG_iALL_PS_02  TAG_iALL_PS_03  \
count     4290.000000    14454.000000    14479.000000    14479.000000   
mean         2.415374       86.770994       90.787765       81.811783   
std          2.355662       44.461730       48.586422       41.593146   
min         -5.793448      -69.294665     -111.724200      -92.964574   
25%          0.792655       57.346602       57.807068       53.556725   
50%          2.410244       86.918351       90.736031       81.445123   
75%          4.020684      116.691624      124.255120      110.041727   
max          9.705992      250.501402      260.798301      231.295301   

       TAG_iALL_PS_04  TAG_iALL_PS_05  TAG_iALL_PS_06  TAG_iALL_PS_07  \
count    14479.000000    14479.000000     9700.000000     9047.000000   
mean       683.576051      118.238128       19.469562       24.379578   
std        600.775079       82.068623       13.910594       17.005007   
min      -1427.411597     -170.892724 