# Code to utilize the downloaded datasets with Python

First, run the download functions in order to process the data into the required format.

### Notes

Make sure you have enough working memory if you are injecting a large amount of data into a single dataframe. Furthermore, .ipynb kernels terminate rather easily, so larger amounts of data should be processed in .py environments.

## Paderborn


All bearing codes = [
        "K001",
        "K002",
        "K003",
        "K004",
        "K005",
        "K006",
        "KA01",
        "KA03",
        "KA04",
        "KA05",
        "KA06",
        "KA07",
        "KA08",
        "KA09",
        "KA15",
        "KA16",
        "KA22",
        "KA30",
        "KB23",
        "KB24",
        "KB27",
        "KI01",
        "KI03",
        "KI04",
        "KI05",
        "KI07",
        "KI08",
        "KI14",
        "KI16",
        "KI17",
        "KI18",
        "KI21"
    ]




Consult the original publication for precise fault information for each setting and brg_code: https://mb.uni-paderborn.de/fileadmin-mb/kat/PDF/Veroeffentlichungen/20160703_PHME16_CM_bearing.pdf 

The 'setting' column is essentially the name of the file the data was extracted from. It can be used to differentiate measurements. 

In [19]:
import pandas as pd
from pathlib import Path

def load_paderborn(args, base_dir="data/Paderborn"):
    '''
    Searches the four chunk files generated by running paderborn.py twice for all data with the given brg_code and setting.
    Parameters:
        args (dict): Dictionary where keys are column names and values are lists of accepted values. Supported keys: 'brg_code', 'setting'
        base_dir (str): Directory containing Paderborn parquet chunk files.

    Returns: 
        Pandas dataframe with found data

    '''
    if not args or not isinstance(args, dict):
        raise ValueError("args must be a non-empty dictionary with column filters.")
    base_path = Path(base_dir)
    result_dfs = []
    
    for chunk_dir in sorted(base_path.glob("*_chunk_*")):
        print(f"Scanning {chunk_dir}")
        try:
            df = pd.read_parquet(chunk_dir)
            print({col: df[col].unique() for col in df.columns}) 
            condition = pd.Series([True] * len(df))
            
            for col, valid_values in args.items():
                if col in df.columns:
                    condition &= df[col].isin(valid_values)
                else:
                    print(f"Warning: Column '{col}' not found in data. Ignoring this filter.")
            
            filtered = df[condition]
            print(filtered)
            if not filtered.empty:
                result_dfs.append(filtered)
        except Exception as e:
            print(f"Skipping {chunk_dir}: {e}")

    if result_dfs:
        return pd.concat(result_dfs, ignore_index=True)
    else:
        print(f"No data found")
        return pd.DataFrame()
filters={ 
        "brg_code":["KI05"],
        "setting":["N15_M01_F10_KI05_20"]
    }

df = load_paderborn(filters)
print(df.shape)
print(df.head())

Scanning data/Paderborn/Paderborn_chunk_1.parquet


In [10]:
print(df.shape)
print(df.head())

(20298269, 10)
       time        force  phase_current_1  phase_current_2        speed  \
0  0.000000  1105.180273         1.200547        -0.732864  1499.793664   
1  0.000015  1129.594335         1.296976        -0.811385  1499.793981   
2  0.000031  1115.556249         1.128224        -0.843758  1499.794297   
3  0.000047  1103.349218         1.234297        -0.998045  1499.794613   
4  0.000062  1125.932226         1.215700        -0.675695  1499.794930   

   temp_2_bearing_module    torque  vibration_1 brg_code              setting  
0              43.389893  0.612224    -0.088501     KI05  N15_M01_F10_KI05_20  
1              43.496704  0.591323    -0.094604     KI05  N15_M01_F10_KI05_20  
2              43.572998  0.563421    -0.079346     KI05  N15_M01_F10_KI05_20  
3              43.597412  0.569828     0.494385     KI05  N15_M01_F10_KI05_20  
4              43.545532  0.588976     0.064087     KI05  N15_M01_F10_KI05_20  


### NLN-EMP

m_type: Electric, Vibration


In [6]:
import pandas as pd
from pathlib import Path
def load_nln_emp(args: dict, base_dir="data/NLN-EMP"):
    '''
    Searches the chunk files generated by running nln_emp.py for all data matching given filter conditions.

    Parameters:
        args (dict): Dictionary where keys are column names and values are lists of accepted values.
                     Supported keys: 'm_type', 'n_poles', 'class', 'severity', 'speed', 
        base_dir (str): Directory containing NLN-EMP parquet chunk files.

    Returns:
        pd.DataFrame: Concatenated dataframe containing the filtered data across all chunk files.
    '''
    if not args or not isinstance(args, dict):
        raise ValueError("args must be a non-empty dictionary with column filters.")

    base_path = Path(base_dir)
    result_dfs = []
    print(base_path.glob("*_chunk_*"))
    for chunk_file in sorted(base_path.glob("*_chunk_*")):
        print(f"Scanning {chunk_file}")
        try:
            df = pd.read_parquet(chunk_file)
            df = df.copy()  # avoid SettingWithCopyWarning
            for col in df.select_dtypes(include="object").columns:
                df[col] = df[col].str.strip()
            print({col: df[col].unique() for col in args.keys() if col in df.columns})
            #print(df.head(3))
            condition = pd.Series([True] * len(df))
            
            for col, valid_values in args.items():
                if col in df.columns:
                    condition &= df[col].isin(valid_values)
                else:
                    print(f"Warning: Column '{col}' not found in data. Ignoring this filter.")
            
            filtered = df[condition]
            #print(filtered)
            if not filtered.empty:
                result_dfs.append(filtered)
        except Exception as e:
            print(f"Skipping {chunk_file}: {e}")

    if result_dfs:
        return pd.concat(result_dfs, ignore_index=True)
    else:
        print("No data found")
        return pd.DataFrame()
filters = {
    "n_poles": [2],
    "m_type":["Electric"],
    "class": ["healthy noise"],
    "channel":[1],
}

df = load_nln_emp(filters)
#print(df.shape)
print(df.head())

<generator object Path.glob at 0x7fcdd0a313c0>
Scanning data/NLN-EMP/NLN_EMP_chunk_1.parquet
{'n_poles': array([4, 2]), 'm_type': array(['Vibration'], dtype=object), 'class': array(['align parallel', 'cavitation suction', 'align combination',
       'healthy noise', 'coupling', 'align angular', 'unbalance pump',
       'unbalance motor', 'healthy', 'cavitation discharge',
       'coupling 2D', 'bent shaft', 'broken rotor bar', 'impeller',
       'soft foot', 'bearing pump', 'bearing contaminated', 'new motor',
       'bearing bpfo', 'stator short', 'bearing bpfi', 'bearing bsf',
       'loose foot pump', 'loose foot motor'], dtype=object), 'channel': array([1, 2, 3, 4, 5])}
Scanning data/NLN-EMP/NLN_EMP_chunk_2.parquet
{'n_poles': array([2, 4]), 'm_type': array(['Vibration', 'Electric'], dtype=object), 'class': array(['stator short', 'bearing bpfo', 'healthy', 'bearing bpfi',
       'broken rotor bar', 'impeller', 'soft foot', 'bearing pump',
       'healthy noise', 'bearing contaminat

### ASD

In [9]:
import pandas as pd
from pathlib import Path

def load_ASD(args: dict, base_dir="data/ASD"):
    '''
    Loads filtered data from ASD_download.parquet file.

    Parameters: 
        args (dict): Dictionary with filter conditions. Supported keys: 'class', 'rpm'.
        base_dir (str): Directory containing ASD files.

    Returns:
        pd.DataFrame: Filtered concatenated data.
    '''
    if not args or not isinstance(args, dict):
        raise ValueError("args must be a non-empty dictionary with filter values.")

    base_path = Path(base_dir)
    result_dfs = []

    file = next(base_path.glob("*.parquet"), None)

    if file is None:
        raise FileNotFoundError(f"No .parquet file found in {base_dir}")
    print(f"Scanning {file}")
    try:
        df = pd.read_parquet(file)
        print({col: df[col].unique() for col in args.keys() if col in df.columns})
        
        for col in df.select_dtypes(include="object").columns:
            df[col] = df[col].str.strip()

        condition = pd.Series([True] * len(df))
        for col, values in args.items():
            if col in df.columns:
                condition &= df[col].isin(values)
            else:
                print(f"Warning: Column '{col}' not found in chunk, skipping filter.")

        filtered = df[condition]
        if not filtered.empty:
            result_dfs.append(filtered)

    except Exception as e:
        print(f"Skipping {file}: {e}")

    if result_dfs:
        return pd.concat(result_dfs, ignore_index=True)
    else:
        print("No data found")
        return pd.DataFrame()
    
filters = {
    "class": ["1"],
    "rpm": ["1000"]
}

df = load_ASD(filters)
print(df.shape)
print(df.head(3))

Scanning data/ASD/ASD_downloaded.parquet
{'class': <StringArray>
['1', '3', '2', '6', '7', '5', '4', '0', '9', '8']
Length: 10, dtype: string, 'rpm': <StringArray>
['1000', '750', '1500', '500', '250', '1250']
Length: 6, dtype: string}
(451800, 14)
       time    enc1_ang    enc2_ang   enc3_ang   enc4_ang     enc5_ang  \
0  0.000000  13961988.0  13961988.0  4653996.0 -4653996.0  1163498.875   
1  0.000332  13961990.0  13961990.0  4653996.5 -4653996.5  1163499.125   
2  0.000664  13961992.0  13961992.0  4653997.0 -4653997.0  1163499.250   

       acc1      acc2      acc3      acc4     Torq1     Torq2 class   rpm  
0  1.837953  0.695454  1.873710  1.873710  3.973999  0.942261     1  1000  
1 -0.786764 -6.515445 -4.137097 -4.137097  3.989563  0.722656     1  1000  
2 -1.084027  0.072936 -0.497809 -0.497809  3.995972  1.012451     1  1000  


### AGFD

In [4]:
import pandas as pd
from pathlib import Path

def load_AGFD(args: dict, base_dir="data/AGFD"):
    '''
    Loads filtered data from AGFD_download.parquet file.

    Parameters: 
        args (dict): Dictionary with filter conditions. Supported keys: 'class', 'rpm','severity','rpm','torque','installation','healthy_GP.
        base_dir (str): Directory containing AGFD parquet file.

    Returns:
        pd.DataFrame: Filtered concatenated data.
    '''
    if not args or not isinstance(args, dict):
        raise ValueError("args must be a non-empty dictionary with filter values.")

    base_path = Path(base_dir)
    result_dfs = []

    print(base_path.glob("*_chunk_*"))
    for chunk_file in sorted(base_path.glob("*_chunk_*")):
        print(f"Scanning {chunk_file}")
        try:
            df = pd.read_parquet(chunk_file)
            df = df.copy()  
            for col in df.select_dtypes(include="object").columns:
                df[col] = df[col].str.strip()
            print({col: df[col].unique() for col in args.keys() if col in df.columns}) # Check all possible parameters
            
            condition = pd.Series([True] * len(df))
            
            for col, valid_values in args.items():
                if col in df.columns:
                    condition &= df[col].isin(valid_values)
                else:
                    print(f"Warning: Column '{col}' not found in data. Ignoring this filter.")
            
            filtered = df[condition]
            
            if not filtered.empty:
                result_dfs.append(filtered)
        except Exception as e:
            print(f"Skipping {chunk_file}: {e}")

    if result_dfs:
        return pd.concat(result_dfs, ignore_index=True)
    else:
        print("No data found")
        return pd.DataFrame()
    
filters = {
    "class": ["healthy"],
    "rpm": [1000],
    "healthy_GP":[9],
    "torque":[6]


}

df = load_AGFD(filters)
print(df.shape)
print(df.head(3))

<generator object Path.glob at 0x7f7d19731cf0>
Scanning data/AGFD/AGFD_chunk_1.parquet
{'class': array(['healthy', 'crack', 'pitting', 'wear', 'micropitting'],
      dtype=object), 'rpm': array([1000,  750]), 'healthy_GP': array([9, 3, 2, 1, 5, 4, 6, 7, 0]), 'torque': array([ 6, 11,  1])}
Scanning data/AGFD/AGFD_chunk_2.parquet
{'class': array(['crack', 'micropitting', 'healthy', 'pitting', 'wear'],
      dtype=object), 'rpm': array([ 750, 1500,  500]), 'healthy_GP': array([0, 9, 3, 2, 1, 5, 4, 6, 7]), 'torque': array([ 1,  6, 11])}
Scanning data/AGFD/AGFD_chunk_3.parquet
{'class': array(['wear', 'micropitting', 'crack', 'healthy', 'pitting'],
      dtype=object), 'rpm': array([ 500,  250, 1250]), 'healthy_GP': array([0, 9, 3, 2, 1, 5, 4, 6, 7]), 'torque': array([ 1,  6, 11])}
(331326, 18)
       time  enc1_ang  enc2_ang  enc3_ang  enc4_ang  enc5_ang       acc1  \
0  0.000000     0.000     0.000     0.000    -0.000     0.000  14.651109   
1  0.000332     1.998     2.034     0.648     0

### CWRU

```
measurement location fault location fault type  fault depth  \
0                   DE             DE         OR           14   
1                   FE             DE         OR           14   
2                   BA             DE         OR           14   
3                   DE             FE          B           14   
4                   FE             FE          B           14   

  fault orientation  sampling rate  torque tags  \
0                 C             48       1   []   
1                 C             48       1   []   
2                 C             48       1   []   
3                 -             12       1   []   
4                 -             12       1   []   
```



In [8]:
import pandas as pd
from pathlib import Path

def load_CWRU(args: dict, base_dir="data/CWRU"):
    '''
    Loads filtered data from AGFD_download.parquet file.

    Parameters: 
        args (dict): Dictionary with filter conditions. Supported keys: 'class', 'rpm','severity','rpm','torque','installation','healthy_GP.
        base_dir (str): Directory containing AGFD parquet file.

    Returns:
        pd.DataFrame: Filtered concatenated data.
    '''
    if not args or not isinstance(args, dict):
        raise ValueError("args must be a non-empty dictionary with filter values.")

    base_path = Path(base_dir)
    result_dfs = []

    file = next(base_path.glob("*.parquet"), None)

    if file is None:
        raise FileNotFoundError(f"No .parquet file found in {base_dir}")
    print(f"Scanning {file}")
    try:
        df = pd.read_parquet(file)
        print({col: df[col].unique() for col in args.keys() if col in df.columns})
        
        for col in df.select_dtypes(include="object").columns:
            df[col] = df[col].str.strip()

        condition = pd.Series([True] * len(df))
        for col, values in args.items():
            if col in df.columns:
                condition &= df[col].isin(values)
            else:
                print(f"Warning: Column '{col}' not found in chunk, skipping filter.")

        filtered = df[condition]
        if not filtered.empty:
            result_dfs.append(filtered)

    except Exception as e:
        print(f"Skipping {file}: {e}")

    if result_dfs:
        return pd.concat(result_dfs, ignore_index=True)
    else:
        print("No data found")
        return pd.DataFrame()
    
filters = {
    "fault type": ["OR"],
    "fault depth":[14],
    "torque":[1]


}

df = load_CWRU(filters)
print(df.shape)
print(df.head(3))

Scanning data/CWRU/CWRU_downloaded.parquet
{'fault type': <StringArray>
['OR', 'B', 'IR', 'normal']
Length: 4, dtype: string, 'fault depth': array([14,  7, 21, 28,  0]), 'torque': array([1, 0, 3, 2])}
(1698878, 10)
         measurement_id  sample_index  measurement measurement location  \
0  48k_DE_OR-C_014_1_DE             0     0.133305                   DE   
1  48k_DE_OR-C_014_1_DE             1     0.152498                   DE   
2  48k_DE_OR-C_014_1_DE             2     0.166475                   DE   

  fault location fault type  fault depth fault orientation  sampling rate  \
0             DE         OR           14                 C             48   
1             DE         OR           14                 C             48   
2             DE         OR           14                 C             48   

   torque  
0       1  
1       1  
2       1  
