In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import os
from collections import Counter
from imblearn.over_sampling import SMOTE
from datetime import datetime
from sklearn.decomposition import PCA

In [12]:
path = "Data/train"
os.listdir(path)
df1 = pd.DataFrame()
for file in os.listdir(path):
    if file.endswith('.csv'):
        df = pd.read_csv(path + "/"+ file)

        df.drop(['Filament'], axis=1, inplace=True) # Drop Filament column
        df.drop(['Emission'], axis=1, inplace=True) # Drop Emission column
        df.drop(['Temp'], axis=1, inplace=True) # Drop Temp column
        df.drop(['Test ID'], axis=1, inplace=True) # Drop Test ID column

        df['502_to_69_ratio'] = df['502 m/z'] / df['69 m/z'] # Create new column with ratio of 502 m/z to 69 m/z

        ############################################
        #clean the datetime format
        def clean_date(date):
            if 'UTC' in date:
                date = date.split('(')[0]
                date = date.split("  ")[0]
            return date

        def convert_to_datetime(date):
            date_string = date
            date_format = '%d/%m/%Y %I:%M %p'

            datetime_obj = datetime.strptime(date_string, date_format)
            formatted_datetime = datetime_obj.strftime('%Y-%m-%d %H:%M:%S')
            return formatted_datetime

        df['Tune Date'] = df['Tune Date'].apply(lambda x: clean_date(x))
        if 'UTC' in df['Tune Date'][0]:
            df['Tune Date'] = df['Tune Date'].apply(lambda x: convert_to_datetime(x))
            
        df['Tune Date'] = pd.to_datetime(df['Tune Date'])
        df['total_hour_diff'] = df['Tune Date'].diff().apply(lambda x: x.total_seconds()/3600) # new column with total hour difference

        df.drop(['Tune Date'], axis=1, inplace=True) # Drop Tune Date column
        ############################################

        # Perform PCA on the highly correlated columns
        pca = PCA(n_components=1)
        components = pca.fit_transform(df[['69 m/z', '70 m/z']])
        # print(pca.explained_variance_ratio_)
        df['PCA_Component 69 m/z'] = components

        pca = PCA(n_components=1)
        components = pca.fit_transform(df[['219 m/z', '220 m/z']])
        # print(pca.explained_variance_ratio_)
        df['PCA_Component 219 m/z'] = components

        pca = PCA(n_components=1)
        components = pca.fit_transform(df[['502 m/z', '503 m/z']])
        # print(pca.explained_variance_ratio_)
        df['PCA_Component 502 m/z'] = components

        # add columns for pct change
        def precent_change(col_name , periods):
            df[col_name + " pct change " + str(periods) + 'days'] = df[col_name].pct_change(periods=periods) * 100
            return df[col_name + " pct change " + str(periods) + 'days']

        for i in range (5):
            precent_change('EM Volts', i+1)
        for i in range (5):
            precent_change('502_to_69_ratio', i+1)

        df1 = pd.concat([df1,df])

df1.fillna(0, inplace=True) # fill all NaN values with 0

# # Perform one-hot encoding
# one_hot_encoded = pd.get_dummies(df1['System'], prefix='System')
# # Concatenate the one-hot encoded DataFrame with the original DataFrame
# df1 = pd.concat([df1, one_hot_encoded], axis=1)
#drop System column
df1.drop(['System'], axis=1, inplace=True)
df1.drop(['ID'], axis=1, inplace=True)
df1

Unnamed: 0,69 m/z,70 m/z,219 m/z,220 m/z,502 m/z,503 m/z,Amu gain,Amu offset,EM Volts,Ent Lens,...,EM Volts pct change 1days,EM Volts pct change 2days,EM Volts pct change 3days,EM Volts pct change 4days,EM Volts pct change 5days,502_to_69_ratio pct change 1days,502_to_69_ratio pct change 2days,502_to_69_ratio pct change 3days,502_to_69_ratio pct change 4days,502_to_69_ratio pct change 5days
0,389888,4815,352320,15620,24096,2770,2503,127.000,1718,7.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,322880,3532,359616,15261,27592,2709,2496,126.000,1812,12.00,...,5.471478,0.000000,0.000000,0.000000,0.000000,38.272862,0.000000,0.000000,0.000000,0.000000
2,316608,3490,379840,16608,29992,3163,2491,126.000,1718,12.00,...,-5.187638,0.000000,0.000000,0.000000,0.000000,10.851483,53.277517,0.000000,0.000000,0.000000
3,296128,3154,368064,15829,28832,2954,2491,125.000,1765,12.00,...,2.735739,-2.593819,2.735739,0.000000,0.000000,2.780743,13.933977,57.539771,0.000000,0.000000
4,300544,2958,375296,16720,29136,2995,2490,125.000,1812,12.00,...,2.662890,5.471478,0.000000,5.471478,0.000000,-0.430444,2.338329,13.443555,56.861651,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,335808,3349,290176,13034,29240,2957,1890,117.875,1568,20.18,...,0.000000,0.000000,0.000000,0.000000,0.000000,-2.039631,-1.242635,-0.026647,3.032114,1.039499
69,363776,4016,314688,13293,29592,3278,1891,117.813,1568,20.18,...,0.000000,0.000000,0.000000,0.000000,0.000000,-6.576971,-8.482456,-7.737878,-6.601865,-3.744278
70,353280,4184,311808,12936,31808,3364,1890,118.063,1568,22.67,...,0.000000,0.000000,0.000000,0.000000,0.000000,10.682010,3.402486,1.293457,2.117571,3.374933
71,492288,5296,423104,18760,44760,4126,1889,117.875,1615,20.18,...,2.997449,2.997449,2.997449,2.997449,2.997449,0.984220,11.771365,4.420195,2.290407,3.122633


In [13]:
path = "Data/validation"
val = pd.DataFrame()
for file in os.listdir(path):
    if file.endswith('.csv'):
        df = pd.DataFrame()
        df = pd.read_csv(path + "/"+ file)

        df.drop(['Filament'], axis=1, inplace=True) # Drop Filament column
        df.drop(['Emission'], axis=1, inplace=True) # Drop Emission column
        df.drop(['Temp'], axis=1, inplace=True) # Drop Temp column
        df.drop(['Test ID'], axis=1, inplace=True) # Drop Test ID column

        df['502_to_69_ratio'] = df['502 m/z'] / df['69 m/z'] # Create new column with ratio of 502 m/z to 69 m/z

        ############################################
        #clean the datetime format
        def clean_date(date):
            if 'UTC' in date:
                date = date.split('(')[0]
                date = date.split("  ")[0]
            return date

        def convert_to_datetime(date):
            date_string = date
            date_format = '%d/%m/%Y %I:%M %p'

            datetime_obj = datetime.strptime(date_string, date_format)
            formatted_datetime = datetime_obj.strftime('%Y-%m-%d %H:%M:%S')
            return formatted_datetime

        df['Tune Date'] = df['Tune Date'].apply(lambda x: clean_date(x))
        if 'UTC' in df['Tune Date'][0]:
            df['Tune Date'] = df['Tune Date'].apply(lambda x: convert_to_datetime(x))
            
        df['Tune Date'] = pd.to_datetime(df['Tune Date'])
        df['total_hour_diff'] = df['Tune Date'].diff().apply(lambda x: x.total_seconds()/3600) # new column with total hour difference

        df.drop(['Tune Date'], axis=1, inplace=True) # Drop Tune Date column
        ############################################

        # Perform PCA on the highly correlated columns
        pca = PCA(n_components=1)
        components = pca.fit_transform(df[['69 m/z', '70 m/z']])
        print(pca.explained_variance_ratio_)
        df['PCA_Component 69 m/z'] = components

        pca = PCA(n_components=1)
        components = pca.fit_transform(df[['219 m/z', '220 m/z']])
        print(pca.explained_variance_ratio_)
        df['PCA_Component 219 m/z'] = components

        pca = PCA(n_components=1)
        components = pca.fit_transform(df[['502 m/z', '503 m/z']])
        print(pca.explained_variance_ratio_)
        df['PCA_Component 502 m/z'] = components


        # add columns for pct change
        def precent_change(col_name , periods):
            df[col_name + " pct change " + str(periods) + 'days'] = df[col_name].pct_change(periods=periods) * 100
            return df[col_name + " pct change " + str(periods) + 'days']

        for i in range (5):
            precent_change('EM Volts', i+1)
        for i in range (5):
            precent_change('502_to_69_ratio', i+1)

        val = pd.concat([val,df])
val.fillna(0, inplace=True) # fill all NaN values with 0
val

# Perform one-hot encoding
# one_hot_encoded = pd.get_dummies(val['System'], prefix='System')
# # Concatenate the one-hot encoded DataFrame with the original DataFrame
# val = pd.concat([val, one_hot_encoded], axis=1)
#drop System column
val.drop(['System'], axis=1, inplace=True)
val.drop(['ID'], axis=1, inplace=True)
val

[0.99998532]
[0.99989338]
[0.99922642]
[0.99999016]
[0.99993275]
[0.9994708]
[0.99996794]
[0.99986354]
[0.9972351]


Unnamed: 0,69 m/z,70 m/z,219 m/z,220 m/z,502 m/z,503 m/z,Amu gain,Amu offset,EM Volts,Ent Lens,...,EM Volts pct change 1days,EM Volts pct change 2days,EM Volts pct change 3days,EM Volts pct change 4days,EM Volts pct change 5days,502_to_69_ratio pct change 1days,502_to_69_ratio pct change 2days,502_to_69_ratio pct change 3days,502_to_69_ratio pct change 4days,502_to_69_ratio pct change 5days
0,335488,3872,436800,19576,39472,4181,2494,127.000,1812,12.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,401152,4621,534400,23064,47256,4663,2492,127.000,1859,12.00,...,2.593819,0.000000,0.000000,0.000000,0.000000,0.123461,0.000000,0.000000,0.000000,0.000000
2,299008,3565,460928,20112,54344,5414,2492,126.000,1718,14.50,...,-7.584723,-5.187638,0.000000,0.000000,0.000000,54.283967,54.474448,0.000000,0.000000,0.000000
3,298496,3332,430400,18936,53944,5389,2493,126.000,1812,16.50,...,5.471478,-2.528241,0.000000,0.000000,0.000000,-0.565788,53.411047,53.600450,0.000000,0.000000
4,309248,3621,401408,18520,52232,5294,2492,125.000,1812,12.00,...,0.000000,5.471478,-2.528241,0.000000,0.000000,-6.540140,-7.068925,43.377750,43.554766,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,487104,5374,507584,21432,58024,5526,1895,116.563,1568,22.67,...,3.090072,3.090072,3.090072,3.090072,3.090072,-0.313327,8.059909,15.859816,4.212396,-3.339434
80,351232,4015,358144,14609,38776,4284,1894,116.750,1521,22.67,...,-2.997449,0.000000,0.000000,0.000000,0.000000,-7.320654,-7.611043,0.149218,7.378120,-3.416632
81,340544,3444,343488,16313,38920,3576,1894,116.438,1521,22.67,...,0.000000,-2.997449,0.000000,0.000000,0.000000,3.521527,-4.056926,-4.357541,3.675999,11.159470
82,447808,4965,433344,19664,49248,4555,1891,116.813,1568,22.67,...,3.090072,3.090072,0.000000,3.090072,3.090072,-3.772955,-0.384293,-7.676814,-7.966088,-0.235649


In [14]:
X_train = df1.drop(['Condition'], axis=1)
y_train = df1['Condition']
X_val = val.drop(['Condition'], axis=1)
y_val = val['Condition']


In [15]:
X_val.shape, X_train.shape

((179, 29), (525, 29))

In [16]:
X_train.columns

Index(['69 m/z', '70 m/z', '219 m/z', '220 m/z', '502 m/z', '503 m/z',
       'Amu gain', 'Amu offset', 'EM Volts', 'Ent Lens', 'Ion Focus',
       'Repeller', 'Ent Lens offset', 'Width219', '502_to_69_ratio',
       'total_hour_diff', 'PCA_Component 69 m/z', 'PCA_Component 219 m/z',
       'PCA_Component 502 m/z', 'EM Volts pct change 1days',
       'EM Volts pct change 2days', 'EM Volts pct change 3days',
       'EM Volts pct change 4days', 'EM Volts pct change 5days',
       '502_to_69_ratio pct change 1days', '502_to_69_ratio pct change 2days',
       '502_to_69_ratio pct change 3days', '502_to_69_ratio pct change 4days',
       '502_to_69_ratio pct change 5days'],
      dtype='object')

In [17]:
X_val.columns

Index(['69 m/z', '70 m/z', '219 m/z', '220 m/z', '502 m/z', '503 m/z',
       'Amu gain', 'Amu offset', 'EM Volts', 'Ent Lens', 'Ion Focus',
       'Repeller', 'Ent Lens offset', 'Width219', '502_to_69_ratio',
       'total_hour_diff', 'PCA_Component 69 m/z', 'PCA_Component 219 m/z',
       'PCA_Component 502 m/z', 'EM Volts pct change 1days',
       'EM Volts pct change 2days', 'EM Volts pct change 3days',
       'EM Volts pct change 4days', 'EM Volts pct change 5days',
       '502_to_69_ratio pct change 1days', '502_to_69_ratio pct change 2days',
       '502_to_69_ratio pct change 3days', '502_to_69_ratio pct change 4days',
       '502_to_69_ratio pct change 5days'],
      dtype='object')

In [18]:
X_train.columns

Index(['69 m/z', '70 m/z', '219 m/z', '220 m/z', '502 m/z', '503 m/z',
       'Amu gain', 'Amu offset', 'EM Volts', 'Ent Lens', 'Ion Focus',
       'Repeller', 'Ent Lens offset', 'Width219', '502_to_69_ratio',
       'total_hour_diff', 'PCA_Component 69 m/z', 'PCA_Component 219 m/z',
       'PCA_Component 502 m/z', 'EM Volts pct change 1days',
       'EM Volts pct change 2days', 'EM Volts pct change 3days',
       'EM Volts pct change 4days', 'EM Volts pct change 5days',
       '502_to_69_ratio pct change 1days', '502_to_69_ratio pct change 2days',
       '502_to_69_ratio pct change 3days', '502_to_69_ratio pct change 4days',
       '502_to_69_ratio pct change 5days'],
      dtype='object')

In [19]:
# apply standard scalar
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


In [10]:
# apply over sampling
from imblearn.over_sampling import SMOTE
from collections import Counter

sm = SMOTE(random_state=42)
print('Original dataset shape %s' % Counter(y_train))
X_train, y_train = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train))


Original dataset shape Counter({0: 507, 1: 18})
Resampled dataset shape Counter({0: 507, 1: 507})


In [21]:
# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("precision_score:", precision_score(y_val, y_pred))
print("recall_score:", recall_score(y_val, y_pred))
print("f1_score:", f1_score(y_val, y_pred))
print('roc_auc_score', roc_auc_score(y_val, y_pred))


Accuracy: 0.9664804469273743
precision_score: 0.0
recall_score: 0.0
f1_score: 0.0
roc_auc_score 0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# svc
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("precision_score:", precision_score(y_val, y_pred))
print("recall_score:", recall_score(y_val, y_pred))
print("f1_score:", f1_score(y_val, y_pred))
print('roc_auc_score', roc_auc_score(y_val, y_pred))


Accuracy: 0.9664804469273743
precision_score: 0.0
recall_score: 0.0
f1_score: 0.0
roc_auc_score 0.5


  _warn_prf(average, modifier, msg_start, len(result))
