In [None]:
!pip uninstall -y transformers
!pip install transformers==4.44.0 #for compatibility with the base model

In [None]:
import numpy as np
import pandas as pd
import os
import requests

import zipfile
import glob
from scipy.io import loadmat # To read .mat files

import scipy.io
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
from urllib3.exceptions import InsecureRequestWarning

import accelerate
import torch
import transformers
from datasets import load_dataset, Dataset, load_from_disk, concatenate_datasets, Features, Value, Sequence, ClassLabel
import datasets

from torch.utils.tensorboard import SummaryWriter
from transformers.integrations import TensorBoardCallback

from google.colab import files
from google.colab import drive

from torch import nn
from torch.nn import functional as F
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
from torch.nn.modules.loss import CrossEntropyLoss

# Others
from IPython.core.debugger import set_trace
from pathlib import Path

# Dataset preprocessing

In [None]:
# Mount Google Drive at /content/drive; the dataset archive is read from it
# and the training outputs are written to it by later cells.
drive.mount('/content/drive')

In [None]:
# Source archive on the mounted Drive and the folder it is unpacked into.
# NOTE(review): extract_dir is relative while later cells read from
# '/content/extracted_mat_files/...'; this presumably relies on the working
# directory being /content - confirm.
extract_dir = 'extracted_mat_files/'
zip_path = '/content/drive/MyDrive/CWRUDataset.zip'

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)


In [None]:
# List the four data folders to check that the archive was unpacked where the
# loading cells below expect it.
!ls '/content/extracted_mat_files/CWRU Dataset/Data/12k_FE'
!ls '/content/extracted_mat_files/CWRU Dataset/Data/48k_DE'
!ls '/content/extracted_mat_files/CWRU Dataset/Data/12k_DE'
!ls '/content/extracted_mat_files/CWRU Dataset/Data/Normal'



In [None]:
# Load a single file and display the dict returned by loadmat, to see which
# variable names it contains.
mat_data = loadmat('/content/extracted_mat_files/CWRU Dataset/Data/48k_DE/B007_0.mat')
mat_data

In [None]:
def extract_data_from_mat(file_path, key=None):
    """Read one signal from a .mat file and derive its label from the file name.

    Args:
        file_path: Path of the .mat file. Its base name must start with
            'B', 'IR', 'OR' or 'Normal'.
        key: Name of the variable to read. When None, the first variable that
            is not loadmat metadata (names starting with "__") is used.

    Returns:
        (data, label): ``data`` is a column array of shape (n, 1) - the array
        itself when it is 1-D, otherwise its first column; ``label`` is one of
        'Ball', 'Inner race', 'Outer race', 'Normal'.

    Raises:
        ValueError: if the file holds no data variable, or the file name does
            not start with a known prefix (previously this surfaced as an
            unrelated UnboundLocalError at the return statement).
    """
    mat_data = loadmat(file_path)
    # Remove metadata keys that start with "__"
    mat_keys = [k for k in mat_data.keys() if not k.startswith("__")]

    if key is None:
        if not mat_keys:
            raise ValueError(f"No data variable found in {file_path!r}")
        # NOTE(review): the first variable is taken as-is; confirm it is the
        # channel you want when a file stores several signals.
        key = mat_keys[0]

    data = mat_data[key]
    if data.ndim > 1:
        # Keep only the first column of a multi-dimensional array.
        data = data[:, 0]
    data = data.reshape(-1, 1)

    # Prefixes are tested in this order; the label comes from the first match.
    prefix_to_label = (
        ('B', 'Ball'),
        ('IR', 'Inner race'),
        ('OR', 'Outer race'),
        ('Normal', 'Normal'),
    )
    file_name = os.path.basename(file_path)
    for prefix, label in prefix_to_label:
        if file_name.startswith(prefix):
            return data, label

    raise ValueError(
        f"Cannot infer a label from file name {file_name!r}: expected it to "
        "start with 'B', 'IR', 'OR' or 'Normal'"
    )


In [None]:
def load_folder(folder_path, key=None):
    """Load every .mat file of a folder.

    Args:
        folder_path: Folder to scan (a trailing path separator is allowed).
        key: Forwarded to extract_data_from_mat for each file.

    Returns:
        (all_data, labels, freq): the per-file data and labels, in sorted
        file-name order, and ``freq`` which is 12 when the folder's base name
        starts with '12' and 48 otherwise (presumably kHz - confirm).
    """
    # normpath drops a trailing separator. Without it, basename('.../12k_DE/')
    # is '' and every folder would be reported as 48.
    folder_name = os.path.basename(os.path.normpath(folder_path))
    freq = 12 if folder_name.startswith('12') else 48

    all_data = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and everything derived from it) reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".mat"):
            continue
        file_path = os.path.join(folder_path, filename)
        data, label = extract_data_from_mat(file_path, key=key)
        all_data.append(data)
        labels.append(label)

    return all_data, labels, freq


In [None]:
# Folders to load, in order.
file_paths = [
    '/content/extracted_mat_files/CWRU Dataset/Data/48k_DE/',
    '/content/extracted_mat_files/CWRU Dataset/Data/12k_DE/',
    '/content/extracted_mat_files/CWRU Dataset/Data/12k_FE/',
    '/content/extracted_mat_files/CWRU Dataset/Data/Normal/',
]

# Three parallel lists with one entry per loaded file.
all_data = []
all_labels = []
freq = []
for folder in file_paths:
    data, labels, frequence = load_folder(folder)
    all_data.extend(data)
    all_labels.extend(labels)
    # One frequency entry per file of THIS folder. Using the cumulative
    # len(all_labels) here made freq longer than all_labels and shifted the
    # frequencies of every folder after the first.
    freq.extend([frequence] * len(labels))

assert len(all_data) == len(all_labels) == len(freq)


In [None]:
# Show the first five loaded signals and the number of labels (one per file).
print(all_data[0:5], len(all_labels))

Divide each signal into overlapping windows of 2048 points (step of 512 points)

In [None]:
max_length = 2048  # number of samples in one window
step = 512         # offset between the starts of consecutive windows

# Parallel lists: one window, one label and one frequency per entry.
input_chunk = []
target_chunk = []
final_freq = []
for signal, signal_label, signal_freq in zip(all_data, all_labels, freq):
    # "+ 1" keeps the last full window when (len(signal) - max_length) is an
    # exact multiple of step; the previous stop value silently dropped it.
    # Signals shorter than max_length yield an empty range, i.e. no window.
    for start in range(0, len(signal) - max_length + 1, step):
        input_chunk.append(signal[start:start + max_length])
        target_chunk.append(signal_label)
        final_freq.append(signal_freq)

In [None]:
# Length of the first window.
print(len(input_chunk[0]))

In [None]:
# Number of windows (target_chunk holds one label per window).
print(len(target_chunk))

In [None]:
# Convert the window and frequency lists to NumPy arrays.
# NOTE(review): each window is a slice of the (n, 1) column returned by
# extract_data_from_mat, so input_chunk presumably has shape
# (n_windows, 2048, 1) - confirm before feeding it to 1-D signal code.
input_chunk=np.array(input_chunk)
final_freq=np.array(final_freq)

In [None]:
def calculate_features(x, fs):
    """Compute 24 statistical features of one signal window.

    p1-p12 are computed from the samples, p13-p24 from the magnitude of the
    positive-frequency half of the FFT.

    Args:
        x: Signal samples. Any shape is accepted: the samples are flattened to
            1-D first, so an (N, 1) column gives the same result as a
            length-N vector.
        fs: Sampling frequency. The frequency axis, and therefore the features
            built from it, is expressed in the same unit as ``fs``.

    Returns:
        list: [p1, ..., p24].
    """
    # Flatten first. np.fft.fft transforms along the LAST axis, so an (N, 1)
    # column would be treated as N signals of length 1 (returned unchanged,
    # i.e. no spectrum at all), and `f * s` below would broadcast a (K,)
    # vector against a (K, 1) column into a (K, K) matrix.
    x = np.asarray(x, dtype=float).ravel()

    N = len(x)
    # Fourier transform
    fft_vals = np.fft.fft(x)
    fft_freq = np.fft.fftfreq(N, 1/fs)
    # Use only the positive frequency components since symmetric
    K = N // 2
    s = np.abs(fft_vals[:K])
    f = fft_freq[:K]

    # Time-Domain Features
    p1 = np.mean(x)
    p2 = np.std(x, ddof=1) # ddof=1 for sample standard deviation
    p3 = (np.mean(np.sqrt(np.abs(x))))**2
    p4 = np.mean(np.abs(x))
    p5 = np.max(np.abs(x))
    p6 = np.mean(x**3)
    p7 = np.mean(x**4)
    p8 = np.mean(x**2)
    # NOTE(review): flagged "WEIRD" by the author - divides the 4th raw moment
    # by |3rd raw moment|, which can be (near) zero; confirm the intended formula.
    p9 = p7 / np.abs(p6)
    p10 = p5 / p2
    p11 = p2 / p4
    p12 = p5 / p4

    # Frequency-Domain Features
    p13 = np.mean(s)
    p14 = np.var(s, ddof=1)
    p15 = (1 / (K * p14**(3/2))) * np.sum((s - p13)**3)
    p16 = (1 / (K * p14**2)) * np.sum((s - p13)**4)
    p17 = np.sum(f * s) / np.sum(s)
    # NOTE(review): operator precedence makes this (sum / K) * np.sum(s), not
    # sum / (K * np.sum(s)); kept as written - confirm against the reference formula.
    p18 = np.sqrt(np.sum(((f - p17)**2) * s) / K*np.sum(s))
    p19 = np.sqrt(np.sum((f**2) * s) / np.sum(s))
    p20 = np.sqrt(np.sum(f**4 * s) / np.sum(f**2 * s))
    # NOTE(review): the square root holds a quotient, sum(s) / sum(f^4 s);
    # kept as written - confirm a product was not intended.
    p21 = np.sum(f**2 * s) / np.sqrt(np.sum(s) / np.sum(f**4 * s))
    p22 = p18 / p17
    p23 = np.sum((f - p17)**3 * s) / (K * p18**3)
    p24 = np.sum((f - p17)**4 * s) / (K * p18**4)

    feat=[p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18,p19,p20,p21,p22,p23,p24]
    return feat

In [None]:
chunk = 2500
# One feature list per window, each computed with the frequency stored at the
# same position of final_freq.
Features1 = [
    calculate_features(window, window_freq)
    for window, window_freq in zip(input_chunk, final_freq)
]
  #print(i)

In [None]:
# Number of feature lists (one per window).
len(Features1)

In [None]:
def prompting(features=None):
    """Turn feature lists into natural-language prompts.

    Args:
        features: Iterable of feature sequences holding at least 24 values
            each (only indices 0-23 are read). When omitted, the module-level
            ``Features1`` is used; it is looked up when the function is
            CALLED, so re-running the feature cell is picked up without
            re-defining this function.

    Returns:
        list[str]: one prompt per feature sequence - a fixed instruction
        followed by one sentence per feature.
    """
    if features is None:
        features = Features1

    intro = "You are a bearing fault diagnosis expert. Based on the following features, you need to conduct fault diagnosis:"

    # (feature name, domain) for indices 0..23. The wording is model input and
    # is kept exactly as it was.
    descriptions = (
        ('mean value', 'time'),
        ('standard deviation', 'time'),
        ('square root amplitude', 'time'),
        ('absolute mean value', 'time'),
        ('peak value', 'time'),
        ('skewness', 'time'),
        ('kurtosis', 'time'),
        ('variance', 'time'),
        ('kurtosis index', 'time'),
        ('peak index', 'time'),
        ('waveform index', 'time'),
        ('pulse index', 'time'),
        ('frequency mean value', 'frequency'),
        ('frequency variance', 'frequency'),
        ('frequency skewness', 'frequency'),
        ('frequency kurtosis', 'frequency'),
        ('gravity frequency', 'frequency'),
        ('frequency standard deviation', 'frequency'),
        ('frequency root mean square', 'frequency'),
        ('average frequency', 'frequency'),
        ('regularity degree', 'frequency'),
        ('variation parameter', 'frequency'),
        ('eighth-order moment', 'frequency'),
        ('sixteenth order moment', 'frequency'),
    )

    feat = []
    for feature in features:
        sentences = [
            f' The {name} of the vibration signal in the {domain} domain is {feature[idx]}.'
            for idx, (name, domain) in enumerate(descriptions)
        ]
        feat.append(intro + ''.join(sentences))
    return feat


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the per-window string labels, then map them to integer ids.
encoder = LabelEncoder().fit(target_chunk)
numeric_labels = encoder.transform(target_chunk)

print(f"Classes (mapping): {encoder.classes_}")

In [None]:
# Pass Features1 explicitly: the default argument of prompting() is bound when
# the function is defined, so a bare prompting() would keep using an outdated
# list if the feature cell is re-run without re-defining the function.
X = prompting(Features1)
y = numeric_labels

In [None]:
# The number of prompts and the number of labels should be equal.
print(len(X),len(y))

In [None]:
# Split prompts and labels into train and test sets. The labels are passed as
# `stratify` and random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% for testing
    random_state=42,
    stratify=y
)

In [None]:
# Release unused cached GPU memory held by PyTorch before the model is loaded.
import torch
torch.cuda.empty_cache()


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Base checkpoint and size of the classification output.
model_name = "THUDM/chatglm2-6b"
# NOTE(review): hard-coded; presumably meant to equal the number of classes
# found by the label encoder - confirm they agree.
num_classes = 4

# trust_remote_code=True lets the custom tokenizer/model code shipped with the
# checkpoint repository be executed.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    trust_remote_code=True
)


In [None]:
import torch
# This import rebinds the name `Dataset` (imported from `datasets` in the
# import cell) to torch.utils.data.Dataset for the rest of the notebook.
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Collator that pads the samples of a batch with the tokenizer; used by the
# DataLoaders and by the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per access.

    Args:
        features: Indexable collection of texts.
        labels: Indexable collection of integer labels, same length as
            ``features``.
        tokenizer: Callable tokenizer; defaults to the module-level tokenizer.
        max_length: Length every sample is truncated / padded to.
    """

    def __init__(self, features, labels, tokenizer=tokenizer, max_length=2048):
        assert len(features) == len(labels)
        self.features, self.labels = features, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it with its label as long tensors."""
        tokenized = self.tokenizer(
            self.features[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors=None,
        )
        item = {
            field: torch.tensor(tokenized[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Datasets built directly from the prompt strings of the train/test split.
# NOTE(review): the names train_dataset / test_dataset are assigned again in
# the next cell; the loaders below keep referring to the objects created here.
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

# Batch size of the two DataLoaders below.
batch_size = 32

# Training DataLoader
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,  # Shuffles data for every epoch
    drop_last=True,  # discard the final incomplete batch
    collate_fn=data_collator
)

# Testing DataLoader
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator
)


In [None]:
def feature_to_text(features):
    """Return a text representation of one sample.

    Args:
        features: Either a ready-made text (returned unchanged) or an iterable
            of values, which are converted with ``str`` and joined by spaces.

    Returns:
        str: the text for the sample.
    """
    if isinstance(features, str):
        # Iterating a str yields its characters, so joining them would insert
        # a space between every character of an already-built prompt.
        return features
    return " ".join(map(str, features))

# Run every sample of the split through feature_to_text, then build the
# datasets that are handed to the Trainer.
train_texts = list(map(feature_to_text, X_train))
test_texts = list(map(feature_to_text, X_test))

train_dataset = CustomDataset(train_texts, y_train)
test_dataset = CustomDataset(test_texts, y_test)


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA configuration for sequence classification; adapters are attached to the
# modules named "query_key_value".
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,  # Rank of the update matrices
    lora_alpha=32,  # scaling factor applied to the LoRA update
    target_modules=["query_key_value"],
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
)

# Wrap the base model with the LoRA adapters.
peft_model = get_peft_model(model, lora_config)

# NOTE(review): check in the output below that the classification head (named
# "classifier_head" in later cells) is among the trainable parameters; if it
# is not, it presumably needs to be listed in `modules_to_save` - confirm.
peft_model.print_trainable_parameters()

In [None]:
import torch.nn as nn


def _init_weights(m):
    """Re-draw the weight of an nn.Linear from N(0, 0.02) and zero its bias."""
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0.0, std=0.02)
    if m.bias is not None:
        nn.init.zeros_(m.bias)


#Reinitialization because there were NaN in the classifier head
for module_name, head_module in peft_model.named_modules():
    if "classifier_head" not in module_name:
        continue
    # apply() visits the module itself and all of its sub-modules.
    head_module.apply(_init_weights)
    print("Reinitialization complete.")

In [None]:
import torch

print("\nVerifying weights are now valid...")
# Report, for every parameter whose name contains "classifier_head", whether
# it still holds a NaN.
head_parameters = (
    (param_name, param)
    for param_name, param in peft_model.named_parameters()
    if "classifier_head" in param_name
)
for param_name, param in head_parameters:
    nan_found = torch.isnan(param).any().item()
    print(f"Parameter '{param_name}' has NaN: {nan_found}")

In [None]:
from transformers import TrainingArguments

# Training configuration. With a per-device batch of 1 and 8 accumulation
# steps, one optimizer step covers 8 samples per device.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/results",  # checkpoints are written to Drive
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    learning_rate=5e-6,
    logging_steps=10,                # How often to log training progress
    save_steps=50,                   # How often to save a checkpoint
    save_total_limit=3,              # Keep at most this many checkpoints
    remove_unused_columns=False,
    fp16=True)                       # Mixed-precision (fp16) training

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            an array of logits with shape (n_samples, n_classes) or a tuple
            of arrays.

    Returns:
        dict: ``{"accuracy": <score returned by accuracy_score>}``.
    """
    predictions, labels = eval_pred

    # The Trainer hands over a tuple when the model returns more than the
    # logits. NOTE(review): the logits are presumably the first element -
    # confirm for this model.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # The model outputs logits, so we take the argmax to get the predicted class
    preds = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [None]:
# Sanity check: pull one collated batch from train_loader and print its shapes.
# NOTE(review): train_loader wraps the dataset object it was created with; the
# name train_dataset was re-assigned afterwards, so this batch may not come
# from the dataset given to the Trainer - confirm this is what you want to inspect.
batch = next(iter(train_loader))
print(batch['input_ids'].shape)   # [batch_size, seq_len]
print(batch['labels'].shape)      # [batch_size]


In [None]:
# Run the garbage collector and release cached GPU memory before training.
import torch, gc
gc.collect()
torch.cuda.empty_cache()


In [None]:
from transformers.trainer_utils import get_last_checkpoint

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Resume only when a checkpoint actually exists. resume_from_checkpoint=True
# raises an error when output_dir holds no checkpoint, which makes the very
# first run (or a run against a cleaned output folder) fail.
checkpoint_root = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)
# None starts training from scratch; a path resumes from that checkpoint.
trainer.train(resume_from_checkpoint=last_checkpoint)


In [None]:
# Save the PEFT model to Google Drive.
peft_model.save_pretrained("/content/drive/MyDrive/lora-adapters")

In [None]:
# Evaluate on the eval_dataset given to the Trainer and print the metrics dict.
evaluation_results = trainer.evaluate()

print(evaluation_results)