# NLP - DL

In [62]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('.')

import os
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader, TensorDataset
from torchviz import make_dot
import torch
from common.db import Neo4jDB
from lib.dl_utils import CustomDataset, compute_metrics

NEO4J_CONFIG = 'graph_db_ee'
NEO4J_DB='calllog'

neo4jdb = Neo4jDB(config_name=NEO4J_CONFIG, database=NEO4J_DB)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [33]:
cypher1 = '''MATCH (dr:DispatchRaw)<--(d:Dispatch)-->(f:Failure)
WHERE dr.classify3='(bootup)no power'
RETURN dr.contact_title AS contact_title,
       dr.contact_text AS contact_text,
       dr.classify3 AS classify3
LIMIT 300
'''
data1 = neo4jdb.run_cypher(cypher1)
data1.drop_duplicates(subset='contact_text', inplace=True)

cypher2 = '''MATCH (dr:DispatchRaw)<--(d:Dispatch)-->(f:Failure)
WHERE dr.classify3='(bsod)bsod'
RETURN dr.contact_title AS contact_title,
       dr.contact_text AS contact_text,
       dr.classify3 AS classify3
LIMIT 300
'''
data2 = neo4jdb.run_cypher(cypher2)
data2.drop_duplicates(subset='contact_text', inplace=True)

data = pd.concat([data1.sample(70), data2.sample(70)])
data.reset_index(drop=True, inplace=True)


### Load BERT Model

In [69]:
model_path = '/mnt/hdd1/Model/calllog_llm/transformer/roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_path)
base_model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=2)            
lora_config = LoraConfig(r = 32, lora_alpha = 64, lora_dropout = 0.05, task_type = 'SEQ_CLS')    
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /mnt/hdd1/Model/calllog_llm/transformer/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,771,778 || all params: 126,418,948 || trainable%: 1.4015130073697497


In [4]:
#Optional - Plot the model
dummy_input = torch.zeros((1, 512), dtype=torch.long)  # Adjust the sequence length as needed
logits = model(dummy_input).logits
graph = make_dot(logits, params=dict(model.named_parameters()))
graph.render(filename='roberta_model', format='png', cleanup=True)

'roberta_model.png'

### Training Data Preprocess

In [34]:
data['text'] = data.apply(lambda r: '. '.join([r['contact_title'], r['contact_text']]), axis=1)
labelEncoder = LabelEncoder()
data['label'] = labelEncoder.fit_transform(data['classify3']) 
data['encoding'] = data['text'].map(lambda x: tokenizer(x)['input_ids'])

In [39]:
data_tr, data_ts = train_test_split(data, test_size=20, random_state=8, stratify=data['label'])
data_tr, data_val = train_test_split(data_tr, test_size=20, random_state=8, stratify=data_tr['label'])

print(f'training data size:{len(data_tr)}')
print(f'validation data size:{len(data_val)}')
print(f'test data size:{len(data_ts)}')

train_dataset = CustomDataset(dataframe=data_tr, tokenizer=tokenizer)
eval_dataset = CustomDataset(dataframe=data_val, tokenizer=tokenizer)
test_dataset = CustomDataset(dataframe=data_ts, tokenizer=tokenizer)



training data size:100
validation data size:20
test data size:20


### Roberta Model training

In [70]:
#Training
output_dir = 'tmp'
training_args = TrainingArguments(output_dir=output_dir, 
                                  learning_rate = 1e-4,
                                  weight_decay = 0.01,
                                  per_device_train_batch_size=16, 
                                  per_device_eval_batch_size = 4,                                  
                                  evaluation_strategy="epoch", 
                                  save_strategy = 'epoch',
                                  load_best_model_at_end = True,
                                  num_train_epochs=10, 
                                  report_to=[])

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Acc,F1 Micro,F1 Macro,F1 Weighted
1,No log,0.69403,0.5,0.5,0.333333,0.333333
2,No log,0.691706,0.5,0.5,0.333333,0.333333
3,No log,0.689056,0.5,0.5,0.479167,0.479167
4,No log,0.686326,0.55,0.55,0.539642,0.539642
5,No log,0.683863,0.7,0.7,0.67033,0.67033
6,No log,0.681301,0.55,0.55,0.435737,0.435737
7,No log,0.678197,0.65,0.65,0.60114,0.60114
8,No log,0.675897,0.75,0.75,0.733333,0.733333
9,No log,0.674886,0.75,0.75,0.733333,0.733333
10,No log,0.67448,0.75,0.75,0.733333,0.733333


TrainOutput(global_step=70, training_loss=0.6883468082972936, metrics={'train_runtime': 44.2024, 'train_samples_per_second': 22.623, 'train_steps_per_second': 1.584, 'total_flos': 268553957376000.0, 'train_loss': 0.6883468082972936, 'epoch': 10.0})

### Model Performance Testing

In [71]:
predictions = trainer.predict(test_dataset)
list(predictions)[2]

{'test_loss': 0.677832305431366,
 'test_acc': 0.75,
 'test_f1_micro': 0.75,
 'test_f1_macro': 0.7493734335839599,
 'test_f1_weighted': 0.7493734335839598,
 'test_runtime': 0.338,
 'test_samples_per_second': 59.167,
 'test_steps_per_second': 14.792}

In [83]:
data_ts['predict']=np.argmax(predictions[0], axis=1)

In [92]:
data_ts['predict_classify'] = data_ts['predict'].map(lambda x: labelEncoder.classes_[x])

In [93]:
pd.set_option("display.max_colwidth", None)
data_ts[data_ts.apply(lambda r: r['label']!=r['predict'], axis=1)][['text','classify3','predict_classify']]

Unnamed: 0,text,classify3,predict_classify
8,Exg Collab Case Number: # 179294725. CMG call transfer to pro support PLUS cx is looking for tech support,(bootup)no power,(bsod)bsod
136,Dell Online MailInRepairRequest : Online. --Call Text Unavailable--,(bsod)bsod,(bootup)no power
86,"BIOS Setting JSRX9Y3. Operating System Errors and Blue Screen_x000D_\nNext Activity Added to this Call_Text!! 00T6P00000qw0fFUAQ _x000D_\nAdditional ToXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0D_CC: _x000D_BCC: _x000D_Attachment: _x000D__x000D_Subject: Dell Support: Case Number # 176775048 from Jacob [ ref:_00D0bGaMp._5006PEZMh5:ref ]_x000D_Body:_x000D_Case Number 176775048_x000D__x000D_Contact Us | Support Library | Download Center | Support Assist | Community Forums_x000D__x000D_** Please Use Reply to All when replying to this emailXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx000D__x000D_This emailXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXraction with Dell Technical Support._x000D__x000D_I have included your service request information below._x000D__x000D_Service Tag: JSRX9Y3_x000D_Case Number: 176775048_x000D__x000D_I will be your service request owner and primary point of contact until your Dell issue is completely resolved. If you require additional assistance while Im out of the office please refer to my emailXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXn. You may also contact me directly by replying all to this emailXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXthe standard warranty support options, the new AR Assistant app is available to help you service your system. For information about the AR Assistant, here is a brief video overview and the link to download the app is below._x000D__x000D_Now available on select Alienware, Chromebook, G3, Inspiron, Latitude, OptiPlex, Precision, Vostro and XPS systems._x000D__x000D_Dell AR Assistant for Android_x000D__x000D_Dell AR Assistant for iOS_x000D__x000D_Thank you for choosing Dell._x000D__x000D_Please do not change the subject line of the emailXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX0D_Senior Technician, Technical Support_x000D_Dell Technologies | Pro Support_x000D_Jacob.Jetty@Dell.com_x000D_Working Hours: Monday ‒ Friday | 10:30 ‒ 21:30 (CT)_x000D__x000D_My manager is robyn_seippel@dell.com Thanks!_x000D__x000D_ref:_00D0bGaMp._5006PEZMh5:ref",(bsod)bsod,(bootup)no power
10,"No Power. 699 Boulevard Cardinal Leger PINCOURT, QC J7W 6W9_x000D__x000D_No Power_x000D_Hard Reset - same issue_x000D_Power Button is ok_x000D_M-Bist - no light_x000D_Known Good Ac/AD - same issue_x000D_Cst ask for exchange_x000D_Cst is eligible for exchange - <30 days purchase_x000D__x000D_Luc Jellet_x000D_INFO-IT@VILLEPINCOURT.QC.CA",(bootup)no power,(bsod)bsod
51,"No Power. 문제 설명: No Power_x000D_문제 유형: Hardware_x000D_서비스 유형: Parts & Labor_x000D_부품 배송 중: MB_x000D_부품 교체 지침: Replace Parts as Needed_x000D_Resolution Type: Dispatch_x000D__x000D__x000D_Notes (Troubleshooting): - 전원 LED = OFF_x000D_- 어댑터 LED = White Solid_x000D_- 어댑터 SWAP = 증상 동일_x000D_- 진단등 LED = OFF_x000D_- 지난 주 금요일 새제품 오픈 후, 금일 재부팅 시 No Power 발생_x000D_- 잔류전원 제거 = N/A : NCTT_x000D_- RTC 리셋 = 증상 동일_x000D_- MB 접수 안내_x000D__x000D_Resolution Type: Dispatch_x000D__x000D__x000D_N/A_x000D_N/A_x000D_Dell Smart ID 확인 방법 (DSID): PPID / Tag_x000D_Requested Follow-Up Type(s): 이메일, 전화_x000D_잠재적 데이터 손실 알림: No_x000D_반품 인식_x000D_지원 정책 범위 준수: In Scope_x000D_다른 결함이 발견되지 않은 경우 확인(VNOFF)_x000D_고객 만족도: OK_x000D_#Logger | Case Log | APJ | Client | Logger#",(bootup)no power,(bsod)bsod
