## Import needed dependencies

In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel

## Import and process dataset

### Input data are avaiable in following location:  
AWS S3 bucket  
Github /data folder  
RDS MySQL database table

In [95]:
df_mct_raw_imp=pd.read_csv('WETG_AI_ML_BUG_AUTO_MCT_Jira_Github_Commit_Dataset.csv' ,
                      usecols=["jira_ticket","filename","repo","summary","description","environment"]
                      ) 

### In case there is any, remove records with missing 'Summary' (non-bug tickets)

In [96]:
df_mct_raw_imp = df_mct_raw_imp[~df_mct_raw_imp['summary'].isnull()]

In [97]:
df_mct_raw_imp["filename_all"] = df_mct_raw_imp["repo"] + '-' + df_mct_raw_imp["filename"]

In [98]:
df_mct_raw_imp["path"] = ''
for iter in range(len(df_mct_raw_imp)):
    df_mct_raw_imp.loc[iter,"path"] = df_mct_raw_imp.loc[iter,"repo"] + "\\" + df_mct_raw_imp.loc[iter,"filename"][0:df_mct_raw_imp.loc[iter,"filename"].rfind("/")]   

In [99]:
df_mct_raw_imp['environment'] = df_mct_raw_imp['environment'].str.upper()

In [100]:
df_mct_raw_imp = df_mct_raw_imp.drop(["repo"],axis=1)

In [101]:
df_mct_raw_imp.shape

(2746, 7)

In [102]:
df_mct_raw_imp.head(2)

Unnamed: 0,jira_ticket,filename,summary,description,environment,filename_all,path
0,MCT-1157,src/pages/SearchResultsPage/PlanCardParts/Head...,"Fix ""Add to Compare"" button state",follow up to MCT-960\r\n\r\nProblem\r\n !image...,TEST,coverage-tools-frontend-src/pages/SearchResult...,coverage-tools-frontend\src/pages/SearchResult...
1,MCT-1157,src/styles/pages/SearchResultsPage/_PlanCard.scss,"Fix ""Add to Compare"" button state",follow up to MCT-960\r\n\r\nProblem\r\n !image...,TEST,coverage-tools-frontend-src/styles/pages/Searc...,coverage-tools-frontend\src/styles/pages/Searc...


In [103]:
df_mct_raw_imp["environment"].value_counts()

PROD    1337
IMP      768
DEV      384
TEST     257
Name: environment, dtype: int64

### Only keep records of the following repositories

In [94]:
# df_mct_raw_imp = df_mct_raw_imp.loc[(df_mct_raw_imp["repo"].isin(['coverage-tools-frontend','coverage-tools-api','operations']))]

### Import Jira ticket assessment data

In [50]:
df_mct_scores=pd.read_csv('WETG_AI_ML_BUG_AUTO_MCT_Jira_Ticket_Assessment.csv', 
                      usecols=["jira_ticket","score"]
                      ) 

In [51]:
df_mct_scores.head(3)

Unnamed: 0,jira_ticket,score
0,MCT-4890,4.0
1,MCT-4889,5.0
2,MCT-4888,6.0


### Choose Jira tickets with assessment score of 3 or higher

In [52]:
df_mct_scores = df_mct_scores[(df_mct_scores["score"] >= 3)]

### Merge with dataset to create eligible analysis data

In [54]:
df_mct_raw = pd.merge(df_mct_raw_imp,df_mct_scores,on='jira_ticket',how='inner').drop(["score"],axis=1)

In [55]:
df_mct_raw.head(2)

Unnamed: 0,jira_ticket,filename,repo,summary,description,environment,filename_all,path
0,MCT-1157,src/pages/SearchResultsPage/PlanCardParts/Head...,coverage-tools-frontend,"Fix ""Add to Compare"" button state",follow up to MCT-960\r\n\r\nProblem\r\n !image...,TEST,coverage-tools-frontend-src/pages/SearchResult...,coverage-tools-frontend\src/pages/SearchResult...
1,MCT-1157,src/styles/pages/SearchResultsPage/_PlanCard.scss,coverage-tools-frontend,"Fix ""Add to Compare"" button state",follow up to MCT-960\r\n\r\nProblem\r\n !image...,TEST,coverage-tools-frontend-src/styles/pages/Searc...,coverage-tools-frontend\src/styles/pages/Searc...


In [56]:
df_mct_raw.value_counts('jira_ticket')

jira_ticket
MCT-6796    60
MCT-3192    50
MCT-4540    36
MCT-2108    30
MCT-2226    28
            ..
MCT-2909     1
MCT-2924     1
MCT-4931     1
MCT-2972     1
MCT-2851     1
Length: 252, dtype: int64

### Assign test classification flags based on the location of files

In [57]:
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/'),'otherpages']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/EnrollmentFormPage/'),'enrollmentformpage']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/ManagePrescriptionsPage/'),'manageprescriptionspage']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/PharmacySelectionPage/'),'pharmacyselectionpage']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/PlanComparisonPage/'),'plancomparisonpage']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/PlanDetailsPage/'),'plandetailspage']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/QuestionRoutingPage/'),'questionroutingpage']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/pages/SearchResultsPage/'),'searchresultspage']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/styles'),'styles']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/components/'),'components']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/app/'),'app']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/types/'),'types']=int(bool(1))
df_mct_raw.loc[df_mct_raw.filename.str.contains('src/translations/'),'translations']=int(bool(1))

### Inspect the first rows of the data file

In [59]:
df_mct_raw.head(3)

Unnamed: 0,jira_ticket,filename,repo,summary,description,environment,filename_all,path,otherpages,enrollmentformpage,...,pharmacyselectionpage,plancomparisonpage,plandetailspage,questionroutingpage,searchresultspage,styles,components,app,types,translations
0,MCT-1157,src/pages/SearchResultsPage/PlanCardParts/Head...,coverage-tools-frontend,"Fix ""Add to Compare"" button state",follow up to MCT-960\r\n\r\nProblem\r\n !image...,TEST,coverage-tools-frontend-src/pages/SearchResult...,coverage-tools-frontend\src/pages/SearchResult...,1.0,,...,,,,,1.0,,,,,
1,MCT-1157,src/styles/pages/SearchResultsPage/_PlanCard.scss,coverage-tools-frontend,"Fix ""Add to Compare"" button state",follow up to MCT-960\r\n\r\nProblem\r\n !image...,TEST,coverage-tools-frontend-src/styles/pages/Searc...,coverage-tools-frontend\src/styles/pages/Searc...,,,...,,,,,,1.0,,,,
2,MCT-1400,src/components/DOBValidator.tsx,coverage-tools-frontend,Missing Continue Button for under age DoB,*Acceptance Criteria*\r\n# show the continue b...,TEST,coverage-tools-frontend-src/components/DOBVali...,coverage-tools-frontend\src/components,,,...,,,,,,,1.0,,,


In [60]:
df_mct_raw.shape

(1420, 21)

### Retain records with at least one flag set to 1

In [61]:
df_training=df_mct_raw.loc[~((df_mct_raw.otherpages.isna())\
             &(df_mct_raw.enrollmentformpage.isna())\
             &(df_mct_raw.manageprescriptionspage.isna())\
             &(df_mct_raw.pharmacyselectionpage.isna())\
             &(df_mct_raw.plancomparisonpage.isna())\
             &(df_mct_raw.plandetailspage.isna())\
             &(df_mct_raw.questionroutingpage.isna())\
             &(df_mct_raw.searchresultspage.isna())\
             &(df_mct_raw.app.isna())\
             &(df_mct_raw.styles.isna())\
             &(df_mct_raw.components.isna())\
             &(df_mct_raw.translations.isna())\
             &(df_mct_raw.types.isna()))]

In [62]:
df_training.shape

(831, 21)

In [63]:
df_training['otherpages'].value_counts()

1.0    510
Name: otherpages, dtype: int64

### Set the flag of the catch-all class 'otherpages'

In [65]:
df_training.loc[~((df_training.enrollmentformpage.isna())\
             &(df_training.manageprescriptionspage.isna())\
             &(df_training.pharmacyselectionpage.isna())\
             &(df_training.plancomparisonpage.isna())\
             &(df_training.plandetailspage.isna())\
             &(df_training.questionroutingpage.isna())\
             &(df_training.searchresultspage.isna())),'otherpages'] = np.nan

In [66]:
df_training.shape

(831, 21)

In [72]:
# df_training.head(3)

### Create the dataframe of Jira tickets

In [73]:
df_training_grp = df_training.loc[:,['jira_ticket', 'summary', 'description','environment',
                                 'otherpages','enrollmentformpage',
                                 'manageprescriptionspage', 'pharmacyselectionpage', 'plancomparisonpage', 
                                 'plandetailspage', 'questionroutingpage', 'searchresultspage', 
                                 'styles','components', 'app', 'translations', 'types'
                                ]].groupby(['jira_ticket','summary','description','environment'],).max().reset_index().fillna(0)

In [74]:
df_training_grp.shape

(198, 17)

In [77]:
df_training_grp.head(3)

Unnamed: 0,jira_ticket,summary,description,environment,otherpages,enrollmentformpage,manageprescriptionspage,pharmacyselectionpage,plancomparisonpage,plandetailspage,questionroutingpage,searchresultspage,styles,components,app,translations,types,text,labels
0,MCT-1157,"Fix ""Add to Compare"" button state",follow up to MCT-960\r\n\r\nProblem\r\n !image...,TEST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,"Fix ""Add to Compare"" button statefollow up to ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,MCT-1400,Missing Continue Button for under age DoB,*Acceptance Criteria*\r\n# show the continue b...,TEST,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Missing Continue Button for under age DoB*Acce...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,MCT-1412,Apply form validation on Date of Birth Entry,Acceptance Criteria\r\n * Implement the DOB va...,PROD,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,Apply form validation on Date of Birth EntryAc...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


### Create column of 'text' which combines 'Summary', 'Description', and 'Environment'

In [75]:
df_training_grp['text']=df_training_grp['summary'].apply(lambda x: x.replace('\n', ' '))+df_training_grp['description'].apply(lambda x: x.replace('\n', ' '))+' Environment: '+df_training_grp['environment']

### Create column of 'labels'

In [76]:
df_training_grp['labels'] = list(zip(df_training_grp.otherpages.tolist(), df_training_grp.enrollmentformpage.tolist(), df_training_grp.manageprescriptionspage.tolist(), df_training_grp.pharmacyselectionpage.tolist(), df_training_grp.plancomparisonpage.tolist(), df_training_grp.plandetailspage.tolist(), df_training_grp.questionroutingpage.tolist(), df_training_grp.searchresultspage.tolist(), df_training_grp.app.tolist(), df_training_grp.styles.tolist(), df_training_grp.components.tolist(), df_training_grp.translations.tolist(), df_training_grp.types.tolist()))

## Split Training and Testing datasets

In [78]:
df_training_grp = df_training_grp.drop(['summary','description','environment'],axis=1)

In [79]:
train_df, eval_df = train_test_split(df_training_grp, test_size=0.2)

In [81]:
print("Number of training Jira tickets:", len(train_df))

Number of training Jira tickets: 158


In [82]:
print("Number of testing Jira tickets:", len(eval_df))

Number of testing Jira tickets: 40


## ML Data Model

### Define model with parameters

In [148]:
model = MultiLabelClassificationModel('roberta', 
                                      'roberta-base', 
                                      num_labels=13, 
                                      use_cuda= False,
                                      args={'train_batch_size':2, 
                                            'gradient_accumulation_steps':16, 
                                            'learning_rate': 3e-5, 
                                            'num_train_epochs': 3, 
                                            'max_seq_length': 512,
                                            'overwrite_output_dir': True
                                           })


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultiLabelSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'clas

### Train the model

In [149]:
model.train_model(train_df)

  0%|          | 0/158 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/79 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/79 [00:00<?, ?it/s]

(12, 0.730195924329261)

### Predict

In [150]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

  0%|          | 0/40 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/5 [00:00<?, ?it/s]

### Print the result

In [152]:
print(result)

{'LRAP': 0.5951749847374849, 'eval_loss': 0.47983573079109193}


In [154]:
df_output.shape

(40, 13)

In [155]:
df_output.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.442523,0.228741,0.399201,0.229523,0.329921,0.299142,0.324829,0.288819,0.32521,0.342028,0.416767,0.311485,0.251377
1,0.436161,0.237826,0.405997,0.246122,0.329917,0.293126,0.337661,0.295573,0.353015,0.356084,0.420489,0.314255,0.256113
2,0.437983,0.231955,0.39986,0.236711,0.326625,0.305343,0.315295,0.314519,0.335167,0.350305,0.41122,0.317493,0.261053
3,0.442591,0.223254,0.398219,0.229002,0.339429,0.302641,0.311994,0.300681,0.31675,0.350883,0.418444,0.314501,0.258385
4,0.437889,0.230639,0.407529,0.240378,0.324621,0.300937,0.310884,0.297628,0.319554,0.336728,0.411755,0.3125,0.2624


In [156]:
print(model_outputs)

[[0.44252273 0.22874144 0.39920118 0.22952303 0.3299205  0.29914239
  0.32482901 0.28881904 0.32520977 0.34202823 0.416767   0.31148538
  0.25137681]
 [0.43616074 0.23782632 0.40599662 0.24612218 0.32991663 0.29312578
  0.33766127 0.29557261 0.35301483 0.35608357 0.42048869 0.31425533
  0.25611338]
 [0.43798292 0.23195538 0.39986011 0.23671065 0.32662466 0.30534318
  0.31529531 0.31451946 0.33516687 0.35030547 0.41122031 0.31749281
  0.26105258]
 [0.44259095 0.223254   0.39821947 0.22900222 0.33942878 0.30264056
  0.31199431 0.30068138 0.31675002 0.35088331 0.4184435  0.31450117
  0.25838476]
 [0.43788871 0.2306388  0.40752941 0.24037832 0.32462108 0.30093721
  0.31088442 0.29762846 0.31955364 0.33672833 0.41175511 0.3124997
  0.26240033]
 [0.43472815 0.22925316 0.40855953 0.24765542 0.32714573 0.30330795
  0.32001552 0.29683894 0.32945323 0.34241799 0.41217744 0.30976257
  0.26502293]
 [0.44065064 0.2306     0.40413314 0.23950642 0.32009551 0.31356391
  0.32930526 0.30847546 0.3379110