# Preparing data for NN
The software metrics need to be linked to the issue tags of each commit via their identifiers.

In [1]:
import pandas as pd
import numpy as np
import os
import ast
from collections import Counter

In [2]:
# data import: resolve the project data folder relative to the current working directory
current_dir = os.getcwd()
data_root = os.path.join(current_dir, '..', '..', 'Data')

# load SonarQube measure data
data_dir = os.path.join(data_root, 'Sonar_Measures')
measures_csv = os.path.join(data_dir, 'sonar_measures_v1_v2_no_statics.csv')
dfm = pd.read_csv(measures_csv, low_memory=False)

# load the issue-tag tables (one file per database version)
# NOTE: data_dir keeps pointing at the Sonar_Issues folder after this cell
data_dir = os.path.join(data_root, 'Sonar_Issues')
dfi1 = pd.read_csv(os.path.join(data_dir, 'tags_commithash_v1.csv'))
dfi2 = pd.read_csv(os.path.join(data_dir, 'tags_analysiskey_v2.csv'))

In [3]:
# preview the loaded SonarQube measures frame
dfm

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,QUALITY_PROFILES,NEW_SQALE_DEBT_RATIO,VULNERABILITIES,RELIABILITY_REMEDIATION_EFFORT,RELIABILITY_RATING,SECURITY_REMEDIATION_EFFORT,SECURITY_RATING,database,ANALYSIS_KEY,DIRECTORIES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,"[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.000000,838,7322,5,9505,4,Version1,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,"[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,Version1,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,"[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,Version1,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,"[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.674560,838,7322,5,9505,4,Version1,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,"[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.671668,838,7322,5,9505,4,Version1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,...,"[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.025316,1,0,1,30,4,Version2,AV4Z-nYtG0XxMwG_Vh5b,14.0
140744,,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,...,"[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.184236,1,0,1,30,4,Version2,AV4Z-jElG0XxMwG_Vh5F,14.0
140745,,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,"[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.613333,1,0,1,30,4,Version2,AV4Z-fA0G0XxMwG_Vh47,14.0
140746,,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,"[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.613333,1,0,1,30,4,Version2,AV4Z-bDZG0XxMwG_Vh4x,14.0


## Fixing project names
The Apache Commons projects are named differently in the two database versions: one version prefixes the project ID with 'commons-', the other does not. The names are unified by removing 'commons-' from the project IDs.

In [4]:
# list the distinct project IDs before any renaming
dfm['PROJECT_ID'].unique()

array(['accumulo', 'ambari', 'atlas', 'aurora', 'batik', 'beam', 'cocoon',
       'commons-bcel', 'commons-beanutils', 'commons-cli',
       'commons-codec', 'commons-collections', 'commons-configuration',
       'commons-daemon', 'commons-dbcp', 'commons-dbutils',
       'commons-digester', 'commons-exec', 'commons-fileupload',
       'commons-io', 'commons-jelly', 'commons-jexl', 'commons-jxpath',
       'commons-net', 'commons-ognl', 'commons-validator', 'commons-vfs',
       'felix', 'httpcomponents-client', 'httpcomponents-core',
       'mina-sshd', 'santuario', 'zookeeper', 'cayenne', 'archiva',
       'bcel', 'beanutils', 'codec', 'collections', 'configuration',
       'daemon', 'dbcp', 'dbutils', 'digester', 'hive', 'httpclient',
       'httpcore', 'jxpath', 'net', 'ognl', 'thrift', 'validator', 'vfs'],
      dtype=object)

In [5]:
# Remove the literal 'commons-' marker so both database versions share one naming scheme.
# regex=False makes the literal match explicit instead of relying on the pandas default,
# which differs between pandas versions.
# NOTE(review): 'httpcomponents-client'/'httpclient' and 'httpcomponents-core'/'httpcore'
# stay separate IDs after this step - confirm whether they denote the same projects.
dfm['PROJECT_ID'] = dfm['PROJECT_ID'].str.replace('commons-', '', regex=False)

# How many unique projects are in the data? - 39.
len(dfm['PROJECT_ID'].unique())

39

In [6]:
# list the database versions present in the measures data
dfm['database'].unique()

array(['Version1', 'Version2'], dtype=object)

In [7]:
# How many unique projects are there for database version 1? - 33.
is_version1 = dfm['database'] == 'Version1'
len(dfm.loc[is_version1, 'PROJECT_ID'].unique())

33

In [8]:
# How many unique projects are there for database version 2? - 30.
is_version2 = dfm['database'] == 'Version2'
len(dfm.loc[is_version2, 'PROJECT_ID'].unique())

30

In [9]:
# keep only the join key and the tag list, and rename the tag column to TAGS
# .copy() yields an independent frame, so later column assignments do not write into a
# slice of the raw table (which would raise SettingWithCopyWarning)
dfi1 = (
    dfi1[['creationCommitHash', 'uniqueTags']]
    .copy()
    .rename(columns={'uniqueTags': 'TAGS'})
)
dfi1

Unnamed: 0,creationCommitHash,TAGS
0,0001f90914b418859eb9fa86903e89a793e48e9b,"['error-handling', 'cwe', 'design', 'cert', 'c..."
1,0010fcad01ce4ac5051a61bec349bd6ac397c994,['brain-overload']
2,001e87db6432867d72f59553359d0e4cee5cd31d,['convention']
3,002e4de0d68ffab9af339283f02659472a1febe7,"['error-handling', 'cwe', 'cert', 'design']"
4,002f40f9d3ec65891b5faf086df404dd3c450600,"['error-handling', 'brain-overload', 'cwe', 'd..."
...,...,...
9437,ffb2c376f08ea23854f4fad4933005b34827d5a7,"['brain-overload', 'misra', 'bad-practice', 'u..."
9438,ffcdbb17689ed325715a43dc407ca20348277a5a,['clumsy']
9439,ffd4863413cdb91db6f21744ada1d0b4a1559626,"['error-handling', 'cwe', 'convention', 'cert'..."
9440,ffd8e16a8a418dce679e07a973d6ff9bb5b304aa,"['error-handling', 'brain-overload', 'clumsy',..."


In [10]:
# transform tags to strings: each value is the text form of a Python list
# (e.g. "['cwe', 'cert']"); parse it and flatten it into one comma-separated string.
# assign() builds a new frame instead of writing into a column subset, which avoids
# SettingWithCopyWarning.
dfi1 = dfi1.assign(
    TAGS=dfi1['TAGS'].apply(lambda raw_tags: ', '.join(ast.literal_eval(raw_tags)))
)
dfi1

Unnamed: 0,creationCommitHash,TAGS
0,0001f90914b418859eb9fa86903e89a793e48e9b,"error-handling, cwe, design, cert, convention"
1,0010fcad01ce4ac5051a61bec349bd6ac397c994,brain-overload
2,001e87db6432867d72f59553359d0e4cee5cd31d,convention
3,002e4de0d68ffab9af339283f02659472a1febe7,"error-handling, cwe, cert, design"
4,002f40f9d3ec65891b5faf086df404dd3c450600,"error-handling, brain-overload, cwe, design, c..."
...,...,...
9437,ffb2c376f08ea23854f4fad4933005b34827d5a7,"brain-overload, misra, bad-practice, unused, cert"
9438,ffcdbb17689ed325715a43dc407ca20348277a5a,clumsy
9439,ffd4863413cdb91db6f21744ada1d0b4a1559626,"error-handling, cwe, convention, cert, suspicious"
9440,ffd8e16a8a418dce679e07a973d6ff9bb5b304aa,"error-handling, brain-overload, clumsy, cwe, d..."


In [11]:
# inspect one raw value: the tags are stored as the string form of a Python list
dfi2['UNIQUE_TAGS'].iloc[0]

"['suspicious', 'error-handling', 'cert', 'bad-practice']"

In [12]:
# keep only the join key and the tag list
# .copy() yields an independent frame, so adding columns later does not write into a
# slice of the raw table (which would raise SettingWithCopyWarning)
dfi2 = dfi2[['CREATION_ANALYSIS_KEY', 'UNIQUE_TAGS']].copy()
dfi2

Unnamed: 0,CREATION_ANALYSIS_KEY,UNIQUE_TAGS
0,AV0-6-3jt6tne_r58pUF,"['suspicious', 'error-handling', 'cert', 'bad-..."
1,AV0-8Tovt6tne_r58pUY,['clumsy']
2,AV0-9p72t6tne_r58pUo,['bad-practice']
3,AV0-BQ2Et6tne_r58pCR,['design']
4,AV0-LoN2t6tne_r58pFZ,"['pitfall', 'cert']"
...,...,...
9688,AWentw92Z1vVHhGU4u0K,"['error-handling', 'confusing', 'bad-practice'..."
9689,AWenw12KZ1vVHhGU4vEw,"['convention', 'pitfall']"
9690,AWenwUBsZ1vVHhGU4vAM,"['convention', 'brain-overload', 'cert', 'bad-..."
9691,AWenyFZ8Z1vVHhGU4vLr,"['error-handling', 'cwe', 'cert', 'bad-practice']"


In [13]:
# parse the stringified tag lists and flatten each into one comma-separated string,
# then drop the raw column; assign()/drop() return new frames, so nothing is written
# into a column subset of the original table
dfi2 = (
    dfi2
    .assign(TAGS=dfi2['UNIQUE_TAGS'].apply(lambda raw_tags: ', '.join(ast.literal_eval(raw_tags))))
    .drop(columns='UNIQUE_TAGS')
)
dfi2

Unnamed: 0,CREATION_ANALYSIS_KEY,TAGS
0,AV0-6-3jt6tne_r58pUF,"suspicious, error-handling, cert, bad-practice"
1,AV0-8Tovt6tne_r58pUY,clumsy
2,AV0-9p72t6tne_r58pUo,bad-practice
3,AV0-BQ2Et6tne_r58pCR,design
4,AV0-LoN2t6tne_r58pFZ,"pitfall, cert"
...,...,...
9688,AWentw92Z1vVHhGU4u0K,"error-handling, confusing, bad-practice, cert,..."
9689,AWenw12KZ1vVHhGU4vEw,"convention, pitfall"
9690,AWenwUBsZ1vVHhGU4vAM,"convention, brain-overload, cert, bad-practice"
9691,AWenyFZ8Z1vVHhGU4vLr,"error-handling, cwe, cert, bad-practice"


In [14]:
# attach the version-1 tags to the measures via the commit hash
# pandas matches NaN keys with each other, so tag rows without a hash are dropped first;
# otherwise they would attach to every measure row that has no COMMIT_HASH
dfm_i = dfm.merge(
    dfi1.dropna(subset=['creationCommitHash']),
    how='left',
    left_on='COMMIT_HASH',
    right_on='creationCommitHash',
)

# a left join must not multiply measure rows; this fails if a hash occurs more than once in dfi1
assert len(dfm_i) == len(dfm), 'duplicate commit hashes in dfi1 multiplied measure rows'
dfm_i

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,VULNERABILITIES,RELIABILITY_REMEDIATION_EFFORT,RELIABILITY_RATING,SECURITY_REMEDIATION_EFFORT,SECURITY_RATING,database,ANALYSIS_KEY,DIRECTORIES,creationCommitHash,TAGS
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,838,7322,5,9505,4,Version1,,,e0880e263e4bf8662ba3848405200473a25dfc9f,"cwe, bad-practice, unused, cert, suspicious, j..."
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,838,7081,5,9505,4,Version1,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,838,7081,5,9505,4,Version1,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,838,7322,5,9505,4,Version1,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,838,7322,5,9505,4,Version1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,...,1,0,1,30,4,Version2,AV4Z-nYtG0XxMwG_Vh5b,14.0,,
140744,,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,...,1,0,1,30,4,Version2,AV4Z-jElG0XxMwG_Vh5F,14.0,,
140745,,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,1,0,1,30,4,Version2,AV4Z-fA0G0XxMwG_Vh47,14.0,,
140746,,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,1,0,1,30,4,Version2,AV4Z-bDZG0XxMwG_Vh4x,14.0,,


In [15]:
# attach the version-2 tags via the analysis key; both frames carry a TAGS column,
# so pandas suffixes them as TAGS_x (version 1) and TAGS_y (version 2)
# pandas matches NaN keys with each other, so tag rows without a key are dropped first;
# otherwise they would attach to every measure row that has no ANALYSIS_KEY
rows_before_merge = len(dfm_i)
dfm_i = dfm_i.merge(
    dfi2.dropna(subset=['CREATION_ANALYSIS_KEY']),
    how='left',
    left_on='ANALYSIS_KEY',
    right_on='CREATION_ANALYSIS_KEY',
)

# a left join must not multiply measure rows; this fails if a key occurs more than once in dfi2
assert len(dfm_i) == rows_before_merge, 'duplicate analysis keys in dfi2 multiplied measure rows'
dfm_i

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,RELIABILITY_RATING,SECURITY_REMEDIATION_EFFORT,SECURITY_RATING,database,ANALYSIS_KEY,DIRECTORIES,creationCommitHash,TAGS_x,CREATION_ANALYSIS_KEY,TAGS_y
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,5,9505,4,Version1,,,e0880e263e4bf8662ba3848405200473a25dfc9f,"cwe, bad-practice, unused, cert, suspicious, j...",,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,5,9505,4,Version1,,,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,5,9505,4,Version1,,,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,5,9505,4,Version1,,,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,5,9505,4,Version1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,...,1,30,4,Version2,AV4Z-nYtG0XxMwG_Vh5b,14.0,,,AV4Z-nYtG0XxMwG_Vh5b,"suspicious, error-handling, cert"
140744,,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,...,1,30,4,Version2,AV4Z-jElG0XxMwG_Vh5F,14.0,,,AV4Z-jElG0XxMwG_Vh5F,"suspicious, error-handling, cert"
140745,,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,1,30,4,Version2,AV4Z-fA0G0XxMwG_Vh47,14.0,,,,
140746,,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,1,30,4,Version2,AV4Z-bDZG0XxMwG_Vh4x,14.0,,,,


In [16]:
# count missing values per column
missing_values = dfm_i.isnull().sum()
print("Missing Values per Column:")

# option_context lifts the row limit only for this print and restores the previous
# setting afterwards, even if printing fails
with pd.option_context('display.max_rows', None):
    print(missing_values)

Missing Values per Column:
COMMIT_HASH                                  63079
PROJECT_ID                                       0
SQ_ANALYSIS_DATE                                 0
CLASSES                                         20
FILES                                           18
FUNCTIONS                                       20
COMMENT_LINES                                   18
COMMENT_LINES_DENSITY                           18
COMPLEXITY                                      20
FILE_COMPLEXITY                                 20
CLASS_COMPLEXITY                                20
FUNCTION_COMPLEXITY                             20
FUNCTION_COMPLEXITY_DISTRIBUTION              9875
FILE_COMPLEXITY_DISTRIBUTION                    20
DUPLICATED_LINES                                 0
DUPLICATED_BLOCKS                                0
DUPLICATED_FILES                                 0
DUPLICATED_LINES_DENSITY                        18
VIOLATIONS                                       0
BLOC

Is there overlap of tags for the same analysis?

In [17]:
# rows that received tags from both sources; the result is empty, so there is no overlap
has_v1_tags = dfm_i['TAGS_x'].notnull()
has_v2_tags = dfm_i['TAGS_y'].notnull()
dfm_i[has_v1_tags & has_v2_tags]

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,RELIABILITY_RATING,SECURITY_REMEDIATION_EFFORT,SECURITY_RATING,database,ANALYSIS_KEY,DIRECTORIES,creationCommitHash,TAGS_x,CREATION_ANALYSIS_KEY,TAGS_y


## Bringing together the tags
Joining the dataframe containing the SonarQube measures with the two tag dataframes (one per database version) produces two separate tag columns, TAGS_x and TAGS_y. They need to be merged into a single TAGS column in order to extract the unique analyses for which there are new code smells.

In [18]:
# coalesce the two tag columns: take TAGS_x where present, otherwise fall back to TAGS_y
dfm_i['TAGS'] = dfm_i['TAGS_x'].fillna(dfm_i['TAGS_y'])

In [19]:
# count missing values per column
missing_values = dfm_i.isnull().sum()
print("Missing Values per Column:")

# option_context lifts the row limit only for this print and restores the previous
# setting afterwards, even if printing fails
with pd.option_context('display.max_rows', None):
    print(missing_values)

Missing Values per Column:
COMMIT_HASH                                  63079
PROJECT_ID                                       0
SQ_ANALYSIS_DATE                                 0
CLASSES                                         20
FILES                                           18
FUNCTIONS                                       20
COMMENT_LINES                                   18
COMMENT_LINES_DENSITY                           18
COMPLEXITY                                      20
FILE_COMPLEXITY                                 20
CLASS_COMPLEXITY                                20
FUNCTION_COMPLEXITY                             20
FUNCTION_COMPLEXITY_DISTRIBUTION              9875
FILE_COMPLEXITY_DISTRIBUTION                    20
DUPLICATED_LINES                                 0
DUPLICATED_BLOCKS                                0
DUPLICATED_FILES                                 0
DUPLICATED_LINES_DENSITY                        18
VIOLATIONS                                       0
BLOC

## Select variables for model and drop duplicated rows
As for the prediction of the number of code smells, the same variables are selected, except for the code smells themselves. The identifier columns and the analysis dates are kept for now so that duplicated rows can be removed.

In [20]:
# project, date and identifier columns (the identifiers are still needed to detect duplicates)
context_columns = ['PROJECT_ID', 'SQ_ANALYSIS_DATE', 'COMMIT_HASH', 'ANALYSIS_KEY']

# software metrics selected for the model
metric_columns = [
    'CLASSES', 'FILES', 'LINES', 'NCLOC', 'PACKAGE', 'STATEMENTS',
    'FUNCTIONS', 'COMMENT_LINES',
    'COMPLEXITY', 'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY', 'COGNITIVE_COMPLEXITY',
    'LINES_TO_COVER', 'UNCOVERED_LINES',
    'DUPLICATED_LINES', 'DUPLICATED_BLOCKS', 'DUPLICATED_FILES',
    'COMMENT_LINES_DENSITY', 'DUPLICATED_LINES_DENSITY',
]

# the combined tag column comes last
variable_list = context_columns + metric_columns + ['TAGS']

dfm_i = dfm_i[variable_list]
dfm_i

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,COMMIT_HASH,ANALYSIS_KEY,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
0,accumulo,2008-07-07 14:52:05,e0880e263e4bf8662ba3848405200473a25dfc9f,,2108.0,1103.0,263680.0,203873.0,164,105125.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,"cwe, bad-practice, unused, cert, suspicious, j..."
1,accumulo,2008-07-07 12:31:47,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,,2108.0,1103.0,262753.0,202956.0,164,105125.0,...,2.5,39453.0,121105.0,121105.0,46301,2408,205,6.2,17.6,
2,accumulo,2008-07-05 18:54:27,2032ebbd0ed90734da39ca238bbd10dee24d0030,,2108.0,1103.0,262753.0,202956.0,164,105125.0,...,2.5,39453.0,121105.0,121105.0,46301,2408,205,6.2,17.6,
3,accumulo,2008-07-03 20:21:40,de297d4932e08625a5df146f0802041bb5aeb892,,2108.0,1103.0,263643.0,203841.0,164,105125.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,
4,accumulo,2008-07-02 00:12:36,34efaae87639a83b60fdb7274de4b45051025a3a,,2108.0,1103.0,263639.0,203837.0,164,105125.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,vfs,2002-08-20 06:10:50,,AV4Z-nYtG0XxMwG_Vh5b,69.0,65.0,8859.0,4814.0,14,1787.0,...,1.6,445.0,2124.0,2124.0,66,4,1,24.2,0.7,"suspicious, error-handling, cert"
140744,vfs,2002-08-20 02:57:02,,AV4Z-jElG0XxMwG_Vh5F,69.0,65.0,8867.0,4835.0,14,1794.0,...,1.6,449.0,2131.0,2131.0,66,4,1,24.1,0.7,"suspicious, error-handling, cert"
140745,vfs,2002-07-19 11:54:15,,AV4Z-fA0G0XxMwG_Vh47,69.0,65.0,8771.0,4761.0,14,1778.0,...,1.6,442.0,2104.0,2104.0,66,4,1,24.1,0.8,
140746,vfs,2002-07-18 16:47:24,,AV4Z-bDZG0XxMwG_Vh4x,69.0,65.0,8771.0,4761.0,14,1778.0,...,1.6,442.0,2104.0,2104.0,66,4,1,24.1,0.8,


In [21]:
# remove exact duplicate rows while the identifiers are still present,
# then drop the identifier columns
dfm_i = (
    dfm_i
    .drop_duplicates()
    .drop(columns=['COMMIT_HASH', 'ANALYSIS_KEY'])
)
dfm_i

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
0,accumulo,2008-07-07 14:52:05,2108.0,1103.0,263680.0,203873.0,164,105125.0,17295.0,13509.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,"cwe, bad-practice, unused, cert, suspicious, j..."
1,accumulo,2008-07-07 12:31:47,2108.0,1103.0,262753.0,202956.0,164,105125.0,17295.0,13507.0,...,2.5,39453.0,121105.0,121105.0,46301,2408,205,6.2,17.6,
2,accumulo,2008-07-05 18:54:27,2108.0,1103.0,262753.0,202956.0,164,105125.0,17295.0,13507.0,...,2.5,39453.0,121105.0,121105.0,46301,2408,205,6.2,17.6,
3,accumulo,2008-07-03 20:21:40,2108.0,1103.0,263643.0,203841.0,164,105125.0,17295.0,13507.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,
4,accumulo,2008-07-02 00:12:36,2108.0,1103.0,263639.0,203837.0,164,105125.0,17295.0,13507.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,vfs,2002-08-20 06:10:50,69.0,65.0,8859.0,4814.0,14,1787.0,425.0,1536.0,...,1.6,445.0,2124.0,2124.0,66,4,1,24.2,0.7,"suspicious, error-handling, cert"
140744,vfs,2002-08-20 02:57:02,69.0,65.0,8867.0,4835.0,14,1794.0,422.0,1533.0,...,1.6,449.0,2131.0,2131.0,66,4,1,24.1,0.7,"suspicious, error-handling, cert"
140745,vfs,2002-07-19 11:54:15,69.0,65.0,8771.0,4761.0,14,1778.0,421.0,1513.0,...,1.6,442.0,2104.0,2104.0,66,4,1,24.1,0.8,
140746,vfs,2002-07-18 16:47:24,69.0,65.0,8771.0,4761.0,14,1778.0,421.0,1513.0,...,1.6,442.0,2104.0,2104.0,66,4,1,24.1,0.8,


## Select only rows for which tags are filled

In [22]:
# keep only the analyses that have tags
# .copy() detaches the result from the filtered frame, so the later .loc assignments
# to TAGS do not raise SettingWithCopyWarning
dfm_i = dfm_i[dfm_i['TAGS'].notnull()].copy()
dfm_i

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
0,accumulo,2008-07-07 14:52:05,2108.0,1103.0,263680.0,203873.0,164,105125.0,17295.0,13509.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,"cwe, bad-practice, unused, cert, suspicious, j..."
7,accumulo,2008-07-01 18:16:28,2108.0,1103.0,263702.0,203883.0,164,105150.0,17299.0,13509.0,...,2.5,39454.0,121139.0,121139.0,46445,2410,207,6.2,17.6,"error-handling, cwe, cert"
21,accumulo,2008-06-12 17:11:34,2108.0,1131.0,278798.0,214326.0,164,111432.0,18092.0,15204.0,...,2.6,39465.0,127191.0,127191.0,46581,2416,210,6.6,16.7,design
23,accumulo,2008-06-10 12:05:27,2107.0,1130.0,278644.0,214193.0,164,111350.0,18090.0,15204.0,...,2.6,39428.0,127106.0,127106.0,46581,2416,210,6.6,16.7,convention
25,accumulo,2008-06-10 11:35:56,2107.0,1130.0,278644.0,214194.0,164,111349.0,18091.0,15204.0,...,2.6,39428.0,127107.0,127107.0,46581,2416,210,6.6,16.7,convention
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140737,vfs,2002-08-21 01:40:45,73.0,69.0,9169.0,5005.0,16,1837.0,440.0,1563.0,...,1.6,457.0,2185.0,2185.0,66,4,1,23.8,0.7,"error-handling, cwe, code-smell, cert, antipat..."
140738,vfs,2002-08-21 01:39:47,72.0,68.0,9043.0,4915.0,15,1803.0,436.0,1553.0,...,1.6,446.0,2146.0,2146.0,66,4,1,24.0,0.7,cwe
140743,vfs,2002-08-20 06:10:50,69.0,65.0,8859.0,4814.0,14,1787.0,425.0,1536.0,...,1.6,445.0,2124.0,2124.0,66,4,1,24.2,0.7,"suspicious, error-handling, cert"
140744,vfs,2002-08-20 02:57:02,69.0,65.0,8867.0,4835.0,14,1794.0,422.0,1533.0,...,1.6,449.0,2131.0,2131.0,66,4,1,24.1,0.7,"suspicious, error-handling, cert"


In [23]:
# number of distinct projects that still have tagged analyses
projects_with_tags = dfm_i['PROJECT_ID'].unique()
len(projects_with_tags)

38

Tags exist for only 38 of the 39 projects.

## Cleaning up tags
Analysis has shown that the tags are very unevenly distributed and that some tags describe several issues rather than one specific issue. To keep the label consistent, tags that rarely appear and tags that describe multiple issues are removed.

In [24]:
# flatten every comma-separated tag string into one list of individual tags,
# skipping missing values
all_categories = [
    tag.strip()
    for tag_string in dfm_i['TAGS']
    if not pd.isna(tag_string)
    for tag in tag_string.split(',')
]

# tally how often each tag occurs
category_counts = Counter(all_categories)
category_counts

Counter({'cert': 8207,
         'error-handling': 6544,
         'convention': 6099,
         'cwe': 5172,
         'suspicious': 4767,
         'design': 4757,
         'pitfall': 4455,
         'brain-overload': 4329,
         'unused': 3787,
         'misra': 3411,
         'bad-practice': 3181,
         'code-smell': 2869,
         'antipattern': 2869,
         'clumsy': 2815,
         'redundant': 1235,
         'performance': 1098,
         'style': 616,
         'obsolete': 608,
         'confusing': 523,
         'java8': 460,
         'psr2': 219,
         'lock-in': 75,
         '': 62,
         'html5': 52,
         'serialization': 47,
         'unpredictable': 44,
         'multi-threading': 41,
         'browser-compatibility': 39,
         'format': 32,
         'sql': 6,
         'cross-browser': 1,
         'user-experience': 1})

Tags that are built-in are used ([Built-In Rule Tags SonarQube](https://docs.sonarsource.com/sonarqube-server/latest/user-guide/rules/built-in-rule-tags/)). <br>
Some of these tags describe a broad set of issues. They are excluded because the focus lies on concrete issues. The tags of this kind present here are: 'cert', 'cwe', 'misra', 'psr2' and 'code-smell'. <br>
Some tags are not properly explained by SonarQube and have to be taken literally ([Rule Tags Issue in SonarQube Forum](https://community.sonarsource.com/t/sonarqube-rule-tags/1251)). These are used if they are understandable enough. <br>
The 'style' tag, for example, covers multiple rules that at times are very close to 'convention' or are applied inconsistently ([Style Tag Issue in SonarQube Forum](https://community.sonarsource.com/t/style-tag-applied-inconsistently/37939)).<br>
Tags that occur fewer than 100 times are considered the tail of the distribution and are removed. <br>
After this selection, the remaining tags are: 'error-handling', 'convention', 'suspicious', 'pitfall', 'design', 'brain-overload', 'unused', 'bad-practice', 'clumsy', 'antipattern', 'redundant', 'performance', 'obsolete' and 'confusing'.

In [25]:
# tags that are kept as labels
valid_tags = {
    'error-handling', 'convention', 'suspicious', 'pitfall', 'design',
    'brain-overload', 'unused', 'bad-practice', 'clumsy', 'antipattern',
    'redundant', 'performance', 'obsolete', 'confusing',
}

# Split each comma-separated tag string into a list and keep only the valid tags,
# preserving their original order. Each tag is stripped once before the lookup.
# assign() returns a new frame instead of writing into a filtered slice, which avoids
# SettingWithCopyWarning.
dfm_i = dfm_i.assign(
    TAGS=dfm_i['TAGS'].apply(
        lambda tag_string: [
            tag
            for tag in (part.strip() for part in tag_string.split(','))
            if tag in valid_tags
        ]
    )
)

dfm_i

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
0,accumulo,2008-07-07 14:52:05,2108.0,1103.0,263680.0,203873.0,164,105125.0,17295.0,13509.0,...,2.5,39453.0,121105.0,121105.0,46445,2410,207,6.2,17.6,"[bad-practice, unused, suspicious, confusing, ..."
7,accumulo,2008-07-01 18:16:28,2108.0,1103.0,263702.0,203883.0,164,105150.0,17299.0,13509.0,...,2.5,39454.0,121139.0,121139.0,46445,2410,207,6.2,17.6,[error-handling]
21,accumulo,2008-06-12 17:11:34,2108.0,1131.0,278798.0,214326.0,164,111432.0,18092.0,15204.0,...,2.6,39465.0,127191.0,127191.0,46581,2416,210,6.6,16.7,[design]
23,accumulo,2008-06-10 12:05:27,2107.0,1130.0,278644.0,214193.0,164,111350.0,18090.0,15204.0,...,2.6,39428.0,127106.0,127106.0,46581,2416,210,6.6,16.7,[convention]
25,accumulo,2008-06-10 11:35:56,2107.0,1130.0,278644.0,214194.0,164,111349.0,18091.0,15204.0,...,2.6,39428.0,127107.0,127107.0,46581,2416,210,6.6,16.7,[convention]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140737,vfs,2002-08-21 01:40:45,73.0,69.0,9169.0,5005.0,16,1837.0,440.0,1563.0,...,1.6,457.0,2185.0,2185.0,66,4,1,23.8,0.7,"[error-handling, antipattern]"
140738,vfs,2002-08-21 01:39:47,72.0,68.0,9043.0,4915.0,15,1803.0,436.0,1553.0,...,1.6,446.0,2146.0,2146.0,66,4,1,24.0,0.7,[]
140743,vfs,2002-08-20 06:10:50,69.0,65.0,8859.0,4814.0,14,1787.0,425.0,1536.0,...,1.6,445.0,2124.0,2124.0,66,4,1,24.2,0.7,"[suspicious, error-handling]"
140744,vfs,2002-08-20 02:57:02,69.0,65.0,8867.0,4835.0,14,1794.0,422.0,1533.0,...,1.6,449.0,2131.0,2131.0,66,4,1,24.1,0.7,"[suspicious, error-handling]"


In [26]:
# show the rows whose tag list is empty ([])
has_no_tags = dfm_i['TAGS'].apply(len) == 0
dfm_i[has_no_tags]

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
71,accumulo,2008-05-13 22:21:22,2111.0,1135.0,288668.0,209335.0,164,111635.0,18132.0,15332.0,...,2.5,39513.0,124308.0,124308.0,46443,2422,209,6.8,16.1,[]
1556,accumulo,2012-01-19 12:18:19,4114.0,1284.0,409278.0,303506.0,176,156785.0,27822.0,21390.0,...,2.5,54500.0,179581.0,179581.0,73618,3255,226,6.6,18.0,[]
1570,accumulo,2012-01-17 22:35:08,4114.0,1284.0,409289.0,303517.0,176,156794.0,27822.0,21390.0,...,2.5,54506.0,179590.0,179590.0,73618,3255,226,6.6,18.0,[]
1999,accumulo,2011-10-11 15:00:15,4147.0,1296.0,415080.0,308336.0,177,159637.0,28122.0,21630.0,...,2.5,55519.0,182844.0,182844.0,76159,3227,228,6.6,18.3,[]
2368,accumulo,2011-09-03 21:22:56,4274.0,1365.0,429606.0,319073.0,186,165099.0,28915.0,22393.0,...,2.5,57452.0,189042.0,189042.0,78851,3335,241,6.6,18.4,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140648,vfs,2002-11-23 00:11:53,124.0,111.0,19568.0,7811.0,21,2607.0,746.0,2342.0,...,1.6,716.0,3198.0,3198.0,122,6,3,23.1,0.6,[]
140661,vfs,2002-11-17 03:39:21,119.0,106.0,18862.0,7614.0,21,2602.0,725.0,2246.0,...,1.6,707.0,3178.0,3178.0,122,6,3,22.8,0.6,[]
140669,vfs,2002-11-01 03:24:03,115.0,102.0,18156.0,7336.0,21,2524.0,692.0,2176.0,...,1.6,684.0,3078.0,3078.0,122,6,3,22.9,0.7,[]
140716,vfs,2002-10-21 02:52:41,91.0,84.0,11148.0,6189.0,19,2176.0,560.0,1824.0,...,1.7,602.0,2617.0,2617.0,150,6,3,22.8,1.3,[]


In [27]:
# keep only the rows whose tag list is non-empty (an empty list is falsy)
# .copy() detaches the result from the filtered frame, so the later .loc assignment
# to TAGS does not raise SettingWithCopyWarning
dfm_i = dfm_i[dfm_i['TAGS'].apply(bool)].copy()

In [28]:
# number of distinct projects that still have rows after the tag filtering
remaining_projects = dfm_i['PROJECT_ID'].unique()
len(remaining_projects)

38

Only 38 of the 39 projects have analyses with at least one valid tag.

In [29]:
# turn each tag list back into one comma-separated string
tag_strings = dfm_i['TAGS'].apply(', '.join)
dfm_i.loc[:, 'TAGS'] = tag_strings

In [30]:
# Write the combined measures and tags to CSV; index=False omits the row index.
# NOTE: data_dir was last set to the Sonar_Issues folder in the data import cell,
# so the file is written there.
dfm_i.to_csv(os.path.join(data_dir, 'measures+tags.csv'), index = False)