# Understanding how to use the SonarQube language distribution metric (NCLOC_LANGUAGE_DISTRIBUTION)

In [1]:
import pandas as pd
import os

In [2]:
# data import: locate the project data folder relative to the working directory
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', '..', 'Data', 'Sonar_Measures')

# read the cleaned SonarQube measure data (versions 1 and 2)
measures_csv = os.path.join(data_dir, 'sonar_measures_v1_v2_no_statics.csv')
df = pd.read_csv(measures_csv, low_memory=False)

df

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,CLASS_COMPLEXITY,...,QUALITY_GATE_DETAILS,QUALITY_PROFILES,NEW_SQALE_DEBT_RATIO,VULNERABILITIES,RELIABILITY_REMEDIATION_EFFORT,RELIABILITY_RATING,SECURITY_REMEDIATION_EFFORT,SECURITY_RATING,database,DIRECTORIES
0,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,20.4,...,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.000000,838,7322,5,9505,4,Version1,
1,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,Version1,
2,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,Version1,
3,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.674560,838,7322,5,9505,4,Version1,
4,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.671668,838,7322,5,9505,4,Version1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,10.0,...,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.025316,1,0,1,30,4,Version2,14.0
140744,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,10.0,...,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.184236,1,0,1,30,4,Version2,14.0
140745,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,10.0,...,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.613333,1,0,1,30,4,Version2,14.0
140746,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,10.0,...,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",2.613333,1,0,1,30,4,Version2,14.0


## Finding out all unique languages occurring in the dataset

In [3]:
# collect every language name that occurs in any distribution string
unique_languages = set()

for row in df['NCLOC_LANGUAGE_DISTRIBUTION'].dropna():
    # some rows separate their entries with ',' instead of ';' -> normalise first,
    # otherwise only the first language of such a row would be recorded
    for pair in row.replace(',', ';').split(';'):
        # only well-formed entries ('css=92') carry a language name
        if '=' in pair:
            # strip stray whitespace so ' java' and 'java' count as one language
            unique_languages.add(pair.split('=')[0].strip())

print(f"Unique languages used in the projects: {unique_languages}")

Unique languages used in the projects: {'css', 'java', 'scss', 'less', 'web', 'js', 'php', 'xml'}


## Creating columns that contain the number of lines of code for each language

In [4]:
# languages that get their own column (same names as found by the discovery step above)
all_languages = {
    'css', 'java', 'js', 'less',
    'php', 'scss', 'web', 'xml',
}

# function to parse the distribution string into a dictionary
def parse_language_distribution(distribution_string):
    """Parse a distribution string such as 'css=92;java=100' into a dict.

    Parameters
    ----------
    distribution_string : str or missing value
        Entries of the form 'language=lines', separated by ';' or ','.

    Returns
    -------
    dict
        Maps each language name to its amount of code lines as int.
        Empty for missing values and for the empty string.

    Raises
    ------
    ValueError
        If the part after '=' of an entry is not an integer.
    """
    # for missing values return empty dict
    if pd.isna(distribution_string) or distribution_string == '':
        return {}

    # initialise dict
    language_data = {}

    # replace , with ; to catch rows where languages are split by , instead of ;
    distribution_string = distribution_string.replace(',', ';')

    # extract language and amount of code lines (value) for each entry
    for pair in distribution_string.split(';'):
        # partition splits at the first '=' only, so an entry with a second '='
        # fails in int() with a readable message instead of an unpacking error
        language, separator, value = pair.partition('=')
        if not separator:
            # fragments without '=' (e.g. after a trailing separator) hold no data
            continue
        # strip whitespace: 'css=92, java=100' must yield the key 'java', not ' java',
        # otherwise the value would never reach the 'java' column
        language_data[language.strip()] = int(value.strip())
    return language_data

# apply parsing function to distribution column; the result is kept as a standalone
# Series so no temporary helper column has to be added to (and dropped from) df
parsed_languages = df['NCLOC_LANGUAGE_DISTRIBUTION'].apply(parse_language_distribution)

# create new columns for each language and fill them with the according values
# set 0 if a language is not present in the distribution
# sorted(): all_languages is a set of strings, whose iteration order can differ
# between kernel sessions and would reshuffle the new columns from run to run
for lang in sorted(all_languages):
    # lang=lang binds the current language to the lambda explicitly
    df[lang] = parsed_languages.apply(lambda counts, lang=lang: counts.get(lang, 0))

df

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,CLASS_COMPLEXITY,...,database,DIRECTORIES,css,scss,java,less,web,js,php,xml
0,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,20.4,...,Version1,,311,0,197426,0,1674,5,0,4457
1,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,Version1,,311,0,197426,0,789,5,0,4425
2,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,Version1,,311,0,197426,0,789,5,0,4425
3,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,Version1,,311,0,197426,0,1674,5,0,4425
4,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,...,Version1,,311,0,197426,0,1674,5,0,4421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,10.0,...,Version2,14.0,0,0,4814,0,0,0,0,0
140744,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,10.0,...,Version2,14.0,0,0,4835,0,0,0,0,0
140745,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,10.0,...,Version2,14.0,0,0,4761,0,0,0,0,0
140746,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,10.0,...,Version2,14.0,0,0,4761,0,0,0,0,0


## Are the languages filled properly for missing distributions?

In [9]:
# rows without a distribution string should show 0 in every language column
missing_distribution = df['NCLOC_LANGUAGE_DISTRIBUTION'].isna()
columns_to_check = ['NCLOC_LANGUAGE_DISTRIBUTION', 'less', 'css', 'scss', 'xml', 'php', 'js', 'web', 'java']
df.loc[missing_distribution, columns_to_check]

Unnamed: 0,NCLOC_LANGUAGE_DISTRIBUTION,less,css,scss,xml,php,js,web,java
83596,,0,0,0,0,0,0,0,0
103801,,0,0,0,0,0,0,0,0
104417,,0,0,0,0,0,0,0,0
104418,,0,0,0,0,0,0,0,0
107114,,0,0,0,0,0,0,0,0
107115,,0,0,0,0,0,0,0,0
110583,,0,0,0,0,0,0,0,0
110584,,0,0,0,0,0,0,0,0
118791,,0,0,0,0,0,0,0,0
118792,,0,0,0,0,0,0,0,0


In [6]:
# write the frame with the added language columns into the data folder, without the index
output_path = os.path.join(data_dir, 'sonar_measures_v1_v2_cleaned+ncloc_distr.csv')
df.to_csv(output_path, index=False)