# Variable selection based on missingness
The goal of this analysis is to determine which variables are usable for a model, based on the number of missing values they contain. Variables with too many missing values to be useful are dropped.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
# Load the de-duplicated SonarQube measures.
# The data folder is resolved relative to the notebook's working directory
# (two levels up, then Data/Sonar_Measures).
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', '..', 'Data', 'Sonar_Measures')

# low_memory=False: let pandas parse the file in one pass instead of in
# internal chunks (per the pandas docs this avoids mixed-type inference).
measures_csv = os.path.join(data_dir, 'sonar_measures_v1_v2_no_duplicates.csv')
df = pd.read_csv(measures_csv, low_memory=False)

df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,,,,,,,,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,...,,,,,,,,,,
140744,,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,...,,,,,,,,,,
140745,,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,,,,,,,,,,
140746,,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,,,,,,,,,,


## Missing Values
When looking at the SonarQube measures table in the version 2 database, it appears that there are many more variables than in version 1. Furthermore, these extra variables have a lot of missing values. This needs to be evaluated to see whether these variables are usable for a model at all.

In [3]:
# Count the missing values of every column.
missing_values = df.isnull().sum()

# option_context lifts the row limit only for this report and restores the
# previous setting afterwards, even if printing raises.
with pd.option_context('display.max_rows', None):
    print("Missing Values per Column:")
    print(missing_values)

Missing Values per Column:
COMMIT_HASH                                                     63079
PROJECT_ID                                                          0
SQ_ANALYSIS_DATE                                                    0
CLASSES                                                            20
FILES                                                              18
FUNCTIONS                                                          20
COMMENT_LINES                                                      18
COMMENT_LINES_DENSITY                                              18
COMPLEXITY                                                         20
FILE_COMPLEXITY                                                    20
CLASS_COMPLEXITY                                                   20
FUNCTION_COMPLEXITY                                                20
FUNCTION_COMPLEXITY_DISTRIBUTION                                 9875
FILE_COMPLEXITY_DISTRIBUTION                                   

There are 244 variables in the dataset. For over half of the variables, all or almost all values are missing. These variables are therefore not usable for a model and can be removed. First, all variables with more than 140000 missing values (out of 140747 rows) are removed.

In [4]:
# Columns with more than `threshold` missing values are treated as completely
# or almost completely empty.
threshold = 140000

# Count the missing values on the current dataframe rather than relying on a
# series computed in an earlier cell. This keeps the cell correct when it is
# re-run: columns that were already dropped are no longer selected, so
# drop() does not raise a KeyError.
missing_counts = df.isnull().sum()
columns_to_drop = missing_counts[missing_counts > threshold].index.tolist()

# drop the identified columns
print(f"Dropping columns: {columns_to_drop}")
df = df.drop(columns=columns_to_drop)



In [5]:
# Lift the column limit so the dataframe is rendered with every remaining
# column instead of an elided middle section.
pd.options.display.max_columns = None
df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,CLASS_COMPLEXITY,FUNCTION_COMPLEXITY,FUNCTION_COMPLEXITY_DISTRIBUTION,FILE_COMPLEXITY_DISTRIBUTION,COVERAGE,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,DUPLICATED_LINES_DENSITY,VIOLATIONS,BLOCKER_VIOLATIONS,CRITICAL_VIOLATIONS,INFO_VIOLATIONS,FALSE_POSITIVE_ISSUES,CONFIRMED_ISSUES,LAST_COMMIT_DATE,CODE_SMELLS,BUGS,EFFORT_TO_REACH_MAINTAINABILITY_RATING_A,AFFERENT_COUPLINGS,EFFERENT_COUPLINGS,COGNITIVE_COMPLEXITY,LINES,NCLOC,NCLOC_LANGUAGE_DISTRIBUTION,LINES_TO_COVER,LINE_COVERAGE,MAJOR_VIOLATIONS,MINOR_VIOLATIONS,OPEN_ISSUES,SQALE_RATING,NUMBER_OF_CLASSES_AND_INTERFACES,MISSING_PACKAGE_INFO,PACKAGE,STATEMENTS,UNCOVERED_LINES,REOPENED_ISSUES,SQALE_INDEX,DEVELOPMENT_COST,SQALE_DEBT_RATIO,ALERT_STATUS,QUALITY_GATE_DETAILS,QUALITY_PROFILES,NEW_SQALE_DEBT_RATIO,VULNERABILITIES,RELIABILITY_REMEDIATION_EFFORT,RELIABILITY_RATING,SECURITY_REMEDIATION_EFFORT,SECURITY_RATING,WONT_FIX_ISSUES,PACKAGE_DEPENDENCY_CYCLES,database,ANALYSIS_KEY,NEW_SQALE_DEBT_RATION,DIRECTORIES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46445,2410,207,17.6,18314,142,893,80,0,0,1317690000000.0,17012,464,0,0,0,39453.0,263680.0,203873.0,css=311;java=197426;js=5;web=1674;xml=4457,121105.0,0.0,7310,9889,18314,1,0,164,164,105125.0,121105.0,0,212384,6116190,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.000000,838,7322,5,9505,4,0,0,Version1,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46301,2408,205,17.6,18169,142,893,80,0,0,1317750000000.0,16987,344,0,0,0,39453.0,262753.0,202956.0,css=311;java=197426;js=5;web=789;xml=4425,121105.0,0.0,7164,9890,18169,1,0,164,164,105125.0,121105.0,0,212200,6088680,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,0,0,Version1,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46301,2408,205,17.6,18169,142,893,80,0,0,1317750000000.0,16987,344,0,0,0,39453.0,262753.0,202956.0,css=311;java=197426;js=5;web=789;xml=4425,121105.0,0.0,7164,9890,18169,1,0,164,164,105125.0,121105.0,0,212200,6088680,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,0,0,Version1,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46445,2410,207,17.6,18315,142,893,80,0,0,1317760000000.0,17013,464,0,0,0,39453.0,263643.0,203841.0,css=311;java=197426;js=5;web=1674;xml=4425,121105.0,0.0,7310,9890,18315,1,0,164,164,105125.0,121105.0,0,212385,6115230,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.674560,838,7322,5,9505,4,0,0,Version1,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46445,2410,207,17.6,18315,142,893,80,0,0,1317840000000.0,17013,464,0,0,0,39453.0,263639.0,203837.0,css=311;java=197426;js=5;web=1674;xml=4421,121105.0,0.0,7310,9890,18315,1,0,164,164,105125.0,121105.0,0,212385,6115110,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.671668,838,7322,5,9505,4,0,0,Version1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,10.0,1.6,"1=249,2=65,4=21,6=12,8=3,10=1,12=5","0=39,5=11,10=6,20=3,30=4,60=1,90=1",0.0,66,4,1,0.7,269,0,20,33,0,0,2002-08-20 06:10:50,268,0,0,0,0,445.0,8859.0,4814.0,java=4814,2124.0,0.0,105,111,269,1,0,14,14,1787.0,2124.0,0,4293,144420,3.0,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,1,0,1,30,4,0,0,Version2,AV4Z-nYtG0XxMwG_Vh5b,2.025316,14.0
140744,,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,10.0,1.6,"1=248,2=66,4=21,6=12,8=3,10=1,12=5","0=39,5=11,10=6,20=2,30=5,60=1,90=1",0.0,66,4,1,0.7,270,0,19,33,0,0,2002-08-20 02:57:02,269,0,0,0,0,449.0,8867.0,4835.0,java=4835,2131.0,0.0,105,113,270,1,0,14,14,1794.0,2131.0,0,4290,145050,3.0,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,1,0,1,30,4,0,0,Version2,AV4Z-jElG0XxMwG_Vh5F,2.184236,14.0
140745,,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,10.0,1.6,"1=248,2=65,4=21,6=12,8=3,10=1,12=5","0=40,5=10,10=6,20=2,30=5,60=1,90=1",0.0,66,4,1,0.8,267,0,17,34,0,0,2002-07-17 10:23:44,266,0,0,0,0,442.0,8771.0,4761.0,java=4761,2104.0,0.0,105,111,267,1,0,14,14,1778.0,2104.0,0,4175,142830,2.9,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,1,0,1,30,4,0,0,Version2,AV4Z-fA0G0XxMwG_Vh47,2.613333,14.0
140746,,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,10.0,1.6,"1=248,2=65,4=21,6=12,8=3,10=1,12=5","0=40,5=10,10=6,20=2,30=5,60=1,90=1",0.0,66,4,1,0.8,267,0,17,34,0,0,2002-07-17 10:23:44,266,0,0,0,0,442.0,8771.0,4761.0,java=4761,2104.0,0.0,105,111,267,1,0,14,14,1778.0,2104.0,0,4175,142830,2.9,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,1,0,1,30,4,0,0,Version2,AV4Z-bDZG0XxMwG_Vh4x,2.613333,14.0


The dataframe has been reduced from 244 columns to only 65.

In [6]:
# Restore the default column limit, then list the remaining column names.
pd.reset_option('display.max_columns')
df.columns

Index(['COMMIT_HASH', 'PROJECT_ID', 'SQ_ANALYSIS_DATE', 'CLASSES', 'FILES',
       'FUNCTIONS', 'COMMENT_LINES', 'COMMENT_LINES_DENSITY', 'COMPLEXITY',
       'FILE_COMPLEXITY', 'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY',
       'FUNCTION_COMPLEXITY_DISTRIBUTION', 'FILE_COMPLEXITY_DISTRIBUTION',
       'COVERAGE', 'DUPLICATED_LINES', 'DUPLICATED_BLOCKS', 'DUPLICATED_FILES',
       'DUPLICATED_LINES_DENSITY', 'VIOLATIONS', 'BLOCKER_VIOLATIONS',
       'CRITICAL_VIOLATIONS', 'INFO_VIOLATIONS', 'FALSE_POSITIVE_ISSUES',
       'CONFIRMED_ISSUES', 'LAST_COMMIT_DATE', 'CODE_SMELLS', 'BUGS',
       'EFFORT_TO_REACH_MAINTAINABILITY_RATING_A', 'AFFERENT_COUPLINGS',
       'EFFERENT_COUPLINGS', 'COGNITIVE_COMPLEXITY', 'LINES', 'NCLOC',
       'NCLOC_LANGUAGE_DISTRIBUTION', 'LINES_TO_COVER', 'LINE_COVERAGE',
       'MAJOR_VIOLATIONS', 'MINOR_VIOLATIONS', 'OPEN_ISSUES', 'SQALE_RATING',
       'NUMBER_OF_CLASSES_AND_INTERFACES', 'MISSING_PACKAGE_INFO', 'PACKAGE',
       'STATEMENTS', 'UNCOVERED_LINE

In [7]:
# Recount the missing values on the reduced dataframe.
missing_values = df.isnull().sum()

# Show every row of the report; option_context restores the previous row
# limit on exit, even if printing raises.
with pd.option_context('display.max_rows', None):
    print("Missing Values per Column:")
    print(missing_values)

Missing Values per Column:
COMMIT_HASH                                 63079
PROJECT_ID                                      0
SQ_ANALYSIS_DATE                                0
CLASSES                                        20
FILES                                          18
FUNCTIONS                                      20
COMMENT_LINES                                  18
COMMENT_LINES_DENSITY                          18
COMPLEXITY                                     20
FILE_COMPLEXITY                                20
CLASS_COMPLEXITY                               20
FUNCTION_COMPLEXITY                            20
FUNCTION_COMPLEXITY_DISTRIBUTION             9875
FILE_COMPLEXITY_DISTRIBUTION                   20
COVERAGE                                       26
DUPLICATED_LINES                                0
DUPLICATED_BLOCKS                               0
DUPLICATED_FILES                                0
DUPLICATED_LINES_DENSITY                       18
VIOLATIONS             

## NEW_SQALE_DEBT_RATIO(N)
A lot of missing values are contained in the two variables NEW_SQALE_DEBT_RATIO and NEW_SQALE_DEBT_RATION. However, this is just a naming mistake between the two database versions, as the sum of their missing values is exactly the number of analyses. The two columns are therefore merged so that all values are stored in NEW_SQALE_DEBT_RATIO.

In [8]:
# NEW_SQALE_DEBT_RATION is the misspelled counterpart of NEW_SQALE_DEBT_RATIO:
# copy its values into the gaps of the correctly named column, then remove it.
# The membership check keeps the cell re-runnable; without it a second run
# raises a KeyError because the misspelled column is already gone.
if 'NEW_SQALE_DEBT_RATION' in df.columns:
    df['NEW_SQALE_DEBT_RATIO'] = df['NEW_SQALE_DEBT_RATIO'].fillna(df['NEW_SQALE_DEBT_RATION'])
    df = df.drop(columns='NEW_SQALE_DEBT_RATION')

# Recount the missing values after the merge and print the full report;
# option_context restores the previous row limit on exit.
missing_values = df.isnull().sum()
with pd.option_context('display.max_rows', None):
    print("Missing Values per Column:")
    print(missing_values)

Missing Values per Column:
COMMIT_HASH                                 63079
PROJECT_ID                                      0
SQ_ANALYSIS_DATE                                0
CLASSES                                        20
FILES                                          18
FUNCTIONS                                      20
COMMENT_LINES                                  18
COMMENT_LINES_DENSITY                          18
COMPLEXITY                                     20
FILE_COMPLEXITY                                20
CLASS_COMPLEXITY                               20
FUNCTION_COMPLEXITY                            20
FUNCTION_COMPLEXITY_DISTRIBUTION             9875
FILE_COMPLEXITY_DISTRIBUTION                   20
COVERAGE                                       26
DUPLICATED_LINES                                0
DUPLICATED_BLOCKS                               0
DUPLICATED_FILES                                0
DUPLICATED_LINES_DENSITY                       18
VIOLATIONS             

## Identifier Columns
The remaining missing values are mostly in columns that help to identify the analysis across different tables in the database (COMMIT_HASH and ANALYSIS_KEY). Since these aren't metrics that can be used for a model, they are removed.

In [9]:
# COMMIT_HASH and ANALYSIS_KEY are dropped here as identifier columns.
# errors='ignore' keeps the cell re-runnable: a second run, where both
# columns are already gone, is a no-op instead of a KeyError.
df = df.drop(columns=['COMMIT_HASH', 'ANALYSIS_KEY'], errors='ignore')

# Recount the missing values and print the full report; option_context
# restores the previous row limit on exit.
missing_values = df.isnull().sum()
with pd.option_context('display.max_rows', None):
    print("Missing Values per Column:")
    print(missing_values)

Missing Values per Column:
PROJECT_ID                                      0
SQ_ANALYSIS_DATE                                0
CLASSES                                        20
FILES                                          18
FUNCTIONS                                      20
COMMENT_LINES                                  18
COMMENT_LINES_DENSITY                          18
COMPLEXITY                                     20
FILE_COMPLEXITY                                20
CLASS_COMPLEXITY                               20
FUNCTION_COMPLEXITY                            20
FUNCTION_COMPLEXITY_DISTRIBUTION             9875
FILE_COMPLEXITY_DISTRIBUTION                   20
COVERAGE                                       26
DUPLICATED_LINES                                0
DUPLICATED_BLOCKS                               0
DUPLICATED_FILES                                0
DUPLICATED_LINES_DENSITY                       18
VIOLATIONS                                      0
BLOCKER_VIOLATIONS     

## Result
The dataframe has been cleaned of variables that have too many missing values or are not useful for a model.

In [10]:
# Persist the reduced dataset in the same data folder the input was read
# from; index=False leaves the row index out of the CSV.
usable_vars_csv = os.path.join(data_dir, 'sonar_measures_v1_v2_usable_vars.csv')
df.to_csv(usable_vars_csv, index=False)