# Variable selection based on missingness
The goal of this analysis is to determine which variables are useful for a model, based on the number of missing values they contain. Variables that have too many missing values to be useful are dropped.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
# Locate the project data folder relative to the notebook's working directory
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', '..', 'Data', 'Sonar_Measures')

# Read the SonarQube measures CSV; low_memory=False makes pandas parse the file
# in a single pass instead of in chunks, which avoids mixed-dtype inference
measures_csv = os.path.join(data_dir, 'sonar_measures_v1_v2.csv')
df = pd.read_csv(measures_csv, low_memory=False)

# Show a (truncated) preview of the loaded frame
df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,,,,,,,,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144635,,org.apache:zookeeper,2014-07-25 16:23:40,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144636,,org.apache:zookeeper,2014-07-24 23:31:57,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144637,,org.apache:zookeeper,2014-07-24 23:16:09,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144638,,org.apache:zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,


## Missing Values
The SonarQube measures table in the version 2 database contains many more variables than the one in version 1. However, many of these extra variables have a large number of missing values. This needs to be evaluated to see whether these variables are usable for a model at all.

In [3]:
# Count the missing (null) entries in every column of the raw frame
missing_values = df.isnull().sum()

# Lift the row-display limit only while printing so that the count for every
# column is shown. option_context restores the previous setting on exit, even
# if printing raises, so the option cannot leak into later cells.
with pd.option_context('display.max_rows', None):
    print("Missing Values per Column:")
    print(missing_values)

Missing Values per Column:
COMMIT_HASH                                                     66711
PROJECT_ID                                                          0
SQ_ANALYSIS_DATE                                                    0
CLASSES                                                            28
FILES                                                              26
FUNCTIONS                                                          28
COMMENT_LINES                                                      26
COMMENT_LINES_DENSITY                                              26
COMPLEXITY                                                         28
FILE_COMPLEXITY                                                    28
CLASS_COMPLEXITY                                                   28
FUNCTION_COMPLEXITY                                                28
FUNCTION_COMPLEXITY_DISTRIBUTION                                10474
FILE_COMPLEXITY_DISTRIBUTION                                   

There are 244 variables in the dataset. For over half of them, all or almost all values are missing. These variables are therefore not usable for a model and can be removed. As a first step, all variables with more than 144000 missing values (out of roughly 144,600 rows) are removed.

In [4]:
# Identify variables for which the data is completely or almost completely missing:
# a column qualifies when it has more than `threshold` missing entries
threshold = 144000

# Count missing values from the current frame instead of reusing the counts
# computed in an earlier cell. On a clean top-to-bottom run the result is the
# same, but re-running this cell after the drop now selects no columns rather
# than raising a KeyError for columns that were already removed.
missing_counts = df.isnull().sum()
columns_to_drop = missing_counts[missing_counts > threshold].index.tolist()

# drop the identified columns
print(f"Dropping columns: {columns_to_drop}") # Informative print statement
df = df.drop(columns=columns_to_drop)



In [5]:
# Remove the column-display limit so every remaining column is visible in the
# preview below. The limit stays lifted until it is reset in a later cell.
# NOTE(review): in the displayed rows NEW_SQALE_DEBT_RATIO is only filled for
# Version1 and NEW_SQALE_DEBT_RATION only for Version2 - possibly the same
# measure split by a naming typo; confirm before modelling.
pd.options.display.max_columns = None
df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,CLASS_COMPLEXITY,FUNCTION_COMPLEXITY,FUNCTION_COMPLEXITY_DISTRIBUTION,FILE_COMPLEXITY_DISTRIBUTION,COVERAGE,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,DUPLICATED_LINES_DENSITY,VIOLATIONS,BLOCKER_VIOLATIONS,CRITICAL_VIOLATIONS,INFO_VIOLATIONS,FALSE_POSITIVE_ISSUES,CONFIRMED_ISSUES,LAST_COMMIT_DATE,CODE_SMELLS,BUGS,EFFORT_TO_REACH_MAINTAINABILITY_RATING_A,AFFERENT_COUPLINGS,EFFERENT_COUPLINGS,COGNITIVE_COMPLEXITY,LINES,NCLOC,NCLOC_LANGUAGE_DISTRIBUTION,LINES_TO_COVER,LINE_COVERAGE,MAJOR_VIOLATIONS,MINOR_VIOLATIONS,OPEN_ISSUES,SQALE_RATING,NUMBER_OF_CLASSES_AND_INTERFACES,MISSING_PACKAGE_INFO,PACKAGE,STATEMENTS,UNCOVERED_LINES,REOPENED_ISSUES,SQALE_INDEX,DEVELOPMENT_COST,SQALE_DEBT_RATIO,ALERT_STATUS,QUALITY_GATE_DETAILS,QUALITY_PROFILES,NEW_SQALE_DEBT_RATIO,VULNERABILITIES,RELIABILITY_REMEDIATION_EFFORT,RELIABILITY_RATING,SECURITY_REMEDIATION_EFFORT,SECURITY_RATING,WONT_FIX_ISSUES,PACKAGE_DEPENDENCY_CYCLES,database,ANALYSIS_KEY,NEW_SQALE_DEBT_RATION,DIRECTORIES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46445,2410,207,17.6,18314,142,893,80,0,0,1317690000000.0,17012,464,0,0,0,39453.0,263680.0,203873.0,css=311;java=197426;js=5;web=1674;xml=4457,121105.0,0.0,7310,9889,18314,1,0,164,164,105125.0,121105.0,0,212384,6116190,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.000000,838,7322,5,9505,4,0,0,Version1,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46301,2408,205,17.6,18169,142,893,80,0,0,1317750000000.0,16987,344,0,0,0,39453.0,262753.0,202956.0,css=311;java=197426;js=5;web=789;xml=4425,121105.0,0.0,7164,9890,18169,1,0,164,164,105125.0,121105.0,0,212200,6088680,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,0,0,Version1,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46301,2408,205,17.6,18169,142,893,80,0,0,1317750000000.0,16987,344,0,0,0,39453.0,262753.0,202956.0,css=311;java=197426;js=5;web=789;xml=4425,121105.0,0.0,7164,9890,18169,1,0,164,164,105125.0,121105.0,0,212200,6088680,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.222222,838,7081,5,9505,4,0,0,Version1,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46445,2410,207,17.6,18315,142,893,80,0,0,1317760000000.0,17013,464,0,0,0,39453.0,263643.0,203841.0,css=311;java=197426;js=5;web=1674;xml=4425,121105.0,0.0,7310,9890,18315,1,0,164,164,105125.0,121105.0,0,212385,6115230,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.674560,838,7322,5,9505,4,0,0,Version1,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,20.4,2.5,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0=285;5=213;10=234;20=120;30=99;60=39;90=71,0.0,46445,2410,207,17.6,18315,142,893,80,0,0,1317840000000.0,17013,464,0,0,0,39453.0,263639.0,203837.0,css=311;java=197426;js=5;web=1674;xml=4421,121105.0,0.0,7310,9890,18315,1,0,164,164,105125.0,121105.0,0,212385,6115110,3.5,ERROR,"{""level"":""ERROR"",""conditions"":[{""metric"":""bloc...","[{""key"":""css-sonar-way-41536"",""language"":""css""...",0.671668,838,7322,5,9505,4,0,0,Version1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144635,,org.apache:zookeeper,2014-07-25 16:23:40,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,12.3,2.1,"1=2505,2=915,4=253,6=128,8=48,10=25,12=78","0=130,5=93,10=71,20=43,30=47,60=13,90=23",0.0,5555,318,100,6.0,4639,17,501,69,0,0,2014-07-24 23:31:57,4217,32,0,0,0,8157.0,92175.0,61360.0,"java=57607,web=884,xml=2869",37106.0,0.0,1980,2072,4639,1,0,23,23,29060.0,37106.0,0,57239,1840800,3.1,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,390,174,5,9380,4,0,0,Version2,AWMblPr7O48jNFNDCAof,2.93578,28.0
144636,,org.apache:zookeeper,2014-07-24 23:31:57,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,12.3,2.1,"1=2505,2=915,4=253,6=128,8=48,10=25,12=78","0=130,5=93,10=71,20=43,30=47,60=13,90=23",0.0,5555,318,100,6.0,4639,17,501,69,0,0,2014-07-24 23:31:57,4217,32,0,0,0,8157.0,92175.0,61360.0,"java=57607,web=884,xml=2869",37106.0,0.0,1980,2072,4639,1,0,23,23,29060.0,37106.0,0,57239,1840800,3.1,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,390,174,5,9380,4,0,0,Version2,AWMbdJy4pxPbkMlK2_uE,2.93578,28.0
144637,,org.apache:zookeeper,2014-07-24 23:16:09,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,12.3,2.1,"1=2503,2=914,4=253,6=128,8=48,10=25,12=78","0=129,5=93,10=71,20=43,30=47,60=13,90=23",0.0,5507,316,100,6.0,4628,17,497,69,0,0,2014-07-24 22:58:38,4207,32,0,0,0,8154.0,92092.0,61300.0,"java=57547,web=884,xml=2869",37064.0,0.0,1979,2066,4628,1,0,23,23,29026.0,37064.0,0,57033,1839000,3.1,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,389,174,5,9350,4,0,0,Version2,AWMbdFnspxPbkMlK29Ot,0.00000,28.0
144638,,org.apache:zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,12.3,2.1,"1=2503,2=914,4=253,6=128,8=48,10=25,12=78","0=129,5=93,10=71,20=43,30=47,60=13,90=23",0.0,5507,316,100,6.0,4628,17,497,69,0,0,2014-07-24 22:58:38,4207,32,0,0,0,8154.0,92092.0,61300.0,"java=57547,web=884,xml=2869",37064.0,0.0,1979,2066,4628,1,0,23,23,29026.0,37064.0,0,57033,1839000,3.1,ERROR,"{""level"":""ERROR"";""conditions"":[{""metric"":""bloc...","[{""key"":""java-sonar-way-04122"";""language"":""jav...",,389,174,5,9350,4,0,0,Version2,AWMbdBpopxPbkMlK26Vg,0.00000,28.0


The dataframe has been reduced from 244 columns to only 65.

In [6]:
# Put the column-display limit back to the pandas default after the wide preview
pd.reset_option('display.max_columns')

# List the names of the columns that remain after the drop
df.columns

Index(['COMMIT_HASH', 'PROJECT_ID', 'SQ_ANALYSIS_DATE', 'CLASSES', 'FILES',
       'FUNCTIONS', 'COMMENT_LINES', 'COMMENT_LINES_DENSITY', 'COMPLEXITY',
       'FILE_COMPLEXITY', 'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY',
       'FUNCTION_COMPLEXITY_DISTRIBUTION', 'FILE_COMPLEXITY_DISTRIBUTION',
       'COVERAGE', 'DUPLICATED_LINES', 'DUPLICATED_BLOCKS', 'DUPLICATED_FILES',
       'DUPLICATED_LINES_DENSITY', 'VIOLATIONS', 'BLOCKER_VIOLATIONS',
       'CRITICAL_VIOLATIONS', 'INFO_VIOLATIONS', 'FALSE_POSITIVE_ISSUES',
       'CONFIRMED_ISSUES', 'LAST_COMMIT_DATE', 'CODE_SMELLS', 'BUGS',
       'EFFORT_TO_REACH_MAINTAINABILITY_RATING_A', 'AFFERENT_COUPLINGS',
       'EFFERENT_COUPLINGS', 'COGNITIVE_COMPLEXITY', 'LINES', 'NCLOC',
       'NCLOC_LANGUAGE_DISTRIBUTION', 'LINES_TO_COVER', 'LINE_COVERAGE',
       'MAJOR_VIOLATIONS', 'MINOR_VIOLATIONS', 'OPEN_ISSUES', 'SQALE_RATING',
       'NUMBER_OF_CLASSES_AND_INTERFACES', 'MISSING_PACKAGE_INFO', 'PACKAGE',
       'STATEMENTS', 'UNCOVERED_LINE

In [7]:
# Recount the missing (null) entries per column on the reduced frame
missing_values = df.isnull().sum()

# Lift the row-display limit only while printing so that the count for every
# remaining column is shown. option_context restores the previous setting on
# exit, even if printing raises.
with pd.option_context('display.max_rows', None):
    print("Missing Values per Column:")
    print(missing_values)

Missing Values per Column:
COMMIT_HASH                                 66711
PROJECT_ID                                      0
SQ_ANALYSIS_DATE                                0
CLASSES                                        28
FILES                                          26
FUNCTIONS                                      28
COMMENT_LINES                                  26
COMMENT_LINES_DENSITY                          26
COMPLEXITY                                     28
FILE_COMPLEXITY                                28
CLASS_COMPLEXITY                               28
FUNCTION_COMPLEXITY                            28
FUNCTION_COMPLEXITY_DISTRIBUTION            10474
FILE_COMPLEXITY_DISTRIBUTION                   28
COVERAGE                                       34
DUPLICATED_LINES                                0
DUPLICATED_BLOCKS                               0
DUPLICATED_FILES                                0
DUPLICATED_LINES_DENSITY                       26
VIOLATIONS             

In [9]:
# Write the reduced dataset into the same data folder the input was read from;
# index=False keeps the row index out of the CSV
usable_vars_csv = os.path.join(data_dir, 'sonar_measures_v1_v2_usable_vars.csv')
df.to_csv(usable_vars_csv, index=False)