## 01-Clean

The first step I had to do was to import some libraries and download the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "NASA"

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

DEBUG = False
SEED = 1612

In [2]:
if COLAB:
    from google.colab import drive
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")
        d = "/content/gdrive/MyDrive/datasets"
        if not os.path.isdir(d): os.makedirs(d)
        if not os.path.isdir(ROOT): os.makedirs(ROOT)

def makedirs(d):
    if COLAB:
        if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
    else:
        if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

I then created the variable df which holds the dataframe of the csv dataset

In [3]:
target = f'{ROOT}/orig/pc2.csv'
if os.path.isfile(target):
    print('Loading Local Copy...')
else:
    print('Downloading...')
    df = pd.read_csv("https://setu-datamining2.github.io/live/topics/21-Assignments/01-NASA_Software_Defect_Datasets/files/pc2.csv")
    df.to_csv(target, index=False)

df = pd.read_csv(target)
print(df.shape)
df.head()

Loading Local Copy...
(5589, 37)


Unnamed: 0,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,DESIGN_COMPLEXITY,DESIGN_DENSITY,EDGE_COUNT,ESSENTIAL_COMPLEXITY,ESSENTIAL_DENSITY,LOC_EXECUTABLE,PARAMETER_COUNT,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,MAINTENANCE_SEVERITY,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,defects
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,5.33,1.5,12.0,0.0,4.0,0.67,0.67,8.0,1.0,0.0,0.0,2.0,0.5,1.0,3.0,1.0,3.0,2.0,0.0,0.0,False
1,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,False
2,1.0,4.0,7.0,24.0,0.0,1.0,0.13,0.0,0.0,1.0,1.0,6.0,1.0,0.0,1.0,0.0,17.88,7.43,986.77,0.04,34.0,0.13,54.82,132.83,1.0,0.0,0.0,7.0,0.03,13.0,21.0,7.0,8.0,34.0,96.88,8.0,False
3,1.0,1.0,11.0,3.0,0.0,1.0,0.08,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,42.62,7.81,2598.31,0.11,77.0,0.13,144.35,332.79,1.0,0.0,0.0,3.0,0.06,29.0,48.0,13.0,7.0,17.0,93.33,12.0,False
4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,3.0,33.44,0.63,13.06,0.01,9.0,1.6,0.73,20.9,1.0,0.0,0.0,3.0,0.33,5.0,4.0,4.0,1.0,3.0,0.0,1.0,False


I then decided to do some summary statistics. This means that I wanted to check whether the amount of rows and columns in the dataset is the same amount of rows and columns mentioned in the pdf document

## Summary Statistics

* Matching data in table
    * Number of cases = 5589
    * Number of features = 37

    
* All statistics are in agreement with paper

To do that I used the shape method which can show the amount of rows and columns in a dataset

As seen below, the amount of rows and columns in the dataset match the amount of rows and features mentioned in the pdf doc

In [4]:
print('number of cases: ', df.shape[0])
print('number of features: ', df.shape[1])

number of cases:  5589
number of features:  37


I then created a small dataframe that indicates shows the expected number of rows and columns against the observed number of rows and columns in a more friendly way

As seen below, the number of expected and observed rows and columns is the same and therefore the result is PASS for both of them

In [5]:
table_case_count = lambda df: df.shape[0]
table_feature_count = lambda df: df.shape[1]
messages = []
for m, expected, observed in [
    ('Number of cases', 5589, table_case_count(df)),
    ('Number of features', 37, table_feature_count(df))
]:
    result = '' if expected == observed else 'FAIL'
    messages.append(f'* {m} {expected=} {observed=} {result}')
    
display(Markdown('\n'.join(messages)))

* Number of cases expected=5589 observed=5589 
* Number of features expected=37 observed=37 

In [6]:
messages = []
for m, expected, observed in [
    ('Number of cases', 5589, table_case_count(df)),
    ('Number of features', 37, table_feature_count(df))
]:
    result = 'PASS' if expected == observed else 'FAIL'
    messages.append([m, expected, observed, result])
    
df_results = pd.DataFrame(messages, columns=['Discription', 'Expected', 'Observed', 'Result'])
df_results.head()

Unnamed: 0,Discription,Expected,Observed,Result
0,Number of cases,5589,5589,PASS
1,Number of features,37,37,PASS


I then created the variable features_problems holds the count of features impacted by 1 or more of A-E from the pdf doc. Since features may contain more than one problem this need not be the sum of A to E

In [7]:
features_problems = 0

## Identical Features (Checking)

The first thing I wanted to check was identical features, to do that I created a lambda function that return the sum of identical features in a dataframe and the output is stored in a variable called table_2_a

In [8]:
table_2_a = lambda df: df.T.duplicated().sum()

I then created a small data frame that shows the expected number of identical features against the observed number of identical features. As seen below they are both equal so the result is PASS

In [9]:
messages = []
for m, expected, observed in [
    ('Identical features', 0, table_2_a(df))
]:
    messages.append([m, expected, observed, result])
    
df_results = pd.DataFrame(messages, columns = ['Description', 'Expected', 'Observed', 'Result'])
df_results

Unnamed: 0,Description,Expected,Observed,Result
0,Identical features,0,0,PASS


## Features with missing values (Checking)

I then checked for the number of features that have missing values

The expected number of features that have missing values is 0

To check the features that have missing values in the dataset, I used the function isnull and the function sum to show how many missing values every column have

As seen below there are no missing values in any column

In [10]:
df.isnull().sum()

BRANCH_COUNT                       0
CALL_PAIRS                         0
LOC_CODE_AND_COMMENT               0
LOC_COMMENTS                       0
CONDITION_COUNT                    0
CYCLOMATIC_COMPLEXITY              0
CYCLOMATIC_DENSITY                 0
DECISION_COUNT                     0
DECISION_DENSITY                   0
DESIGN_COMPLEXITY                  0
DESIGN_DENSITY                     0
EDGE_COUNT                         0
ESSENTIAL_COMPLEXITY               0
ESSENTIAL_DENSITY                  0
LOC_EXECUTABLE                     0
PARAMETER_COUNT                    0
HALSTEAD_CONTENT                   0
HALSTEAD_DIFFICULTY                0
HALSTEAD_EFFORT                    0
HALSTEAD_ERROR_EST                 0
HALSTEAD_LENGTH                    0
HALSTEAD_LEVEL                     0
HALSTEAD_PROG_TIME                 0
HALSTEAD_VOLUME                    0
MAINTENANCE_SEVERITY               0
MODIFIED_CONDITION_COUNT           0
MULTIPLE_CONDITION_COUNT           0
N

I also did the same check using the isna function to make sure of the previous result and I got the same

In [11]:
df.isna().sum()

BRANCH_COUNT                       0
CALL_PAIRS                         0
LOC_CODE_AND_COMMENT               0
LOC_COMMENTS                       0
CONDITION_COUNT                    0
CYCLOMATIC_COMPLEXITY              0
CYCLOMATIC_DENSITY                 0
DECISION_COUNT                     0
DECISION_DENSITY                   0
DESIGN_COMPLEXITY                  0
DESIGN_DENSITY                     0
EDGE_COUNT                         0
ESSENTIAL_COMPLEXITY               0
ESSENTIAL_DENSITY                  0
LOC_EXECUTABLE                     0
PARAMETER_COUNT                    0
HALSTEAD_CONTENT                   0
HALSTEAD_DIFFICULTY                0
HALSTEAD_EFFORT                    0
HALSTEAD_ERROR_EST                 0
HALSTEAD_LENGTH                    0
HALSTEAD_LEVEL                     0
HALSTEAD_PROG_TIME                 0
HALSTEAD_VOLUME                    0
MAINTENANCE_SEVERITY               0
MODIFIED_CONDITION_COUNT           0
MULTIPLE_CONDITION_COUNT           0
N

This means that the check for features with missing values gets the result of PASS

## Constant Features (Checking)

I then decided to check for constant features

The expeceted number of constant features 0

To do  this check I created an empty list called feat

I then created a for loop that goes through the names of the columns of the dataset

The for loop then checks every column in the dataset whether it has a value count of 1

This means that it checks whether a column contains only 1 value

If a column contains 1 value then the string 'Constant' is appended to the feat list and if not then the string 'Not Constant' is appended to the feat list

I then print the result of casting the list feat to a set so I can see the unique values of it

As seen below the output shows that the set of feat contains only one value which is 'Not Constant' and therefore it means that there are no constant features in the dataset

This means that the result for this check is PASS

In [12]:
feat = []
for i in df.columns:
    if df[i].value_counts == 1:
        feat.append('Constant')
    else:
        feat.append('Not Constant')

print(set(feat))

{'Not Constant'}


## Implausible Features (Checking)

I then decided to check for implausible features

The expected number of implausible features is 1

The first thing I wanted to check is any features that contain values less than 0

To do that I used a condition and counted the number of matches related to each column for that condition

As seen below, no features contained any values less than 0

In [13]:
# https://stackoverflow.com/questions/23833763/count-number-of-elements-in-each-column-less-than-x

# Checking for values less than 0

df[df < 0].count()

BRANCH_COUNT                       0
CALL_PAIRS                         0
LOC_CODE_AND_COMMENT               0
LOC_COMMENTS                       0
CONDITION_COUNT                    0
CYCLOMATIC_COMPLEXITY              0
CYCLOMATIC_DENSITY                 0
DECISION_COUNT                     0
DECISION_DENSITY                   0
DESIGN_COMPLEXITY                  0
DESIGN_DENSITY                     0
EDGE_COUNT                         0
ESSENTIAL_COMPLEXITY               0
ESSENTIAL_DENSITY                  0
LOC_EXECUTABLE                     0
PARAMETER_COUNT                    0
HALSTEAD_CONTENT                   0
HALSTEAD_DIFFICULTY                0
HALSTEAD_EFFORT                    0
HALSTEAD_ERROR_EST                 0
HALSTEAD_LENGTH                    0
HALSTEAD_LEVEL                     0
HALSTEAD_PROG_TIME                 0
HALSTEAD_VOLUME                    0
MAINTENANCE_SEVERITY               0
MODIFIED_CONDITION_COUNT           0
MULTIPLE_CONDITION_COUNT           0
N

I then decided to check for any features that have names containing '_COUNT' in them and they contain any non integer values

To do that I created an empty dataframe called df2

I then created a for loop that goes through every column name in the original dataframe (df) and adds that column to the df2 dataframe if it contains '_COUNT' in its name

I then created a condition that checks the newly added columns in the df2 dataframe for non integer values and counting the counted the values of True and False produced for every column after checking this condition

As seen below all the columns had False values only

In [14]:
# https://stackoverflow.com/questions/49249860/how-to-check-if-float-pandas-column-contains-only-integer-numbers

# Checking for non integer values

df2 = pd.DataFrame()
for i in df.columns:
    if '_COUNT' in i:
        df2[i] = df[i]

(df2 % 1 != 0).value_counts()

BRANCH_COUNT  CONDITION_COUNT  DECISION_COUNT  EDGE_COUNT  PARAMETER_COUNT  MODIFIED_CONDITION_COUNT  MULTIPLE_CONDITION_COUNT  NODE_COUNT
False         False            False           False       False            False                     False                     False         5589
dtype: int64

I then checked whether the column 'LOC_TOTAL' contains any values that are equal to 0

To do that I used a condition that checks the values that equal to 0 in that column and sum the number of values that matches this condition

As seen below the column does contain values that are equal to 0, this means that this column or feaeture is implausible

Since this is the only implausible feature I encountered, this means that the number of implausible features I found is 1 which is the same number of expected implausible features

This means that the result for this check is PASS

In [15]:
# Checking for LOC_TOTAL values equal to 0

(df['LOC_TOTAL'] == 0).sum()

1084

I also incremented the variable features_problems by 1 because we encountered 1 feature with a problem so far which is the implausible feature mentioned earlier

In [16]:
features_problems += 1

## Implausible Cases (Checking):

I then started checking the cases

The first check I carried out was the implausible cases check

I used the same code I used to check for implausible features and got the same results

In [17]:
# https://stackoverflow.com/questions/23833763/count-number-of-elements-in-each-column-less-than-x

# Checking for values less than 0

df[df < 0].count()

BRANCH_COUNT                       0
CALL_PAIRS                         0
LOC_CODE_AND_COMMENT               0
LOC_COMMENTS                       0
CONDITION_COUNT                    0
CYCLOMATIC_COMPLEXITY              0
CYCLOMATIC_DENSITY                 0
DECISION_COUNT                     0
DECISION_DENSITY                   0
DESIGN_COMPLEXITY                  0
DESIGN_DENSITY                     0
EDGE_COUNT                         0
ESSENTIAL_COMPLEXITY               0
ESSENTIAL_DENSITY                  0
LOC_EXECUTABLE                     0
PARAMETER_COUNT                    0
HALSTEAD_CONTENT                   0
HALSTEAD_DIFFICULTY                0
HALSTEAD_EFFORT                    0
HALSTEAD_ERROR_EST                 0
HALSTEAD_LENGTH                    0
HALSTEAD_LEVEL                     0
HALSTEAD_PROG_TIME                 0
HALSTEAD_VOLUME                    0
MAINTENANCE_SEVERITY               0
MODIFIED_CONDITION_COUNT           0
MULTIPLE_CONDITION_COUNT           0
N

In [18]:
# https://stackoverflow.com/questions/49249860/how-to-check-if-float-pandas-column-contains-only-integer-numbers

# Checking for non integer values

df3 = pd.DataFrame()
for i in df.columns:
    if '_COUNT' in i:
        df3[i] = df[i]

(df3 % 1 != 0).value_counts()

BRANCH_COUNT  CONDITION_COUNT  DECISION_COUNT  EDGE_COUNT  PARAMETER_COUNT  MODIFIED_CONDITION_COUNT  MULTIPLE_CONDITION_COUNT  NODE_COUNT
False         False            False           False       False            False                     False                     False         5589
dtype: int64

In [19]:
# Checking for LOC_TOTAL values equal to 0

(df['LOC_TOTAL'] == 0).sum()

1084

As seen above, the amount of implausible cases is 1084

This is the same number of implausible cases mentioned in the pdf doc

This means that the result of this check is PASS

## Identical Cases (Checking)

I then checked for the number of identical cases

To do that I used the duplicated function and the value_counts function to count the True values generated by the duplicated function

The duplicated cases number were 4183 as seen below which is different from the number mentioned in the pdf doc which is 4621 so the result of this check is NOT PASS

In [20]:
df.duplicated().value_counts()

True     4183
False    1406
dtype: int64

## Inconsistent Cases (Checking)

I then decided to check for the inconsistent cases

To do that I created an empty dataframe with two columns: 'Other features' and 'Target'

The 'Other features' column contains True or False values that refer to whether the values of all the features in the row except for the defects feature  are duplicated

The 'Target' column contains True or False values that refer to whether the values of the target feature only (defects) in a row areduplicated

To do the check, I created a condition that checks any rows that contains True in the 'Other features' column and False in the 'Target' column

This basically means that I am checking for the rows that all their features are duplicated except for the target feature.

I then used the sum function as well to check for the number of cases that meet this condition

As seen below, no cases meet this condition so the number of inconistent cases observed is 0 which is different from the number of inconsistent cases mentioned in the pdf doc which is 106 so the result for this check is NOT PASS

In [21]:
cols = df.columns

df2 = pd.DataFrame()
df2['Other features'] = df.duplicated(subset=cols[:-1])
df2['Target'] = df.duplicated(subset=cols[-1])

df2.head()

Unnamed: 0,Other features,Target
0,False,False
1,False,True
2,False,True
3,False,True
4,False,True


In [22]:
df2[(df2['Other features'] == True) & (df2['Target'] == False)]

Unnamed: 0,Other features,Target


In [23]:
df2[(df2['Other features'] == True) & (df2['Target'] == False)].sum()

Other features    0.0
Target            0.0
dtype: float64

## Cases with missing values (Checking)

I then checked the cases that contain missing values (N/A values)

To do that, I used the same code I used to check for the features with missing values

As seen below, the number of cases with missing values is 0 which is the same number mentioned in the pdf doc so the result of this check is PASS

In [24]:
# https://www.geeksforgeeks.org/count-nan-or-missing-values-in-pandas-dataframe/

df.isnull().sum()

BRANCH_COUNT                       0
CALL_PAIRS                         0
LOC_CODE_AND_COMMENT               0
LOC_COMMENTS                       0
CONDITION_COUNT                    0
CYCLOMATIC_COMPLEXITY              0
CYCLOMATIC_DENSITY                 0
DECISION_COUNT                     0
DECISION_DENSITY                   0
DESIGN_COMPLEXITY                  0
DESIGN_DENSITY                     0
EDGE_COUNT                         0
ESSENTIAL_COMPLEXITY               0
ESSENTIAL_DENSITY                  0
LOC_EXECUTABLE                     0
PARAMETER_COUNT                    0
HALSTEAD_CONTENT                   0
HALSTEAD_DIFFICULTY                0
HALSTEAD_EFFORT                    0
HALSTEAD_ERROR_EST                 0
HALSTEAD_LENGTH                    0
HALSTEAD_LEVEL                     0
HALSTEAD_PROG_TIME                 0
HALSTEAD_VOLUME                    0
MAINTENANCE_SEVERITY               0
MODIFIED_CONDITION_COUNT           0
MULTIPLE_CONDITION_COUNT           0
N

In [25]:
df.isna().sum()

BRANCH_COUNT                       0
CALL_PAIRS                         0
LOC_CODE_AND_COMMENT               0
LOC_COMMENTS                       0
CONDITION_COUNT                    0
CYCLOMATIC_COMPLEXITY              0
CYCLOMATIC_DENSITY                 0
DECISION_COUNT                     0
DECISION_DENSITY                   0
DESIGN_COMPLEXITY                  0
DESIGN_DENSITY                     0
EDGE_COUNT                         0
ESSENTIAL_COMPLEXITY               0
ESSENTIAL_DENSITY                  0
LOC_EXECUTABLE                     0
PARAMETER_COUNT                    0
HALSTEAD_CONTENT                   0
HALSTEAD_DIFFICULTY                0
HALSTEAD_EFFORT                    0
HALSTEAD_ERROR_EST                 0
HALSTEAD_LENGTH                    0
HALSTEAD_LEVEL                     0
HALSTEAD_PROG_TIME                 0
HALSTEAD_VOLUME                    0
MAINTENANCE_SEVERITY               0
MODIFIED_CONDITION_COUNT           0
MULTIPLE_CONDITION_COUNT           0
N

## Conflict Cases Values (Referential integrity checks)  (Checking)

I then decided to check for the cases with conflicting feature values

To do that I ran through the integrity checks and created conditions that would check for what the integrity checks are looking for in the dataframe and I used the sum function to get the total number of cases that pass each integrity check

If the result of the check is 5589 then there are no cases with conflicting feature values in regards of this particular integrity check and the integrity check has passed but if the result is less than 5589 then there might be some cases with conflicting feature values in regards of this integrity check

For Check 1, all cases passed the integrity check

In [26]:
# Check 1:

(df['NUMBER_OF_LINES'] >= df['LOC_TOTAL']).sum()

5589

For Check 2, it was not possible to carry it out since there was no column in the dataset called LOC_BLANK

In [27]:
df.columns

Index(['BRANCH_COUNT', 'CALL_PAIRS', 'LOC_CODE_AND_COMMENT', 'LOC_COMMENTS',
       'CONDITION_COUNT', 'CYCLOMATIC_COMPLEXITY', 'CYCLOMATIC_DENSITY',
       'DECISION_COUNT', 'DECISION_DENSITY', 'DESIGN_COMPLEXITY',
       'DESIGN_DENSITY', 'EDGE_COUNT', 'ESSENTIAL_COMPLEXITY',
       'ESSENTIAL_DENSITY', 'LOC_EXECUTABLE', 'PARAMETER_COUNT',
       'HALSTEAD_CONTENT', 'HALSTEAD_DIFFICULTY', 'HALSTEAD_EFFORT',
       'HALSTEAD_ERROR_EST', 'HALSTEAD_LENGTH', 'HALSTEAD_LEVEL',
       'HALSTEAD_PROG_TIME', 'HALSTEAD_VOLUME', 'MAINTENANCE_SEVERITY',
       'MODIFIED_CONDITION_COUNT', 'MULTIPLE_CONDITION_COUNT', 'NODE_COUNT',
       'NORMALIZED_CYLOMATIC_COMPLEXITY', 'NUM_OPERANDS', 'NUM_OPERATORS',
       'NUM_UNIQUE_OPERANDS', 'NUM_UNIQUE_OPERATORS', 'NUMBER_OF_LINES',
       'PERCENT_COMMENTS', 'LOC_TOTAL', 'defects'],
      dtype='object')

For Check 3, all cases passed the integrity check

In [28]:
# Check 3:

(df['NUMBER_OF_LINES'] >= df['LOC_CODE_AND_COMMENT']).sum()

5589

For Check 4, all cases passed the integrity check

In [29]:
# Check 4:

(df['NUMBER_OF_LINES'] >= df['LOC_COMMENTS']).sum()

5589

For Check 5, all cases passed the integrity check

In [30]:
# Check 5:

(df['NUMBER_OF_LINES'] >= df['LOC_EXECUTABLE']).sum()

5589

For Check 6, all cases passed the integrity check

In [31]:
# Check 6:

(df['LOC_TOTAL'] >= df['LOC_EXECUTABLE']).sum()

5589

For Check 7, all cases passed the integrity check

In [32]:
# Check 7:

(df['LOC_TOTAL'] >= df['LOC_CODE_AND_COMMENT']).sum()

5589

For Check 8, all cases passed the integrity check

In [33]:
# Check 8:

(df['NUM_OPERANDS'] >= df['NUM_UNIQUE_OPERANDS']).sum()

5589

For Check 9, all cases passed the integrity check

In [34]:
# Check 9:

(df['NUM_OPERATORS'] >= df['NUM_UNIQUE_OPERATORS']).sum()

5589

For Check 10, all cases passed the integrity check

In [35]:
# Check 10:

(df['HALSTEAD_LENGTH'] == (df['NUM_OPERATORS'] + df['NUM_OPERANDS'])).sum()

5589

For Check 11, all cases passed the integrity check

In [36]:
# Check 11:

(df['CYCLOMATIC_COMPLEXITY'] <= (df['NUM_OPERATORS']+1)).sum()

5589

For Check 12, the number returned was less than 5589 which meant that there are cases with conflicting feature values

To figure out their amount I subtracted the returned number from 5589 and got 129

This meant that there are 129 cases with conflicting feature values in relation to this particular integrity check

In [37]:
# Check 12:

(df['CALL_PAIRS'] <= df['NUM_OPERATORS']).sum()

5460

In [38]:
5589 - (df['CALL_PAIRS'] <= df['NUM_OPERATORS']).sum()

129

The rest of the checks were more complicated because they contained log2 and division which produced numbers with decimal places

This was problematic because it was preducing inacurate check results

To counter that issue I had to do 4 steps:
    
1. I first created an empty list called listy wwhich will hold the result of carrying out the integrity check on each case, in this scenario the results of the checks can be 'Equals' or 'Not equals'

2. I first checked whether the number in the log2 function or the number that is being devided on is not equal to 0, if the number is not equal to 0 then the rest of check will commence, if it is equal to 0 then I check if the expected number that we are checking for is equal to 0, if yes then we append Equals to listy, if no then we append No equals to listy

3. If the number in the log2 function or the number we are dividing by is not 0 then we check whether the observed number is in the range of the expected number -1 and +1 (is the observed number  greater or equal to the expceted number minus 1 and is it less than or equal to the expected number plus 1)

4. I then convert print listy as a set to check the unique values of it, if it only contains Equals then all the cases passed the integrity check, if it contains Equals and Not equals then there are cases that didn't pass the integrity check, If there are such cases then I check how many are they

To do all these checks I had to use a for loop that goes through every row by index and does these checks on it

For Check 13, all cases passed the integrity check

In [39]:
# Check 13

listy = []
for i in range(5589):
    if df['NUM_UNIQUE_OPERATORS'][i] != 0 or df['NUM_UNIQUE_OPERANDS'][i] != 0:
        if df['HALSTEAD_VOLUME'][i] >= (df['NUM_OPERATORS'][i] + df['NUM_OPERANDS'][i]) * (math.log2(df['NUM_UNIQUE_OPERATORS'][i] + df['NUM_UNIQUE_OPERANDS'][i]))-1 and df['HALSTEAD_VOLUME'][i] <= (df['NUM_OPERATORS'][i] + df['NUM_OPERANDS'][i]) * (math.log2(df['NUM_UNIQUE_OPERATORS'][i] + df['NUM_UNIQUE_OPERANDS'][i]))+1:
            listy.append('Equals')
        else:
            listy.append('Not equals')
    else:
        if df['HALSTEAD_VOLUME'][i] == 0:
            listy.append('Equals')
        else:
            listy.append('Not equals')

print(set(listy))

{'Equals'}


For Check 14, all cases passed the integrity check

In [40]:
# Check 14

listy = []
for i in range(5589):
    if df['NUM_UNIQUE_OPERATORS'][i] != 0 and df['NUM_OPERANDS'][i] != 0:
        if df['HALSTEAD_LEVEL'][i] >= ((2 / df['NUM_UNIQUE_OPERATORS'][i]) * (df['NUM_UNIQUE_OPERANDS'][i] / df['NUM_OPERANDS'][i]))-1 and df['HALSTEAD_LEVEL'][i] <= ((2 / df['NUM_UNIQUE_OPERATORS'][i]) * (df['NUM_UNIQUE_OPERANDS'][i] / df['NUM_OPERANDS'][i]))+1:
            listy.append('Equals')
        else:
            listy.append('Not equals')
    else:
        if df['HALSTEAD_LEVEL'][i] == 0:
            listy.append('Equals')
        else:
            listy.append('Not equals')

print(set(listy))

{'Equals'}


For Check 15, all cases passed the integrity check

In [41]:
# Check 15

listy = []
for i in range(5589):
    if df['NUM_UNIQUE_OPERANDS'][i] != 0:
        if df['HALSTEAD_DIFFICULTY'][i] >= ((df['NUM_UNIQUE_OPERATORS'][i] / 2) * (df['NUM_OPERANDS'][i] / df['NUM_UNIQUE_OPERANDS'][i]))-1 and df['HALSTEAD_DIFFICULTY'][i] <= ((df['NUM_UNIQUE_OPERATORS'][i] / 2) * (df['NUM_OPERANDS'][i] / df['NUM_UNIQUE_OPERANDS'][i]))+1:
            listy.append('Equals')
        else:
            listy.append('Not equals')
    else:
        if df['HALSTEAD_DIFFICULTY'][i] == 0:
            listy.append('Equals')
        else:
            listy.append('Not equals')

print(set(listy))

{'Equals'}


For Check 16, all cases passed the integrity check

In [42]:
# Check 16

listy = []
for i in range(5589):
    if df['HALSTEAD_DIFFICULTY'][i] != 0:
        if df['HALSTEAD_CONTENT'][i] >= (df['HALSTEAD_VOLUME'][i] / df['HALSTEAD_DIFFICULTY'][i])-1 and df['HALSTEAD_CONTENT'][i] <= (df['HALSTEAD_VOLUME'][i] / df['HALSTEAD_DIFFICULTY'][i])+1:
            listy.append('Equals')
        else:
            listy.append('Not equals')
    else:
        if df['HALSTEAD_CONTENT'][i] == 0:
            listy.append('Equals')
        else:
            listy.append('Not equals')

print(set(listy))

{'Equals'}


For Check 17, I got 2 unique values which are Equals and Not equals. So I decided to check for how many 'Not equals' values are in listy and I found out that there are 222 of them

In [43]:
# Check 17

listy = []
for i in range(5589):
    if df['HALSTEAD_EFFORT'][i] >= (df['HALSTEAD_VOLUME'][i] * df['HALSTEAD_DIFFICULTY'][i])-1 and df['HALSTEAD_EFFORT'][i] <= (df['HALSTEAD_VOLUME'][i] * df['HALSTEAD_DIFFICULTY'][i])+1:
        listy.append('Equals')
    else:
        listy.append('Not equals')
        
print(set(listy))
print(listy.count('Not equals'))

{'Equals', 'Not equals'}
222


For Check 18, all cases passed the integrity check

In [44]:
# Check 18

listy = []
for i in range(5589):
    if df['HALSTEAD_PROG_TIME'][i] >= (df['HALSTEAD_EFFORT'][i] / 18) -1 and df['HALSTEAD_PROG_TIME'][i] <= (df['HALSTEAD_EFFORT'][i] / 18) +1:
        listy.append('Equals')
    else:
        listy.append('Not equals')
        
print(set(listy))

{'Equals'}


The total number of cases with conflicting feature values is 129 + 222 which equals to 351

This doesn't match the number mentioned in the pdf doc which is 129 so the result of this test is NOT PASS

## Features with conflicting values (Checking)

For this test, I didn't write code but instead I got the names of the features with conflicting values from the previous section

I then incremented the variable features_problems by 2 because this was the amount of features with conflict values that I discovered in the previous section which matches the number mentioned in the pdf doc so the result for this test is PASS

In [45]:
# From the Conflict Cases Values section

# 'HALSTEAD_EFFORT'
# 'CALL_PAIRS'

features_problems += 2

## Total Cases and Features Problems (Checking)

To check for the total problem features, I just printed the variable features_problems which shows that the total problem features is 3 which is the same as the number mentioned in the pdf doc so the result of this test is PASS

In [46]:
print(features_problems)

3


To check for the cases that data quality problems and total problem cases I created the following variables

- cases_data_quality_problems: Count of cases impacted by one or more of I to K that we denote DS′. Since cases may contain more than one problem this need not be the sum of I to K.
- cases_problems: Count of cases impacted by one or more of G to K denoted DS′′

For the total problem cases, I created a dataframe called df2

I then created a column in it called Duplicated which holds True or False values related to duplicated rows in the original dataframe (df)

I then created a column called Implausible which holds True or False values related to rows with implausible values

I then created a column called Conflict_Check_10 which holds True or False values related to rows with conflict values in relation to check 10 because there were cases with conflicting values related to that particular check

I then created a column called Conflict_Check_17 which holds True or False values related to rows with conflict values in relation to check 17 because there were cases with conflicting values related to that particular check

I then created a for loop that goes through the rows of df2 and checks if it contains any True values, if yes then the variable cases_problems is incremented by 1

In the end I received a result of 4293 as seen below which doesn't match the result in the pdf doc which is 4297 so the result of this check is NOT PASS

In [47]:
cases_problems = 0

df2 = pd.DataFrame()

df2['Duplicated'] = df.duplicated()

df2['Implausible'] = (df['LOC_TOTAL'] == 0)

df2['Conflict_Check_12'] = (df['CALL_PAIRS'] > df['NUM_OPERATORS'])

df2['Conflict_Check_17'] = ((df['HALSTEAD_EFFORT'] < (df['HALSTEAD_VOLUME'] * df['HALSTEAD_DIFFICULTY'])-1) & (df['HALSTEAD_EFFORT'] > (df['HALSTEAD_VOLUME'] * df['HALSTEAD_DIFFICULTY'])+1))

#cases_problems = 0

for i in range(len(df2)):
    if (df2.iloc[i] == True).any():
        cases_problems += 1


print(cases_problems)

4293


For checking data quality problems I used the same method as before but without checking for duplicated cases

In the end, I got a result of 1163 which matches the result in the pdf doc

This means that the result of this test is PASS

In [48]:
cases_data_quality_problems = 0

df3 = pd.DataFrame()

df3['Implausible'] = df2['Implausible']

df3['Conflict_Check_12'] = df2['Conflict_Check_12']

df3['Conflict_Check_17'] = df2['Conflict_Check_17']

for i in range(len(df3)):
    if (df3.iloc[i] == True).any():
        cases_data_quality_problems += 1

print(cases_data_quality_problems)

1163


I then printed the values of the variables features_problems, cases_data_quality_problems and cases_problems above each other to summerize the findings of this section

In [49]:
print(f'features_problems: {features_problems}')
print(f'cases_data_quality_problems: {cases_data_quality_problems}')
print(f'cases_problems: {cases_problems}')

features_problems: 3
cases_data_quality_problems: 1163
cases_problems: 4293


## Step 1: Implausible Cases (Deletion)

I then started the process of cleaning the dataset

For step 1, I had to drop all implausible cases

The only implausible cases I found before where the cases in the column LOC_TOTAL which were equal to 0, so I dropped all the cases that matched this condition using the drop method, I then counted the amount of cases left in the column after the drop process and it was 4505

In [50]:
# https://thispointer.com/count-number-of-zeros-in-pandas-dataframe-column/

df.drop(df[df['LOC_TOTAL'] == 0].index, inplace = True)

In [51]:
df['LOC_TOTAL'].count()

4505

## Step 2: Conflict Cases (Referential integrity checks) (Deletion)

In [52]:
len(df)

4505

For step 2, I had to drop all cases with conflicting values, to do that I dropped the cases that didn't match check 12 and check 17 because these were the only checks that not all cases passed

After the drop process, there were 4204 rows left in the dataset

In [53]:
# https://sparkbyexamples.com/pandas/pandas-drop-rows-with-condition/#:~:text=Use%20pandas.,rows%20with%20condition(s).

# Cond 12

df.drop(df[df['CALL_PAIRS'] > df['NUM_OPERATORS']].index, inplace = True)
len(df)

4426

In [54]:
# Cond 17
cond = (df['HALSTEAD_EFFORT'] < (df['HALSTEAD_VOLUME'] * df['HALSTEAD_DIFFICULTY'])-1) | (df['HALSTEAD_EFFORT'] > (df['HALSTEAD_VOLUME'] * df['HALSTEAD_DIFFICULTY'])+1)
df.drop(df[cond].index, inplace = True)
len(df)

4204

## Step 3: Identical Cases (Deletion)

For step 3, I had to drop all duplicated cases

I did so using the function drop_duplicates

I then counted the amount of cases with duplicated values and there were no cases with duplicated values left

There were also 1129 cases left in the dataset in total

In [55]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
# https://stackoverflow.com/questions/46489695/drop-duplicates-not-working-in-pandas

df.drop_duplicates(subset=None, keep="first", inplace=True)

df.duplicated().value_counts()

False    1129
dtype: int64

## Step 4: Inconsistant Cases (Deletion)

For steps 4, 5, 6 and 7, there were no features or cases to drop because they all passed the checks related to these steps

In [56]:
# No Inconsistant Cases

## Step 5: Cases With Missing Values (Deletion)

In [57]:
# No Cases with Missing values

## Step 6: Constant Features (Deletion)

In [58]:
# No Constant Features

## Step 7: Identical Features (Deletion)

In [59]:
# No Identical Features

I then outputted the cleaned dataset as pc2.csv in the /data directory

In [60]:
df.to_csv(f"{ROOT}/data/pc2.csv")

In [61]:
df.head()

Unnamed: 0,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,DESIGN_COMPLEXITY,DESIGN_DENSITY,EDGE_COUNT,ESSENTIAL_COMPLEXITY,ESSENTIAL_DENSITY,LOC_EXECUTABLE,PARAMETER_COUNT,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,MAINTENANCE_SEVERITY,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,defects
2,1.0,4.0,7.0,24.0,0.0,1.0,0.13,0.0,0.0,1.0,1.0,6.0,1.0,0.0,1.0,0.0,17.88,7.43,986.77,0.04,34.0,0.13,54.82,132.83,1.0,0.0,0.0,7.0,0.03,13.0,21.0,7.0,8.0,34.0,96.88,8.0,False
3,1.0,1.0,11.0,3.0,0.0,1.0,0.08,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,42.62,7.81,2598.31,0.11,77.0,0.13,144.35,332.79,1.0,0.0,0.0,3.0,0.06,29.0,48.0,13.0,7.0,17.0,93.33,12.0,False
4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,3.0,33.44,0.63,13.06,0.01,9.0,1.6,0.73,20.9,1.0,0.0,0.0,3.0,0.33,5.0,4.0,4.0,1.0,3.0,0.0,1.0,False
6,1.0,0.0,1.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,10.87,3.5,133.19,0.01,11.0,0.29,7.4,38.05,1.0,0.0,0.0,2.0,0.33,4.0,7.0,4.0,7.0,3.0,50.0,2.0,False
7,3.0,1.0,1.0,0.0,6.0,2.0,1.0,2.0,3.0,1.0,0.5,5.0,1.0,0.0,1.0,1.0,11.07,4.38,211.89,0.02,14.0,0.23,11.77,48.43,0.5,2.0,3.0,5.0,0.5,5.0,9.0,4.0,7.0,4.0,50.0,2.0,False
