# Feature generator

Halstead Volume

In [1]:
import re
from math import log2

def halstead_volume(file_path):
    """Estimate the Halstead volume of a source file with C-style comments.

    Comments (``// ...`` and ``/* ... */``) are removed first. Every maximal
    run of operator characters is treated as one operator token and every
    word starting with a letter is treated as one operand token. The volume
    is ``N * log2(n)`` where ``N`` is the total number of tokens and ``n`` is
    the number of distinct tokens.

    Args:
        file_path: Path of the source file to analyse.

    Returns:
        The Halstead volume as a float; 0.0 when the file contains no
        operator or operand tokens.
    """
    with open(file_path, 'r') as file:
        code = file.read()

    # Remove comments from the code
    code = re.sub(r'//.*', '', code)  # Remove single-line comments
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # Remove multi-line comments

    # Tokenise once and derive both the totals and the vocabulary from the
    # token lists. Counting with str.count() would match substrings, so "a"
    # would also be counted inside "ab" and "=" inside "==".
    operator_tokens = re.findall(r'[\+\-\*/%=><&\|\^!~]+', code)
    operand_tokens = re.findall(r'[a-zA-Z]\w*', code)

    # Program vocabulary: number of distinct operators and operands
    program_vocab = len(set(operator_tokens)) + len(set(operand_tokens))

    # Program length: total number of operator and operand occurrences
    program_length = len(operator_tokens) + len(operand_tokens)

    # log2(0) is undefined; a file without tokens has no volume
    if program_vocab == 0:
        return 0.0

    return program_length * log2(program_vocab)

Number of lines, number of blank lines, number of characters

In [2]:
def count_source_lines_chars(file_path):
    """Count non-blank lines, interior blank lines and characters of a file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A tuple ``(source_lines, blank_lines, chars)`` where ``source_lines``
        is the number of non-blank lines, ``blank_lines`` is the number of
        blank lines located between the first and the last non-blank line
        (leading and trailing blank lines are ignored), and ``chars`` is the
        summed length of the non-blank lines, line terminators included.
    """
    with open(file_path, 'r') as f:
        all_lines = list(f)

    # Positions of every line that still has content after stripping
    filled_positions = [pos for pos, text in enumerate(all_lines) if text.strip() != '']
    if not filled_positions:
        return 0, 0, 0

    source_total = len(filled_positions)
    char_total = sum(len(all_lines[pos]) for pos in filled_positions)

    # Every line inside the first..last non-blank span that is not itself
    # non-blank must be an interior blank line.
    span_length = filled_positions[-1] - filled_positions[0] + 1
    return source_total, span_length - source_total, char_total

(16, 0, 504)

Number of comments and Ratio of comments to total number of lines

In [3]:
def calculate_comment(file_path):
    """Count comment lines in a file and their share of all lines.

    A line counts as a comment line when, after stripping whitespace, it
    starts with ``//`` or ``/*``, or when it contains ``/*`` without a
    closing ``*/`` (a block comment opened after some code). For a block
    comment, every following line up to and including the one containing
    ``*/`` is counted exactly once. A block comment that is never closed
    extends to the end of the file.

    Args:
        file_path: Path of the source file to analyse.

    Returns:
        A tuple ``(comment_lines, ratio)`` where ``ratio`` is
        ``comment_lines / total_lines``; ``(0, 0.0)`` for an empty file.
    """
    # Initialize counters for total lines and comment lines
    total_lines = 0
    comment_lines = 0

    with open(file_path, "r") as file:
        for line in file:
            line = line.strip()
            total_lines += 1

            # Single-line comment
            if line.startswith("//"):
                comment_lines += 1

            # Block comment opened at the start of the line, or opened later
            # in the line and not closed on it
            elif line.startswith("/*") or ("/*" in line and "*/" not in line):
                comment_lines += 1
                # Consume lines until the closing "*/"; each consumed line,
                # including the closing one, is counted once.
                while "*/" not in line:
                    following = next(file, None)
                    if following is None:
                        # Unterminated comment: stop at end of file instead
                        # of letting StopIteration escape.
                        break
                    line = following.strip()
                    total_lines += 1
                    comment_lines += 1

    # An empty file has no lines to form a ratio with
    if total_lines == 0:
        return 0, 0.0

    return comment_lines, comment_lines / total_lines

(2, 0.125)

Comment readability (GF: Gunning Fog index; FK: Flesch reading-ease score)

In [4]:
import re

# Function to estimate the number of syllables in a given word
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Each maximal run of the letters ``a e i o u y`` (case-insensitive) counts
    as one syllable. A word ending in ``e`` loses one syllable, and the
    result is never reported as zero.

    Args:
        word: The word to analyse.

    Returns:
        The estimated syllable count as an int (at least 1).
    """
    lowered = word.lower()
    vowel_flags = [letter in 'aeiouy' for letter in lowered]

    # A vowel group starts wherever a vowel is not preceded by another vowel
    groups = sum(
        1
        for pos, is_vowel in enumerate(vowel_flags)
        if is_vowel and (pos == 0 or not vowel_flags[pos - 1])
    )

    # A trailing 'e' is treated as silent
    if lowered.endswith('e'):
        groups -= 1

    # Words left with no counted syllable still get one
    return 1 if groups == 0 else groups


# Function to calculate the readability of a text
def readability(text):
    """Compute two readability scores for ``text``.

    Words are runs of word characters; sentences are runs of text closed by
    ``.``, ``!`` or ``?`` (a text without any such terminator is treated as
    one sentence). A word is "complex" when ``count_syllables`` reports
    three or more syllables.

    Args:
        text: The text to score; it must contain at least one word.

    Returns:
        A tuple ``(gunning_fog, fk)`` where ``gunning_fog`` is
        ``0.4 * (words/sentences + 100 * complex_words/words)`` and ``fk`` is
        ``206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)``.

    Raises:
        ZeroDivisionError: If ``text`` contains no words.
    """
    word_list = re.findall(r'\b\w+\b', text)

    # Fall back to one sentence so the per-sentence average stays defined
    sentence_total = len(re.findall(r'[^.!?]+[.!?]', text))
    if sentence_total == 0:
        sentence_total = 1

    # Syllable estimate for every word, reused for both scores
    syllable_counts = [count_syllables(token) for token in word_list]
    syllable_total = sum(syllable_counts)
    complex_total = sum(1 for amount in syllable_counts if amount >= 3)

    word_total = len(word_list)

    gunning_fog = 0.4 * (word_total / sentence_total + 100 * (complex_total / word_total))
    fk_score = 206.835 - 1.015*(word_total/sentence_total) - 84.6*(syllable_total/word_total)

    return gunning_fog, fk_score



# Function to calculate the average readability of a file containing multiple comments
def average_readability(file_path):
    """Average the readability scores of all comments in a file.

    Both ``/* ... */`` and ``// ...`` comments are extracted; comments
    without a single word are ignored, and every remaining comment is scored
    with ``readability``.

    Args:
        file_path: Path of the source file to analyse.

    Returns:
        A tuple with the mean of the first and of the second score over all
        scored comments, or the default ``(20, 0)`` when no comment
        containing a word is found.
    """
    with open(file_path, 'r') as f:
        text = f.read()

    # Block comments first, line comments second (same alternation order)
    found = re.findall(r'(?:/\*(?:[^*]|(?:\*+[^*/]))*\*+/)|(?://.*)', text)

    # Score only the comments that contain at least one word
    scored = [
        readability(piece)
        for piece in found
        if len(re.findall(r'\b\w+\b', piece)) != 0
    ]

    # Covers both "no comments at all" and "no comment with words"
    if not scored:
        return 20, 0

    gf_sum = 0
    fk_sum = 0
    for gf_value, fk_value in scored:
        gf_sum += gf_value
        fk_sum += fk_value

    return gf_sum/len(scored), fk_sum/len(scored)


(11.614285714285714, 21.93660714285717)

Number of identifiers, number of identifiers containing English words, number of unique identifiers, number of unique identifiers containing English words, number of meaningful identifiers (length of 6 to 9 characters), average length of identifiers

In [5]:
import re
import tokenize
from io import BytesIO
import keyword
import spacy
nlp = spacy.load("en_core_web_sm")
import spellchecker

# List of Java keywords
# Words that are never reported as identifiers: the Java reserved words plus
# the names "Object", "null", "true" and "false".
java_keywords = [
    "abstract", "continue", "for", "new", "switch",
    "assert", "default", "goto", "package", "synchronized",
    "boolean", "do", "if", "private", "this",
    "break", "double", "implements", "protected", "throw",
    "byte", "else", "import", "public", "throws",
    "case", "enum", "instanceof", "return", "transient",
    "catch", "extends", "int", "short", "try",
    "char", "final", "interface", "static", "void",
    "class", "finally", "long", "strictfp", "volatile",
    "const", "float", "native", "super", "while",
    "Object", "null", "true", "false",
]



# This function checks if a given word is in english dictionary
# Shared spell checker, created on first use so it is not rebuilt on every
# lookup.
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared SpellChecker instance, building it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = spellchecker.SpellChecker()
    return _SPELL_CHECKER


def contains_in_vocab(word):
    """Tell whether an identifier contains at least one English word.

    The identifier is split into parts at capital letters (an optional
    capital followed by lowercase letters). A part counts as an English word
    when the spell checker's correction of it is the part itself.

    Args:
        word: The identifier to check.

    Returns:
        True if any part is accepted by the spell checker, False otherwise.
    """
    # Split the identifier into camelCase parts; quoted text matched by the
    # same pattern is skipped below.
    parts = re.findall(r"[A-Z]?[a-z]+|'[^']+'|\"[^\"]+\"", word)
    spell = _get_spell_checker()

    for part in parts:
        single_quoted = part.startswith("'") and part.endswith("'")
        double_quoted = part.startswith('"') and part.endswith('"')
        if single_quoted or double_quoted:
            continue

        # Each remaining part is purely alphabetic, so it is checked
        # directly. Running it through the Python tokenizer would add
        # ENCODING/NEWLINE/ENDMARKER tokens that are not identifier parts.
        if spell.correction(part) == part:
            return True
    return False



# This function tokenizes a given line of code
def tokenizeLine(line):
    """Tokenize the alphabetic words of one line of code.

    Runs of letters are extracted from ``line``; text enclosed in single or
    double quotes is matched as a whole and skipped. Every remaining word is
    passed through Python's ``tokenize`` module on its own.

    Args:
        line: One line of source code.

    Returns:
        A list with all tokens produced by ``tokenize.tokenize`` for each
        word, in order of appearance.
    """
    collected = []
    for found in re.finditer(r"[A-Za-z]+|'[^']+'|\"[^\"]+\"", line):
        fragment = found.group(0)

        # Quoted literals are not identifiers
        in_single_quotes = fragment.startswith("'") and fragment.endswith("'")
        in_double_quotes = fragment.startswith('"') and fragment.endswith('"')
        if in_single_quotes or in_double_quotes:
            continue

        reader = BytesIO(fragment.encode("utf-8")).readline
        collected += list(tokenize.tokenize(reader))
    return collected



# This function counts the number of identifiers, English identifiers, unique identifiers, unique English identifiers, 
# and identifiers with length between 6 and 9 characters in a given file.
def identifiers(file_path):
    """Collect identifier statistics for a source file.

    Every NAME token returned by ``tokenizeLine`` that is not listed in
    ``java_keywords`` is treated as an identifier.

    Args:
        file_path: Path of the source file to analyse.

    Returns:
        A 6-tuple:
        ``(number of identifiers,
           number of identifiers for which contains_in_vocab is true,
           number of distinct identifiers,
           number of distinct identifiers for which contains_in_vocab is true,
           number of identifiers whose length is between 6 and 9,
           average identifier length)``.
        The average is 0.0 when the file contains no identifiers.
    """
    # Read the file
    with open(file_path, "r") as file:
        code = file.read()

    all_identifiers = []
    english_identifiers = []
    numId6_9 = 0
    length_total = 0

    # Remember the vocabulary verdict per identifier so a name that occurs
    # many times is only spell-checked once.
    english_verdicts = {}

    for line in code.split("\n"):
        for token_type, token_string, _, _, _ in tokenizeLine(line):
            # Only names that are not Java keywords are identifiers
            if token_type != tokenize.NAME or token_string in java_keywords:
                continue

            all_identifiers.append(token_string)
            length_total += len(token_string)

            if token_string not in english_verdicts:
                english_verdicts[token_string] = contains_in_vocab(token_string)
            if english_verdicts[token_string]:
                english_identifiers.append(token_string)

            # Identifiers with a length between 6 and 9 characters
            if 6 <= len(token_string) <= 9:
                numId6_9 += 1

    identifier_total = len(all_identifiers)
    # Avoid dividing by zero for files without any identifier
    average_length = length_total / identifier_total if identifier_total else 0.0

    return (
        identifier_total,
        len(english_identifiers),
        len(set(all_identifiers)),
        len(set(english_identifiers)),
        numId6_9,
        average_length,
    )



(29, 25, 19, 16, 10, 5.862068965517241)

Number of blocks of code at the same indentation level

In [6]:
# This function takes in a filepath and returns the number of indentation blocks in the file
def count_blocks(filepath):
    """Count runs of consecutive lines sharing the same indentation.

    Lines that are empty after removing leading whitespace are ignored; they
    neither start nor end a run. Indentation is the number of leading
    whitespace characters of a line.

    Args:
        filepath: Path of the file to analyse.

    Returns:
        The number of indentation runs (0 for a file without content lines).
    """
    with open(filepath, 'r') as file:
        content = file.readlines()

    # Indentation width of every line that has content
    indent_widths = [
        len(raw) - len(raw.lstrip())
        for raw in content
        if raw.lstrip()
    ]

    block_total = 0
    previous_width = None
    for width in indent_widths:
        # The first content line and every change of width open a new run
        if previous_width is None or width != previous_width:
            block_total += 1
        previous_width = width

    return block_total




5

Max indentation length

In [7]:
# Given a file path to a Java code file, this function reads the file and 
# returns the maximum indentation length found in the file.
def max_indentation_length(file_path):
    """Return the largest indentation found in a file.

    Indentation is the number of leading whitespace characters of a line.
    Lines that are empty after removing leading whitespace and lines whose
    content starts with ``//`` are not considered.

    Args:
        file_path: Path of the file to analyse.

    Returns:
        The maximum indentation as an int, 0 if no line qualifies.
    """
    with open(file_path, 'r') as f:
        content = f.readlines()

    widths = []
    for raw in content:
        body = raw.lstrip()
        # Skip blank lines and single-line comments
        if not body or body.startswith('//'):
            continue
        widths.append(len(raw) - len(body))

    return max(widths, default=0)



20

Number of comment blocks (blocks are defined as a continuous sequence of lines that are made entirely of comments)

In [89]:
# Given a file path to a Java code file, this function reads the file and returns 
# the number of continuous comment code blocks found in the file.
def count_comment_blocks(file_path):
    """Count comment blocks in a source file with C-style comments.

    A comment block is a continuous sequence of comment lines. A line is a
    comment line when, after stripping whitespace, it starts with ``//`` or
    ``/*``, or when it lies inside a ``/* ... */`` comment that was opened
    at the start of an earlier line. Adjacent ``//`` and ``/* ... */``
    comments therefore form a single block, and a ``//`` line inside a
    ``/* ... */`` comment does not start a new one. Blank lines and code
    lines end the current block. A block comment opened after code on the
    same line is not tracked.

    Args:
        file_path: Path of the source file to analyse.

    Returns:
        The number of comment blocks as an int.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    comment_block_count = 0
    in_block_comment = False        # inside a /* ... */ that is still open
    previous_was_comment = False    # was the preceding line a comment line

    for line in lines:
        stripped_line = line.strip()

        if in_block_comment:
            # Everything up to and including the closing line is comment
            is_comment_line = True
            if '*/' in stripped_line:
                in_block_comment = False
        elif stripped_line.startswith('/*'):
            is_comment_line = True
            # Look for the terminator after the opener so that "/*/" is not
            # mistaken for an already closed comment.
            in_block_comment = '*/' not in stripped_line[2:]
        else:
            is_comment_line = stripped_line.startswith('//')

        # A block starts where a comment line follows a non-comment line
        if is_comment_line and not previous_was_comment:
            comment_block_count += 1
        previous_was_comment = is_comment_line

    return comment_block_count


1

# CSV Output

Generate the matrix of features and corresponding values for the 200 code snippets

In [114]:
import os
# Locate the snippet directory relative to the current working directory
current_directory = os.getcwd()
subdirectory = "Dataset/Snippets/"
subdirectory_path = os.path.join(current_directory, subdirectory)

# Directory that holds the code snippets
directory_path = subdirectory_path

# Paths of all snippet files, collected directory by directory. Within a
# directory the files are ordered by the integer value of their name without
# extension, so every file name must have a numeric stem.
file_path_arr = []
for root, dirs, files in os.walk(directory_path):
    ordered_names = sorted(files, key=lambda x: int(os.path.splitext(x)[0]))
    file_path_arr.extend(os.path.join(root, name) for name in ordered_names)

# Initializing empty arrays to store the feature set and the feature set for each code snippet respectively
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# feature_matrix holds one row per snippet; feature_arr_row is the row
# currently being built.
feature_matrix = []
feature_arr_row = []

# Column names, in the same order as the values of a row. "numComment" comes
# before "ratioComment" because calculate_comment returns (count, ratio) and
# the row stores them in that order.
# NOTE(review): "ratioMeaningufulId" looks like a misspelling of
# "ratioMeaningfulId"; left unchanged in case other code uses this name.
feature_set_columns = ["numLines", "numBlankLines", "numChars", "ratioBlank", "avgCharsLine", "halsteadVol", 
                       "numComment", "ratioComment", "commentReadabilityGF", "commentReadabilityFK", "numIdentifier", 
                       "numEngIdentifier", "numNewIdentifier", "numNewEngIdentifier", "avgNumId", "ratioEngIdOverId", 
                       "avgNewId", "ratioNewEngIdOverNewId", "numMeaningfulId", "ratioMeaningufulId", "avgIdLen", 
                       "numIndentBlocks", "ratioIndentNumLines", "maxIndent", "numCommentBlk", "ratioCommentBlock"]

# Iterating through each code snippet
for i in file_path_arr:
    # numLines, numBlankLines, numChars
    numLines, numBlankLines, numChars = count_source_lines_chars(i)

    # halsteadVol
    halsteadVol = halstead_volume(i)

    # numComment, ratioComment
    numComment, ratioComment = calculate_comment(i)

    # commentReadabilityGF, commentReadabilityFK
    readabilityGF, readabilityFK = average_readability(i)

    # numIdentifier, numEngIdentifier, numNewIdentifier, numNewEngIdentifier,
    # numMeaningfulId, avgIdLen
    numId, numEngId, numNewId, numNewEngId, numMeaningfulId, avgIdLen = identifiers(i)

    # numIndentBlocks, maxIndent, numCommentBlk
    numIndentBlocks = count_blocks(i)
    maxIndent = max_indentation_length(i)
    numCommentBlk = count_comment_blocks(i)

    # Ratios use _ratio so that a snippet without source lines or without
    # identifiers yields 0.0 instead of aborting the whole run.
    feature_arr_row = [
        numLines,
        numBlankLines,
        numChars,
        _ratio(numBlankLines, numLines),      # ratioBlank
        _ratio(numChars, numLines),           # avgCharsLine
        halsteadVol,
        numComment,
        ratioComment,
        readabilityGF,
        readabilityFK,
        numId,
        numEngId,
        numNewId,
        numNewEngId,
        _ratio(numId, numLines),              # avgNumId
        _ratio(numEngId, numId),              # ratioEngIdOverId
        _ratio(numNewId, numLines),           # avgNewId
        _ratio(numNewEngId, numNewId),        # ratioNewEngIdOverNewId
        numMeaningfulId,
        _ratio(numMeaningfulId, numId),       # ratioMeaningfulId
        avgIdLen,
        numIndentBlocks,
        _ratio(numIndentBlocks, numLines),    # ratioIndentNumLines
        maxIndent,
        numCommentBlk,
        _ratio(numCommentBlk, numLines),      # ratioCommentBlock
    ]

    # Appending the feature set for the code snippet to the feature matrix
    feature_matrix.append(feature_arr_row)
    feature_arr_row = []

Transfer matrix to csv

In [1]:
# generate csv from matrix features
import csv

# `feature_matrix` must already exist in the kernel (it is built by the
# feature-generation cell); running this cell first raises NameError.
# Only the rows are written, no header line.
csv_target = "feature_matrix_1.csv"
with open(csv_target, "w", newline="") as csvfile:
    csv.writer(csvfile).writerows(feature_matrix)

NameError: name 'feature_matrix' is not defined