#### Average line length

In [1]:
import statistics

def analyze_code(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    line_lengths = [len(line) for line in lines]
    avg_line_length = sum(line_lengths) / len(line_lengths)
    line_length_variability = statistics.stdev(line_lengths)
    print("Average line length:", avg_line_length)
    print("Line length variability:", line_length_variability)

analyze_code("/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/1.jsnp")


Average line length: 20.4
Line length variability: 17.66477977344876


#### Percentage of comment

In [49]:
def comment_percentage(file_path,language):
    with open(file_path) as f:
        code_snippet = f.read()
    comment_count = 0
    total_count = 0
    in_comment = False
    if language in ['python', 'c', 'c++', 'java']:
        if language == 'python':
            comment_symbol = '#'
        elif language == 'c':
            comment_symbol = '//'
        elif language == 'c++':
            comment_symbol = '//'
        elif language == 'java':
            comment_symbol = '//'
        for line in code_snippet.split('\n'):
            total_count += 1
            if in_comment:
                if '*/' in line:
                    in_comment = False
                else:
                    comment_count += 1
                    continue
            if comment_symbol in line:
                comment_count += 1
            elif '/*' in line:
                in_comment = True
                comment_count += 1
        return (comment_count / total_count) * 100
    else:
        return "Invalid language"

comment_percentage("/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/7.jsnp", 'java')

41.66666666666667

#### Average number of identifiers per line

In [58]:
import tokenize
from io import BytesIO
import keyword

def avg_identifiers_per_line(file_path, lang):
    with open(file_path, "r") as file:
        code = file.read()
    lines = code.split("\n")
    identifiers = []
    for line in lines:
        if lang == "python":
            # Tokenize the line
            tokens = tokenize.tokenize(BytesIO(line.encode("utf-8")).readline)
            # Count the number of identifiers in the line
            count = sum(1 for token_type, token_string, _, _, _ in tokens if token_type == tokenize.NAME and token_string not in keyword.kwlist)
            identifiers.append(count)
        elif lang == "java":
            # Use regular expression to match any sequence of letters and numbers,
            # preceded by a letter, underscore, or dollar sign
            matches = re.findall(r"[a-zA-Z_$][a-zA-Z0-9_$]*", line)
            print(matches)
            java_keywords = ["abstract", "continue", "for", "new", "switch", "assert", "default", "goto", "package", "synchronized", "boolean", "do", "if", "private", "this", "break", "double", "implements", "protected", "throw", "byte", "else", "import", "public", "throws", "case", "enum", "instanceof", "return", "transient", "catch", "extends", "int", "short", "try", "char", "final", "interface", "static", "void", "class", "finally", "long", "strictfp", "volatile", "const", "float", "native", "super", "while", "Object", "null", "true", "false"]
            count = len([x for x in matches if x not in java_keywords])
            identifiers.append(count)
        elif lang in ["c", "c++"]:
            c_keywords = ["auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int", "long", "register", "restrict", "return", "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while", "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", "_Generic", "_Imaginary", "_Noreturn", "Static_assert", "Thread_local"]
            matches = re.findall(r"[a-zA-Z][a-zA-Z0-9]*", line)
            count = len([x for x in matches if x not in c_keywords])
            identifiers.append(count)   
    return sum(identifiers)


avg_identifiers_per_line("/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/1.jsnp", "java")

[]
['Object', 'ret', 'body', 'eval', 'callstack', 'interpreter']
[]
['boolean', 'breakout', 'false']
['if', 'ret', 'instanceof', 'ReturnControl']
[]
['switch', 'ReturnControl', 'ret', 'kind']
[]
['case', 'RETURN']
['return', 'ret']
[]


14