#### Average line length and variability

In [None]:
import statistics

def analyze_code(file_path):
    """Return the mean and sample standard deviation of a file's line lengths.

    The line terminator is not counted as part of a line, so a final line
    without a trailing newline is measured the same way as every other line.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        A tuple ``(avg_line_length, line_length_variability)``. Both values
        are ``0.0`` for an empty file, and the variability is ``0.0`` for a
        single-line file because a sample standard deviation needs at least
        two data points.
    """
    with open(file_path, 'r') as f:
        # Strip only the newline: trailing spaces are still real characters.
        line_lengths = [len(line.rstrip('\n')) for line in f]

    if not line_lengths:
        return 0.0, 0.0

    avg_line_length = sum(line_lengths) / len(line_lengths)
    if len(line_lengths) < 2:
        # statistics.stdev raises StatisticsError for fewer than two values.
        return avg_line_length, 0.0
    return avg_line_length, statistics.stdev(line_lengths)


#### Percentage of comments

In [175]:
def _scan_c_style_line(line, in_block):
    """Scan one line of C-like source for ``//`` and ``/* ... */`` comments.

    Args:
        line: The source line to inspect.
        in_block: True when the line starts inside an unterminated block
            comment.

    Returns:
        A tuple ``(has_comment, in_block)``: whether the line contains any
        comment text, and whether a block comment is still open at its end.
    """
    has_comment = in_block
    pos = 0
    while True:
        if in_block:
            end = line.find('*/', pos)
            if end == -1:
                return True, True
            in_block = False
            pos = end + 2
            continue
        block_start = line.find('/*', pos)
        line_start = line.find('//', pos)
        # Whichever marker appears first decides how the rest is read.
        if line_start != -1 and (block_start == -1 or line_start < block_start):
            return True, False
        if block_start == -1:
            return has_comment, False
        has_comment = True
        in_block = True
        pos = block_start + 2


def comment_percentage(file_path, language):
    """Return the percentage of lines in a file that contain a comment.

    This is a textual heuristic: comment markers inside string literals are
    not recognised as such and are counted as comments.

    Args:
        file_path: Path of the source file to analyse.
        language: One of ``'python'``, ``'c'``, ``'c++'`` or ``'java'``.
            Python uses ``#`` comments; the others use ``//`` and
            ``/* ... */``.

    Returns:
        The percentage (0-100) of lines containing comment text, ``0.0`` for
        an empty file, or the string ``"Invalid language"`` when ``language``
        is not supported.
    """
    if language not in ('python', 'c', 'c++', 'java'):
        return "Invalid language"

    with open(file_path) as f:
        # readlines() does not produce a phantom empty line after the final
        # newline, so the total reflects the real number of lines.
        lines = f.readlines()
    if not lines:
        return 0.0

    if language == 'python':
        # Block-comment markers have no meaning in Python source.
        comment_count = sum(1 for line in lines if '#' in line)
    else:
        comment_count = 0
        in_block = False
        for line in lines:
            has_comment, in_block = _scan_c_style_line(line, in_block)
            if has_comment:
                comment_count += 1

    return (comment_count / len(lines)) * 100

#### Indentation variability

In [181]:
import statistics

def indentation_variability_file_std_dev(file_path):
    """Return the sample standard deviation of indentation widths in a file.

    The indentation of a line is the number of leading whitespace characters
    (a tab counts as one character). Blank and whitespace-only lines are
    ignored.

    Args:
        file_path: Path of the source file to analyse.

    Returns:
        The sample standard deviation of the indentation widths, or ``0.0``
        when the file has fewer than two non-blank lines (a sample standard
        deviation is undefined for fewer than two values).
    """
    with open(file_path, 'r') as f:
        code = f.read()
    indents = [
        len(line) - len(line.lstrip())
        for line in code.split("\n")
        if line.strip()
    ]
    if len(indents) < 2:
        # statistics.stdev raises StatisticsError for fewer than two values.
        return 0.0
    return statistics.stdev(indents)



#### Average number of identifiers per line and percentage of these identifiers present in the English dictionary

In [186]:
import re
import tokenize
from io import BytesIO
import keyword
import spacy
nlp = spacy.load("en_core_web_sm")
import spellchecker

# Words excluded from the identifier count for Java sources: the reserved
# words plus the literals null/true/false and the ubiquitous class name Object.
java_keywords = [
    "abstract", "continue", "for", "new", "switch", "assert", "default",
    "goto", "package", "synchronized", "boolean", "do", "if", "private",
    "this", "break", "double", "implements", "protected", "throw", "byte",
    "else", "import", "public", "throws", "case", "enum", "instanceof",
    "return", "transient", "catch", "extends", "int", "short", "try", "char",
    "final", "interface", "static", "void", "class", "finally", "long",
    "strictfp", "volatile", "const", "float", "native", "super", "while",
    "Object", "null", "true", "false",
]

# Words excluded from the identifier count for C and C++ sources (C11 keyword
# set, including the underscore-prefixed keywords).
c_keywords = [
    "auto", "break", "case", "char", "const", "continue", "default", "do",
    "double", "else", "enum", "extern", "float", "for", "goto", "if",
    "inline", "int", "long", "register", "restrict", "return", "short",
    "signed", "sizeof", "static", "struct", "switch", "typedef", "union",
    "unsigned", "void", "volatile", "while", "_Alignas", "_Alignof",
    "_Atomic", "_Bool", "_Complex", "_Generic", "_Imaginary", "_Noreturn",
    "_Static_assert", "_Thread_local",
]

def contains_in_vocab(word):
    """Return True if any part of an identifier is a known dictionary word.

    The identifier is split into fragments made of an optional capital letter
    followed by lowercase letters (so ``fooBar`` gives ``foo`` and ``Bar``),
    and each fragment is looked up in the spell checker's word list.

    Args:
        word: The identifier to examine.

    Returns:
        True when at least one fragment is in the dictionary, else False.
    """
    # Loading the dictionary is expensive, so build the checker once and keep
    # it on the function object for later calls.
    checker = getattr(contains_in_vocab, "_checker", None)
    if checker is None:
        checker = spellchecker.SpellChecker()
        contains_in_vocab._checker = checker

    fragments = re.findall(r"[A-Z]?[a-z]+|'[^']+'|\"[^\"]+\"", word)
    # The pattern also matches quoted text; only bare alphabetic fragments
    # are candidate words.
    candidates = [part.lower() for part in fragments if part[0] not in "'\""]

    # known() is a direct dictionary lookup. Only the fragments themselves are
    # tested, not the tokenizer's bookkeeping tokens, and no edit-distance
    # search is run for unknown words.
    return bool(checker.known(candidates))

def tokenizeLine(line):
    """Tokenize the alphabetic fragments of a source line.

    Runs of ASCII letters are extracted from ``line``; single- or
    double-quoted spans are matched as a whole and discarded. Each remaining
    fragment is fed to ``tokenize.tokenize`` on its own.

    Args:
        line: One line of source text.

    Returns:
        A list with every token that ``tokenize.tokenize`` yields for each
        fragment, in order. This includes the non-NAME tokens the tokenizer
        emits around each fragment, so callers filter on the token type.
    """
    collected = []
    for fragment in re.findall(r"[A-Za-z]+|'[^']+'|\"[^\"]+\"", line):
        # A fragment that opens and closes with the same quote character is
        # a quoted span and carries no identifiers.
        is_quoted = fragment[0] in "'\"" and fragment[-1] == fragment[0]
        if is_quoted:
            continue
        reader = BytesIO(fragment.encode("utf-8"))
        collected += tokenize.tokenize(reader.readline)
    return collected

def avg_identifiers_per_line(file_path, lang):
    """Compute identifier density and the share of dictionary-word identifiers.

    An identifier is any NAME token returned by ``tokenizeLine`` that is not
    a keyword of the given language.

    Args:
        file_path: Path of the source file to analyse.
        lang: ``"python"``, ``"java"``, ``"c"`` or ``"c++"``.

    Returns:
        A tuple ``(avg_identifiers, pct_in_dictionary)``: the number of
        identifiers divided by the number of lines, and the percentage of
        identifiers for which ``contains_in_vocab`` is true (``0.0`` when the
        file has no identifiers).

    Raises:
        ValueError: If ``lang`` is not a supported language.
    """
    if lang == "python":
        reserved = frozenset(keyword.kwlist)
    elif lang == "java":
        reserved = frozenset(java_keywords)
    elif lang in ("c", "c++"):
        reserved = frozenset(c_keywords)
    else:
        raise ValueError("Unsupported language: {!r}".format(lang))

    with open(file_path, "r") as file:
        lines = file.read().split("\n")

    # The same identifier usually occurs many times in a file, so remember
    # each dictionary verdict and skip the repeated lookups.
    vocab_cache = {}
    identifier_count = 0
    english_count = 0
    for line in lines:
        for token_type, token_string, _, _, _ in tokenizeLine(line):
            if token_type != tokenize.NAME or token_string in reserved:
                continue
            identifier_count += 1
            if token_string not in vocab_cache:
                vocab_cache[token_string] = contains_in_vocab(token_string)
            if vocab_cache[token_string]:
                english_count += 1

    avg_identifiers = identifier_count / len(lines)
    # Guard against files with no identifiers at all.
    if identifier_count == 0:
        return avg_identifiers, 0.0
    return avg_identifiers, english_count / identifier_count * 100

#### Average readability of comments (Gunning Fog)

In [194]:
import re

def _estimate_syllables(word):
    """Estimate the syllable count of a word by counting its vowel groups.

    Non-letter characters are ignored, and a trailing silent ``e`` (but not
    a final ``le``) is discounted. This is a heuristic, not a dictionary
    lookup.
    """
    letters = re.sub(r'[^a-z]', '', word.lower())
    count = len(re.findall(r'[aeiouy]+', letters))
    if count > 1 and letters.endswith('e') and not letters.endswith('le'):
        count -= 1
    return count


def gunning_fog(text):
    """Return the Gunning fog index of a piece of text.

    The index is ``0.4 * (words / sentences + 100 * complex_words / words)``,
    where a complex word is one with three or more syllables. Syllables are
    estimated heuristically, so the result is an approximation.

    Args:
        text: The text to score.

    Returns:
        The Gunning fog score, or ``-1`` when the text contains no sentence
        (no run of characters ending in ``.``, ``!`` or ``?``).
    """
    words = text.split()
    sentences = len(re.findall(r'[^.!?]+[.!?]', text))
    if sentences == 0:
        return -1
    # Complexity is judged by syllable count; attached punctuation such as a
    # trailing full stop must not make a short word count as complex.
    complex_words = sum(1 for word in words if _estimate_syllables(word) >= 3)
    return 0.4 * (len(words) / sentences + 100 * (complex_words / len(words)))

def average_readability(file_path):
    """Return the mean Gunning fog score of the C-style comments in a file.

    Only ``/* ... */`` and ``//`` comments are recognised.

    Args:
        file_path: Path of the source file to analyse.

    Returns:
        The average score over all comments that could be scored, or ``-1``
        when the file has no comments or none of them contains a sentence.
    """
    with open(file_path, 'r') as f:
        text = f.read()

    comments = re.findall(r'(?:/\*(?:[^*]|(?:\*+[^*/]))*\*+/)|(?://.*)', text)
    # gunning_fog returns -1 for text without a sentence terminator; that
    # sentinel is not a score and must not be averaged in.
    scores = [
        score
        for score in (gunning_fog(comment) for comment in comments)
        if score != -1
    ]
    if not scores:
        return -1
    return sum(scores) / len(scores)


#### Calculating the average readability score from Buse's readability survey results

In [195]:
import pandas as pd

# Name of the CSV file holding the survey ratings (one column per snippet)
filename = "/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/oracle.csv"

# Read the CSV file into a DataFrame; the first row is the header and the
# first column is the row index
df = pd.read_csv(filename, header=0, index_col=0)

# Get the current column names
column_names = df.columns

# Map the original column names to 1-based snippet numbers, matching the
# 1..100 labels shown in the output recorded for this cell
new_column_names = dict(zip(column_names, range(1, len(column_names) + 1)))

# Rename the columns
df = df.rename(columns=new_column_names)


# Average rating of each snippet (column) over all rows
average = df.mean(numeric_only=True)

# Print the averages
print(average)


1      2.991667
2      3.741667
3      2.691667
4      4.025000
5      4.183333
         ...   
96     3.808333
97     2.900000
98     3.000000
99     2.950000
100    2.125000
Length: 100, dtype: float64


#### Writing results to a csv

In [200]:
import glob


def csvFeatureGenerator(directory, language):
    """Compute the readability features of every file matching a glob pattern.

    Args:
        directory: Glob pattern passed to ``glob.glob`` to select the files
            (for example ``'/path/to/snippets/*.jsnp'``).
        language: Language name forwarded to ``comment_percentage`` and
            ``avg_identifiers_per_line``.

    Returns:
        A list of rows. The first row holds the feature names; each following
        row holds the seven feature values of one file, with files taken in
        sorted path order.
    """
    data = [["Avg line length", "Line length variability", "Percentage of comments", "Indentation Variability", "Avg num of identifiers", "Percentage of identifers in dictionary", "Avg readability of comments"]]

    # Sorted order keeps the row order reproducible between runs.
    for filename in sorted(glob.glob(directory)):
        avg_line_length, line_length_var = analyze_code(filename)
        comment_pct = comment_percentage(filename, language)
        indent_var = indentation_variability_file_std_dev(filename)
        avg_identifiers, dictionary_pct = avg_identifiers_per_line(filename, language)
        comment_readability = average_readability(filename)

        # The value order must match the header row above.
        data.append([
            avg_line_length,
            line_length_var,
            comment_pct,
            indent_var,
            avg_identifiers,
            dictionary_pct,
            comment_readability,
        ])

    return data


# Glob pattern selecting the *.jsnp snippet files to analyse.
directory = '/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/*.jsnp'
# Header row followed by one feature row per matched file.
resultList = csvFeatureGenerator(directory=directory, language="java")



In [201]:
# The first entry of resultList is the row of feature names; use it as the
# column labels so it is not written as a data row beneath an auto-generated
# numeric header.
dataFrame = pd.DataFrame(resultList[1:], columns=resultList[0])
dataFrame.to_csv('annotatedDataSet.csv', index=False)
