#### Average line length

In [160]:
import statistics

def analyze_code(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    line_lengths = [len(line) for line in lines]
    avg_line_length = sum(line_lengths) / len(line_lengths)
    line_length_variability = statistics.stdev(line_lengths)
    print("Average line length:", avg_line_length)
    print("Line length variability:", line_length_variability)

analyze_code("/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/1.jsnp")


Average line length: 19.5
Line length variability: 16.58743069745454


#### Percentage of comment

In [161]:
def comment_percentage(file_path,language):
    with open(file_path) as f:
        code_snippet = f.read()
    comment_count = 0
    total_count = 0
    in_comment = False
    if language in ['python', 'c', 'c++', 'java']:
        if language == 'python':
            comment_symbol = '#'
        elif language == 'c':
            comment_symbol = '//'
        elif language == 'c++':
            comment_symbol = '//'
        elif language == 'java':
            comment_symbol = '//'
        for line in code_snippet.split('\n'):
            total_count += 1
            if in_comment:
                if '*/' in line:
                    in_comment = False
                else:
                    comment_count += 1
                    continue
            if comment_symbol in line:
                comment_count += 1
            elif '/*' in line:
                in_comment = True
                comment_count += 1
        return (comment_count / total_count) * 100
    else:
        return "Invalid language"

comment_percentage("/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/7.jsnp", 'java')

41.66666666666667

#### Average readability of comments (Gunning Fog)

In [163]:
import re

def gunning_fog(text):
    words = text.split()
    sentences = len(re.findall(r'[^.!?]+[.!?]', text))
    complex_words = len([word for word in words if len(word) >= 3 and re.search(r'[^A-Za-z0-9]', word)])
    score = 0.4 * (len(words) / sentences + 100 * (complex_words / len(words)))
    return score

def average_readability(file_path):
    with open(file_path, 'r') as f:
        text = f.read()
        comments = re.findall(r'(?:/\*(?:[^*]|(?:\*+[^*/]))*\*+/)|(?://.*)', text)
        if not comments:
            return -1
        scores = [gunning_fog(comment) for comment in comments]
        return sum(scores) / len(scores)


print(average_readability('/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/7.jsnp'))


15.03478260869565


#### Percentage of identifier terms in dictionary and percentage of these identifiers present in the english dictionary

In [69]:
import re
import tokenize
from io import BytesIO
import keyword
import spacy
nlp = spacy.load("en_core_web_sm")
import spellchecker

java_keywords = ["abstract", "continue", "for", "new", "switch", "assert", "default", "goto", "package", "synchronized", "boolean", "do", "if", "private", "this", "break", "double", "implements", "protected", "throw", "byte", "else", "import", "public", "throws", "case", "enum", "instanceof", "return", "transient", "catch", "extends", "int", "short", "try", "char", "final", "interface", "static", "void", "class", "finally", "long", "strictfp", "volatile", "const", "float", "native", "super", "while", "Object", "null", "true", "false"]
c_keywords = ["auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int", "long", "register", "restrict", "return", "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while", "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", "_Generic", "_Imaginary", "_Noreturn", "Static_assert", "Thread_local"]

def contains_in_vocab(word):
    # Split the line into words based on capital letters
    words = re.findall(r"[A-Z]?[a-z]+|'[^']+'|\"[^\"]+\"", word)
    tokens = []
    for word in words:
        # Check if word is enclosed in quotes
        if not (word.startswith("'") and word.endswith("'") or word.startswith('"') and word.endswith('"')):
            # Tokenize the word if it's not enclosed in quotes
            tokens.extend(tokenize.tokenize(BytesIO(word.encode("utf-8")).readline))
    
    for token_type, token_string, _, _, _ in tokens:
        spell = spellchecker.SpellChecker()
        if (spell.correction(token_string) == token_string):
            return True
    return False

def tokenizeLine(line):    
    # Split the line into words based on non-alphanumeric characters
    words = re.findall(r"[A-Za-z]+|'[^']+'|\"[^\"]+\"", line)
    tokens = []
    for word in words:
        if not (word.startswith("'") and word.endswith("'") or word.startswith('"') and word.endswith('"')):
            tokens.extend(tokenize.tokenize(BytesIO(word.encode("utf-8")).readline))
    return tokens

def avg_identifiers_per_line(file_path, lang):
    with open(file_path, "r") as file:
        code = file.read()
    lines = code.split("\n")
    identifiers = []
    identifiers_eng = []

    
    for line in lines:
        count_eng = 0
        count = 0
        
        if lang == "python":
            # Tokenize the line
            tokens = tokenizeLine(line)

            # Count the number of identifiers in the line
            for token_type, token_string, _, _, _ in tokens:
                if token_type == tokenize.NAME and token_string not in keyword.kwlist:
                    count += 1
                    if (contains_in_vocab(token_string)):
                        count_eng += 1


            identifiers.append(count)
            identifiers_eng.append(count_eng)

        elif lang == "java":

            tokens = tokenizeLine(line)
            
            for token_type, token_string, _, _, _ in tokens:
                if token_type == tokenize.NAME and token_string not in java_keywords:
                    count += 1
                    if (contains_in_vocab(token_string)):
                        count_eng += 1
            
            identifiers.append(count)
            identifiers_eng.append(count_eng)
            


        elif lang in ["c", "c++"]:
            
            tokens = tokenizeLine(line)
            
            for token_type, token_string, _, _, _ in tokens:
                if token_type == tokenize.NAME and token_string not in c_keywords:
                    count += 1
                    if (contains_in_vocab(token_string)):
                        count_eng += 1
            
            identifiers.append(count)
            identifiers_eng.append(count_eng)
    
    return sum(identifiers)/len(lines), sum(identifiers_eng)/sum(identifiers)*100, sum(identifiers), sum(identifiers_eng)


result = avg_identifiers_per_line("/Users/adrian/Desktop/cs-projects/ureca-code-readability/DatasetBW/snippets/2.jsnp", "java")
print(result)

(1.4166666666666667, 100.0, 17, 17)
