In [1]:
'''
    [ Sourcecode Handwriting Extraction Preprocessor ]
        Remove Strings ("[REMOVED]") and Comments (//[REMOVED], /* [REMOVED] */)
        Made by Chu7zpah
'''
import re

string_exist = re.compile(r'"(.*?)"')                       # a double-quoted string literal ("...") on one line
comment_space = re.compile(r'//\s\S+.*')                    # '// text'  (one whitespace char after '//')
comment_NO_space = re.compile(r'//\S+.*')                   # '//text'
comment_multiline_in_one = re.compile(r'/\*\s\S+.*\s\*/')   # '/* text */' opened and closed on one line
comment_multiline_start = re.compile(r'/\*')
comment_multiline_end = re.compile(r'\*/')


def blank_string_literals(lines):
    """Return a copy of `lines` in which every "..." literal is replaced by "".

    Exactly one output line is produced per input line, no matter how many
    literals the line contains.
    """
    return [string_exist.sub('""', line) for line in lines]


def strip_comment_text(lines):
    """Return `lines` with comment text removed but the comment markers kept.

    * '// text'   -> '// t'   (marker, the space, and the first character of the text)
    * '//text'    -> '//t'    (marker and the first character of the text)
    * '/* ... */' on one line -> '/* */'
    * a comment spanning several lines keeps its opening line and its closing
      line; the lines in between are dropped.

    The first character after '//' is kept so that a later pass can still tell
    '//x' and '// x' apart.
    """
    cleaned = []
    inside_block_comment = False

    for line in lines:
        if inside_block_comment:
            if comment_multiline_end.search(line) is None:
                continue                    # body of a multi-line comment: drop it
            inside_block_comment = False    # closing line: handled like a normal line below

        found = comment_space.search(line)
        if found is not None:
            cleaned.append(line[:found.start()] + '// ' + found.group()[3] + line[found.end():])
            continue

        found = comment_NO_space.search(line)
        if found is not None:
            cleaned.append(line[:found.start()] + '//' + found.group()[2] + line[found.end():])
            continue

        found = comment_multiline_in_one.search(line)
        if found is not None:
            cleaned.append(line[:found.start()] + '/* */' + line[found.end():])
            continue

        opener = comment_multiline_start.search(line)
        if opener is None:
            cleaned.append(line)
            continue

        # Only look for '*/' after the '/*' so that '/*/' is not taken as a closed comment.
        closer = comment_multiline_end.search(line, opener.end())
        if closer is not None:
            # Comment such as '/*text*/' that closes on the same line: it must not
            # switch on multi-line mode, otherwise the following code lines are lost.
            cleaned.append(line[:opener.start()] + '/* */' + line[closer.end():])
        else:
            cleaned.append(line)
            inside_block_comment = True

    return cleaned


# In a notebook kernel __name__ is '__main__', so this part runs whenever the cell
# is executed. The guard only keeps the two helpers importable when ./test.c is absent.
if __name__ == '__main__':
    with open('./test.c', 'r') as f:    # 'with' closes the file even if reading fails
        raw_line_list = f.readlines()

    no_string_line_list = blank_string_literals(raw_line_list)
    cleaned_line_list = strip_comment_text(no_string_line_list)

In [2]:
'''
    [ Sourcecode Handwriting Characteristics Extractor ]
        From Source Code(C Based) -> CSV
        Made by Chu7zpah
      
      <Features>
        Feature 1. Brace('{') Habits
            case 1: 'FUNCTION(){'       (inline brace with NO Space)
            case 2: 'FUNCTION() {'      (inline brace with Space)
            case 3: 'FUNCTION()         (next line brace)
                     {'          

        Feature 2. Comment('//' or '/* */') Habits
            case 1: '//COMMENT'         ('//' with NO Space)
            case 2: '// COMMENT'        ('//' with Space)
            case 3: '/* COMMENT */'     (using '/* */')

        Feature 3. Parenthesis('(') Habits
            case 1: '{if/switch/while/for}('    (Reserved words with No Space)
            case 2: '{if/switch/while/for} ('   (Reserved words with Space)

        Feature 4. Unary Operator(Type Casting) Habits
            case 1: '(TYPE)VARIABLE'        (Type Casting with NO Space)
            case 2: '(TYPE) VARIABLE'       (Type Casting with Space)
        
        Feature 5. Binary Operator(Arithmetic, Assignments, etc.) Habits
            ex) '+' Operator
            case 1: 'VARIABLE+VARIABLE'     (NO Space && Binary Operator && NO Space)
            case 2: 'VARIABLE+ VARIABLE'    (NO Space && Binary Operator && Space)
            case 3: 'VARIABLE +VARIABLE'    (Space && Binary Operator && NO Space)
            case 4: 'VARIABLE + VARIABLE'   (Space && Binary operator && Space)
'''
# Column names, grouped by the habit they describe.
# (The 'double_backslash_*' names refer to the '//' comment marker.)
features_list = [
    # Feature 1. Brace
    'inline_NO_brace', 'inline_YES_brace', 'nextline_brace',
    # Feature 2. Comment
    'double_backslash_NO_Space', 'double_backslash_YES_Space', 'single_backslash_asterisk',
    # Feature 3. Parenthesis
    'NO_parenthesis', 'YES_parenthesis',
    # Feature 4. Unary Operator
    'unary_NO_space', 'unary_YES_space',
    # Feature 5. Binary Operator
    'NO_binary_NO', 'NO_binary_YES', 'YES_binary_NO', 'YES_binary_YES',
]

print("[Feature Columns]")
print(features_list)


# <Extracted Feature Data Dictionary>
# One '<feature>_count' counter per column, all starting at zero.
features_data_dictionary = {f'{name}_count': 0 for name in features_list}
print("\n[initialized Feature data dictionary]")
print(features_data_dictionary)


# <Feature RE Expressions>
# Every pattern is a raw string so that regex escapes ('\s', '\{', '\*', ...) are
# passed to `re` untouched instead of being parsed as (invalid) Python escapes.
inline_NO_brace = re.compile(r'[\)|\S]\{')       # 'Function(){'  OR  'struct{'
inline_YES_brace = re.compile(r'[\)|\S]\s+\{')   # 'Function() {' OR  'struct {'
nextline_brace = re.compile(r'\s*\{')            # 'ENTER' + '{'  (any other '{')

double_backslash_NO_Space = re.compile(r'//\S')      # '//Comment'
double_backslash_YES_Space = re.compile(r'//\s+\S')  # '// Comment'
single_backslash_asterisk = re.compile(r'/\*')       # '/* Comment */'

# '\b' keeps identifiers that merely end in a keyword (e.g. 'notif(') from being counted.
NO_parenthesis = re.compile(r'\b(?:if|switch|for|while)\(')        # '{if/switch/while/for}('
YES_parenthesis = re.compile(r'\b(?:if|switch|for|while)\s+\(')    # '{if/switch/while/for} ('


# Type list for Unary Operator(Type Casting)
type_list = ['char', 'short', 'int', 'long', 'long long',
            'unsigned char', 'unsigned short', 'unsigned int', 'unsigned long',
            'float', 'double', 'long double']

# Operator list for Binary Operators.
# Each entry is a regex fragment, so characters that are special in a regex
# ('+', '*', '|', '^') are escaped. An unescaped '^' would act as a
# start-of-string anchor and the '^=' pattern could never match.
operator_list = [   r'\+', r'-', r'\*', r'/', r'%',    # Arithmetic
                    r'=', r'\+=', r'-=', r'\*=', r'/=', r'%=',
                    r'&=', r'\^=', r'\|=', r'<<=', r'>>=', r'>>>=', # Assignment
                    r'==', r'!=', r'>', r'<', r'>=', r'<=',       # Comparison
                    r'&&', r'\|\|',    # Logical
                    r'>>', r'<<', r'>>>', # Shift
                    r';', r',', r'",']  # ';' in for loop, ',' in function call


# <Analyzing Source Code>
# Features 1-3 are priority-ordered groups of (compiled pattern, counter key):
# for every line, only the first pattern of a group that matches is counted.
brace_habits = (
    (inline_NO_brace, 'inline_NO_brace_count'),
    (inline_YES_brace, 'inline_YES_brace_count'),
    (nextline_brace, 'nextline_brace_count'),
)
comment_habits = (
    (double_backslash_NO_Space, 'double_backslash_NO_Space_count'),
    (double_backslash_YES_Space, 'double_backslash_YES_Space_count'),
    (single_backslash_asterisk, 'single_backslash_asterisk_count'),
)
parenthesis_habits = (
    (NO_parenthesis, 'NO_parenthesis_count'),
    (YES_parenthesis, 'YES_parenthesis_count'),
)

for line in cleaned_line_list:
    # Preprocessing (Exclude Strings): findall() yields nothing for a line
    # without a literal, so no separate existence check is needed.
    for string in string_exist.findall(line):
        line = line.replace(f'"{string}"', '""')

    # Feature Extraction
    #   (Feature 1 - Brace, Feature 2 - Comment, Feature 3 - Parenthesis)
    for habit_group in (brace_habits, comment_habits, parenthesis_habits):
        for pattern, counter_key in habit_group:
            if pattern.search(line):
                features_data_dictionary[counter_key] += 1
                break

    #   (Feature 4 - Unary Operator): each type in type_list is checked separately,
    #   so one line can be counted once per type that is cast on it.
    for data_type in type_list:
        unary_NO_space = re.compile(rf'\({data_type}\)\S')
        unary_YES_space = re.compile(rf'\({data_type}\)\s+\S')

        if unary_NO_space.search(line):
            features_data_dictionary['unary_NO_space_count'] += 1
        elif unary_YES_space.search(line):
            features_data_dictionary['unary_YES_space_count'] += 1

    #   (Feature 5 - Binary Operator): an operand character on each side, with or
    #   without whitespace between operand and operator. Counted once per operator
    #   per line; the prints trace which line triggered which spacing class.
    for operator in operator_list:
        NO_binary_NO = re.compile(rf'(\w|\)|\]){operator}(\w|\(|\[)')
        NO_binary_YES = re.compile(rf'(\w|\)|\]){operator}\s+(\w|\(|\[)')
        YES_binary_NO = re.compile(rf'(\w|\)|\])\s+{operator}(\w|\(|\[)')
        YES_binary_YES = re.compile(rf'(\w|\)|\])\s+{operator}\s+(\w|\(|\[)')

        if NO_binary_NO.search(line):
            features_data_dictionary['NO_binary_NO_count'] += 1
            print("NONO", line)
        elif NO_binary_YES.search(line):
            features_data_dictionary['NO_binary_YES_count'] += 1
            print("NOYES", line, "OPERATOR: ", operator)
        elif YES_binary_NO.search(line):
            features_data_dictionary['YES_binary_NO_count'] += 1
            print("YESNO", line)
        elif YES_binary_YES.search(line):
            features_data_dictionary['YES_binary_YES_count'] += 1
            print("YESYES", line)

def _sum_counts(*feature_names):
    """Add up the '<name>_count' entries of features_data_dictionary for the given names."""
    return sum(features_data_dictionary[f'{name}_count'] for name in feature_names)


# Number of counted occurrences per habit group (all cases of a feature together).
brace_total = _sum_counts('inline_NO_brace', 'inline_YES_brace', 'nextline_brace')
comment_total = _sum_counts('double_backslash_NO_Space', 'double_backslash_YES_Space',
                            'single_backslash_asterisk')
parenthesis_total = _sum_counts('NO_parenthesis', 'YES_parenthesis')
unary_total = _sum_counts('unary_NO_space', 'unary_YES_space')
binary_total = _sum_counts('NO_binary_NO', 'NO_binary_YES', 'YES_binary_NO', 'YES_binary_YES')

print("\n[Calculated Feature data dictionary]")
print(features_data_dictionary)

print("\n[Total Values]")
for label, total in (
    ("Brace:", brace_total),
    ("comment:", comment_total),
    ("parenthesis:", parenthesis_total),
    ("unary:", unary_total),
    ("binary:", binary_total),
):
    print(label, total)

[Feature Columns]
['inline_NO_brace', 'inline_YES_brace', 'nextline_brace', 'double_backslash_NO_Space', 'double_backslash_YES_Space', 'single_backslash_asterisk', 'NO_parenthesis', 'YES_parenthesis', 'unary_NO_space', 'unary_YES_space', 'NO_binary_NO', 'NO_binary_YES', 'YES_binary_NO', 'YES_binary_YES']

[initialized Feature data dictionary]
{'inline_NO_brace_count': 0, 'inline_YES_brace_count': 0, 'nextline_brace_count': 0, 'double_backslash_NO_Space_count': 0, 'double_backslash_YES_Space_count': 0, 'single_backslash_asterisk_count': 0, 'NO_parenthesis_count': 0, 'YES_parenthesis_count': 0, 'unary_NO_space_count': 0, 'unary_YES_space_count': 0, 'NO_binary_NO_count': 0, 'NO_binary_YES_count': 0, 'YES_binary_NO_count': 0, 'YES_binary_YES_count': 0}
YESNO #include <stdio.h>

NONO     int a=0; // 2

YESYES     if(a == 0) {    // 2

NOYES         t= (float) a;  //1
 OPERATOR:  =
YESYES         t = 0;  //3

NOYES     for (int i= 0 ; i < 5 ; i++)
 OPERATOR:  =
YESYES     for (int i= 0 ; i <

In [6]:
'''
    Making a CSV File
'''
import csv

# Header row: every feature name plus the label column, each wrapped in literal
# double quotes.
double_quote_feature_list = [f'"{feature}"' for feature in features_list]
double_quote_feature_list.append(r'"Programmer"')
print(double_quote_feature_list)

# newline='' is what the csv module expects for files it writes to; without it the
# writer's '\r\n' row terminator is translated again on Windows and every row is
# followed by a blank line. 'with' closes the file even if writing fails.
with open('./optest.csv', 'w', encoding='utf-8', newline='') as csv_file:
    # quotechar = ' so that the literal " characters in the fields are written
    # as-is (with the default quotechar, " becomes """).
    wr = csv.writer(csv_file, quotechar="'")
    wr.writerow(double_quote_feature_list)

['"inline_NO_brace"', '"inline_YES_brace"', '"nextline_brace"', '"double_backslash_NO_Space"', '"double_backslash_YES_Space"', '"single_backslash_asterisk"', '"NO_parenthesis"', '"YES_parenthesis"', '"unary_NO_space"', '"unary_YES_space"', '"NO_binary_NO"', '"NO_binary_YES"', '"YES_binary_NO"', '"YES_binary_YES"', '"Programmer"']


In [7]:
import os

def search(dirname):
    """Recursively print the path of every file below `dirname` whose extension is '.c'.

    Sub-directories are descended into depth-first in the order os.listdir()
    reports them. A directory that cannot be listed because of a
    PermissionError is skipped silently. Nothing is returned.
    """
    try:
        entries = os.listdir(dirname)
    except PermissionError:
        return

    for entry in entries:
        candidate = os.path.join(dirname, entry)
        if os.path.isdir(candidate):
            search(candidate)
            continue

        _, extension = os.path.splitext(candidate)
        if extension == '.c':
            print(candidate)

# List every .c file below the dataset root.
# NOTE(review): absolute, machine-specific path; os.listdir() raises
# FileNotFoundError if this directory does not exist on the current machine.
search('C:/Users/Nyx_24/Desktop/Dataset')

C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\ec.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\error.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\eval.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\exec-operand.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\exec.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\libc.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\ns.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\object.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\opregion.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\os_methods.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\pc-bios.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\pci.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\pm.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\resource.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\sci.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\timer.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\variable.c
C:/Users/Nyx_24/Desktop/Dataset\A\(18)lai\vsnprintf.c
C:/Users/Nyx_24/Desktop/Dataset\A\(52)qword\acpi.c
C:/Users/Nyx_24/Desktop/Dataset\A\(52

In [15]:
import os

# One sub-directory of the dataset root per programmer label.
programmer_list = ['A', 'B', 'C']

for programmer in programmer_list:
    # The middle element of each os.walk() triple (the sub-directory names) is not
    # needed. It is deliberately not called `dir`: at cell level that would rebind
    # the builtin dir() for every cell executed afterwards.
    for path, _subdirs, files in os.walk(f'C:/Users/Nyx_24/Desktop/Dataset/{programmer}'):
        for filename in files:
            if os.path.splitext(filename)[-1] != '.c':
                continue
            # os.walk() reports Windows paths with backslashes; normalise to '/'.
            full_directory = f'{path}/{filename}'.replace('\\', '/')
            print(full_directory)

C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/ec.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/error.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/eval.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/exec-operand.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/exec.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/libc.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/ns.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/object.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/opregion.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/os_methods.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/pc-bios.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/pci.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/pm.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/resource.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/sci.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/timer.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/variable.c
C:/Users/Nyx_24/Desktop/Dataset/A/(18)lai/vsnprintf.c
C:/Users/Nyx_24/Desktop/Dataset/A/(52)qword/acpi.c
C:/Users/Nyx_24/Desktop/Dataset/A/(52