# This program attempts to predict the mutation score 

In [87]:
from sklearn import datasets
import pandas as pd

## Read and Process Dataframe

Combining data from the following analysis: 
* lazytracker.json
* 101_intersection.json
* 101_palindrome.json
* 101_sorting.json

Drop NA values
Drop "test_" functions

In [88]:
def process_json_file(filepath):
    sub_df = pd.read_json(filepath)
    sub_df = sub_df.dropna()
    sub_df = sub_df[~sub_df['function_name'].str.startswith("test_")]
    return sub_df

In [89]:
import os

In [90]:
dataframes = []
folder_path = "../json_analysis"

for filename in os.listdir(folder_path):
    if filename.endswith("json"):
        filepath = os.path.join(folder_path, filename)
        sub_df = process_json_file(filepath)
        sub_df.insert(0,"filename",filename)
        dataframes.append(sub_df)

df = pd.concat(dataframes, ignore_index=True)

In [91]:
df

Unnamed: 0,filename,function_name,function_scope,patterns,mutants,mutation_score
0,101_sorting.json,bubble_sort,21-38,"[{'lineno': 21, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #167', 'line': 21, 'descript...",0.0
1,101_sorting.json,insertion_sort,41-60,"[{'lineno': 41, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #182', 'line': 44, 'descript...",0.0
2,101_sorting.json,merge,63-93,"[{'lineno': 63, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #191', 'line': 64, 'descript...",0.0
3,101_sorting.json,generate_random_number,17-22,"[{'lineno': 17, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #166', 'line': 18, 'descript...",0.0
4,101_sorting.json,generate_random_container,25-33,"[{'lineno': 25, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #170', 'line': 25, 'descript...",0.0
5,101_sorting.json,run_sorting_algorithm,36-52,"[{'lineno': 36, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #178', 'line': 36, 'descript...",0.0
6,101_sorting.json,run_sorting_algorithm_experiment_campaign,55-81,"[{'lineno': 55, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #187', 'line': 57, 'descript...",0.0
7,101_sorting.json,listsorting,45-78,"[{'lineno': 45, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #183', 'line': 46, 'descript...",0.0
8,101_intersection.json,human_readable_boolean,30-36,"[{'lineno': 30, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #11', 'line': 34, 'descripti...",0.0
9,101_intersection.json,generate_random_container,39-47,"[{'lineno': 39, 'coloffset': 0, 'linematch': '...","[{'name': 'Mutant #13', 'line': 40, 'descripti...",33.333333


**Dropping all mutation_score with 'NA' values**

### Parsing data: check_ids and patterns

In [92]:
def extract_pattern_id(df):
    df['check_ids'] = df['patterns'].apply(lambda x: [pattern['check_id'] for pattern in x if 'check_id' in pattern])
    df['check_ids'] = df['check_ids'].apply(pd.unique)
    df['unique_patterns'] = df['patterns'].apply(lambda x: [pattern['pattern'] for pattern in x])
    df['unique_patterns'] = df['unique_patterns'].apply(pd.unique)
    return df

In [93]:
df = extract_pattern_id(df.copy())

  df['check_ids'] = df['check_ids'].apply(pd.unique)
  df['unique_patterns'] = df['unique_patterns'].apply(pd.unique)


### Collect all patterns from the config files

In [94]:
import os
import ruamel.yaml

In [95]:

def get_patterns_from_yaml(filepath):
  """
  Extracts patterns from the 'pattern' key under the 'checks' key in a YAML file.

  Args:
      filepath: Path to the YAML file.

  Returns:
      A list of patterns found in the YAML file, or None if parsing fails.
  """
  try:
    with open(filepath) as f:
      yaml = ruamel.yaml.YAML(typ='safe', pure=True)  # Use new API for safe loading
      data = yaml.load(f)
      checks = data.get("checks")  # Get the 'checks' dictionary
      if checks:
        patterns = [check.get("pattern") for check in checks]  # Extract patterns from checks
        return patterns if any(patterns) else None  # Return only if patterns exist
      else:
        print(f"Warning: 'checks' key not found in YAML file: {filepath}")
        return None
  except ruamel.yaml.YAMLError as e:
    print(f"Error parsing YAML file '{filepath}': {e}")
    return None

In [96]:
def collect_config_patterns(config_dir):
  """
  Collects unique patterns from all YAML files in a directory.

  Args:
      config_dir: The directory containing the YAML files.

  Returns:
      A list of unique patterns found in the YAML files.
  """
  all_patterns = []

  # Check if config directory exists
  if not os.path.isdir(config_dir):
    raise ValueError(f"Directory '{config_dir}' does not exist.")
  for filename in os.listdir(config_dir):
    if filename.endswith(".yml"):
      filepath = os.path.join(config_dir, filename)
      patterns = get_patterns_from_yaml(filepath)
      if patterns:
        all_patterns.extend(patterns) #Extend list with patterns from each file
  #Remove duplicates using set
  config_patterns = list(set(all_patterns))

  return config_patterns

In [97]:
config_dir = "../Config"  # Replace with your actual directory path
config_patterns = collect_config_patterns(config_dir)

if config_patterns:
  print("Unique patterns found:",config_patterns)
else:
  print("No patterns found in the YAML files.")

Unique patterns found: ['.//FunctionDef/body//If', '//ImportFrom/keyword[@name="from"]', '//FunctionDef//if//For', '//FunctionDef//If/following-sibling::If | //FunctionDef//If/following-sibling::Elif | //FunctionDef//If/following-sibling::Else', "//FunctionDef[@type='str' and starts-with(@name, 'test_')]/body/*", '//FunctionDef/body/For[target/Name/@id = iter/Name/@id]', '//Compare/left[@id="input_dirs"]/following-sibling::Compare/ops/NotEq | //Compare/comparators[@id="input_dirs"]/following-sibling::Compare/ops/NotEq', '//FunctionDef[body//comprehension/target/Name]', './/ClassDef', '//FunctionDef[not(args/arg/annotation) or not(returns)]', "//function[@name='your_function_name']/*[your_xpath_condition]", './/FunctionDef[not(contains(@name, "test_"))]', '//FunctionDef//If/descendant::If', '//FunctionDef[//(If/following-sibling::For | For/following-sibling::If)]', './/FunctionDef', './/FunctionDef/body//If[ancestor::If and not(parent::orelse)]', '//FunctionDef[@name="tp"]/body/Assign/v

### Count pattern boolean values

In [98]:
#config pattern dictionary
predefined_patterns = {pattern:0 for pattern in config_patterns}

def update_pattern_counts(pattern_list):
    function_pattern = predefined_patterns.copy()
    for pattern in pattern_list:
        if pattern in function_pattern:
            function_pattern[pattern] += 1
    return function_pattern


In [99]:
df = df.assign(pattern_existence = df['unique_patterns'].apply(update_pattern_counts))

In [100]:
# catagorize mutation score 
low_threshold = 33
medium_threshold = 66

def map_to_category(score):
    """Maps a mutation score to a category (high, medium,low)"""
    if score < low_threshold:
        return "low"
    elif score < medium_threshold:
        return "medium"
    else:
        return "high"

In [101]:
df = df.assign(mutation_category = df['mutation_score'].apply(map_to_category))

**Current Dataset**

In [102]:
#Counts of mutation score categories
category_counts = df['mutation_category'].value_counts()
print(category_counts)

mutation_category
high      13
low       10
medium     4
Name: count, dtype: int64


In [103]:
df.to_csv("analysis.csv", index=False)

### Encoding Function Pattern list

In [104]:
def dict_to_list(predefined_patterns):
    pattern_list = [predefined_patterns[pattern] for pattern in predefined_patterns]
    return pattern_list

In [105]:
# Create a new column with pattern existence lists
df['pattern_bool'] = df['pattern_existence'].apply(dict_to_list)

In [106]:
df.to_csv("analysis.csv", index=False)

#### Create a new DataFrame with the pattern list as column names

In [107]:
def create_pattern_df(df):
    df_new = pd.DataFrame(columns=config_patterns+['mutation_category']+['filename'])
    for i in range(len(df)):
        mutation = df['mutation_category'].iloc[i]
        filename = df['filename'].iloc[i]
        pattern_dict = df['pattern_existence'].iloc[i]
        # Extract pattern values 
        pattern_value_list = list(pattern_dict.values())
        #add the pattern_value_list to df_new
        df_new.loc[i] = pattern_value_list + [mutation] + [filename]
    return df_new

In [108]:
pattern_df = create_pattern_df(df.copy())  # Avoid modifying original DataFrame

In [109]:
pattern_df

Unnamed: 0,.//FunctionDef/body//If,"//ImportFrom/keyword[@name=""from""]",//FunctionDef//if//For,//FunctionDef//If/following-sibling::If | //FunctionDef//If/following-sibling::Elif | //FunctionDef//If/following-sibling::Else,"//FunctionDef[@type='str' and starts-with(@name, 'test_')]/body/*",//FunctionDef/body/For[target/Name/@id = iter/Name/@id],"//Compare/left[@id=""input_dirs""]/following-sibling::Compare/ops/NotEq | //Compare/comparators[@id=""input_dirs""]/following-sibling::Compare/ops/NotEq",//FunctionDef[body//comprehension/target/Name],.//ClassDef,//FunctionDef[not(args/arg/annotation) or not(returns)],...,.//Try/ExceptHandler[not(ExceptHandler/type)],//FunctionDef//For[.//For],"//FunctionDef//Call/func/Attribute[@attr=""keys""]","//FunctionDef[@name=""init""]/body/Return",//FunctionDef//FunctionDef/ancestor::*,//FunctionDef[@type='str']/body/Assert,//FunctionDef[@type='str']/body/* | //FunctionDef[@type='str']/body/Return,//Compare/ops/Is | //Compare/ops/Eq,mutation_category,filename
0,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,low,101_sorting.json
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
2,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
4,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
8,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_intersection.json
9,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,medium,101_intersection.json


In [110]:
pattern_df.to_csv("pattern_df.csv", index=False)

## Train Test Split

In [135]:
from sklearn.model_selection import train_test_split

In [136]:
X_train, X_test, y_train, y_test = train_test_split(pattern_df.iloc[:, :-1], pattern_df['filename','mutation_category'], test_size=0.2, random_state=42)

KeyError: ('filename', 'mutation_category')

## Training of Model with RandomForestClassifier

IMPORTANT: delete the 'filename' column in pattern_df

In [137]:
#importing random forest classifier from assemble module
from sklearn.ensemble import RandomForestClassifier

In [138]:
#creating a RF classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth = 5)


In [139]:
clf.fit(X_train,y_train)

In [140]:
y_pred = clf.predict(X_test)
print(y_pred)

['high' 'low' 'low' 'low' 'high' 'low' 'low' 'low' 'medium']


In [141]:
print("All actual vs. predicted categories:")
for i in range(len(y_test)):
  print(f"Sample {i+1}: Actual - {y_test.iloc[i]}, Predicted - {y_pred[i]}")

All actual vs. predicted categories:
Sample 1: Actual - high, Predicted - high
Sample 2: Actual - high, Predicted - low
Sample 3: Actual - high, Predicted - low
Sample 4: Actual - high, Predicted - low
Sample 5: Actual - high, Predicted - high
Sample 6: Actual - high, Predicted - low
Sample 7: Actual - medium, Predicted - low
Sample 8: Actual - high, Predicted - low
Sample 9: Actual - high, Predicted - medium


### Leave-One-Project-Out Cross-Validationn(LOPO CV)

Using K-Fold Cross Validation to iterate K times, randomly select one project to hid (validation set) then combine the remaining projects into a single training set

In [160]:
pattern_df

Unnamed: 0,.//FunctionDef/body//If,"//ImportFrom/keyword[@name=""from""]",//FunctionDef//if//For,//FunctionDef//If/following-sibling::If | //FunctionDef//If/following-sibling::Elif | //FunctionDef//If/following-sibling::Else,"//FunctionDef[@type='str' and starts-with(@name, 'test_')]/body/*",//FunctionDef/body/For[target/Name/@id = iter/Name/@id],"//Compare/left[@id=""input_dirs""]/following-sibling::Compare/ops/NotEq | //Compare/comparators[@id=""input_dirs""]/following-sibling::Compare/ops/NotEq",//FunctionDef[body//comprehension/target/Name],.//ClassDef,//FunctionDef[not(args/arg/annotation) or not(returns)],...,.//Try/ExceptHandler[not(ExceptHandler/type)],//FunctionDef//For[.//For],"//FunctionDef//Call/func/Attribute[@attr=""keys""]","//FunctionDef[@name=""init""]/body/Return",//FunctionDef//FunctionDef/ancestor::*,//FunctionDef[@type='str']/body/Assert,//FunctionDef[@type='str']/body/* | //FunctionDef[@type='str']/body/Return,//Compare/ops/Is | //Compare/ops/Eq,mutation_category,filename
0,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,low,101_sorting.json
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
2,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
4,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_sorting.json
8,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,low,101_intersection.json
9,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,medium,101_intersection.json


In [None]:
from sklearn.model_selection import train_test_split

In [146]:
from sklearn.metrics import accuracy_score, f1_score

In [158]:
from sklearn.ensemble import RandomForestClassifier

In [159]:
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth = 5)
project_wise_accuracy = []
project_wise_f1 = []

# Split test train data
for project_name in set(pattern_df['filename']):
    # Filter data for training (excluding current project)
    X_train = pattern_df[pattern_df['filename'] != project_name].drop(['filename','mutation_category'], axis=1)
    y_train = pattern_df[pattern_df['filename'] != project_name]['mutation_category']
    # Filter data for testing (current project)
    X_test = pattern_df[pattern_df['filename'] == project_name].drop(['filename','mutation_category'], axis=1)
    y_test = pattern_df[pattern_df['filename'] == project_name]['mutation_category']
    f
    # fit model
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    
    # Print project name
    print(f"Project Targeted: {project_name}")
    
    # Print and compare result
    print("All actual vs. predicted categories:")
    for i in range(len(y_test)):
        print(f"Sample {i+1}: Actual - {y_test.iloc[i]}, Predicted - {y_pred[i]}")
        
    # Calculate accuracy and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    project_wise_accuracy.append(accuracy)
    project_wise_f1.append(f1)
    print(f"Accuracy: {accuracy}")
    print(f"F1-Score: {f1}")
    print()

# Average Performance
if project_wise_accuracy:
  print(f"Overall Average Accuracy: {sum(project_wise_accuracy) / len(project_wise_accuracy)}")
  print(f"Overall Average F1-Score: {sum(project_wise_f1) / len(project_wise_f1)}")

Project Targeted: 101_intersection.json
All actual vs. predicted categories:
Sample 1: Actual - low, Predicted - high
Sample 2: Actual - medium, Predicted - high
Sample 3: Actual - high, Predicted - high
Sample 4: Actual - high, Predicted - high
Sample 5: Actual - high, Predicted - high
Sample 6: Actual - high, Predicted - high
Sample 7: Actual - low, Predicted - high
Accuracy: 0.5714285714285714
F1-Score: 0.4155844155844156

Project Targeted: 101_palindrome.json
All actual vs. predicted categories:
Sample 1: Actual - high, Predicted - high
Sample 2: Actual - medium, Predicted - high
Sample 3: Actual - medium, Predicted - high
Accuracy: 0.3333333333333333
F1-Score: 0.16666666666666666

Project Targeted: 101_sorting.json
All actual vs. predicted categories:
Sample 1: Actual - low, Predicted - high
Sample 2: Actual - low, Predicted - high
Sample 3: Actual - low, Predicted - high
Sample 4: Actual - low, Predicted - high
Sample 5: Actual - low, Predicted - high
Sample 6: Actual - low, Pred

#### Training on different algorithms

* Random Forest Classifier 
* Gradient Boosting Classifiers
* K-Nearest Neighbors (KNN) 
* Support Vector Machine (SVM)

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth = 5)
project_wise_accuracy = {}
project_wise_f1 = {}

algorithms = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=5),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(),
}

# Split test train data
for project_name in set(pattern_df['filename']):
    # Filter data for training (excluding current project)
    X_train = pattern_df[pattern_df['filename'] != project_name].drop(['filename','mutation_category'], axis=1)
    y_train = pattern_df[pattern_df['filename'] != project_name]['mutation_category']
    # Filter data for testing (current project)
    X_test = pattern_df[pattern_df['filename'] == project_name].drop(['filename','mutation_category'], axis=1)
    y_test = pattern_df[pattern_df['filename'] == project_name]['mutation_category']
    f
    # fit model
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    
    # Print project name
    print(f"Project Targeted: {project_name}")
    
    # Print and compare result
    print("All actual vs. predicted categories:")
    for i in range(len(y_test)):
        print(f"Sample {i+1}: Actual - {y_test.iloc[i]}, Predicted - {y_pred[i]}")
        
    # Calculate accuracy and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    project_wise_accuracy.append(accuracy)
    project_wise_f1.append(f1)
    print(f"Accuracy: {accuracy}")
    print(f"F1-Score: {f1}")
    print()

# Average Performance
if project_wise_accuracy:
  print(f"Overall Average Accuracy: {sum(project_wise_accuracy) / len(project_wise_accuracy)}")
  print(f"Overall Average F1-Score: {sum(project_wise_f1) / len(project_wise_f1)}")