# This program attempts to predict the mutation score 

In [4]:
from sklearn import datasets
import pandas as pd

## Read DataFrame

In [5]:
# Load the JSON file into a DataFrame and list its column names
df = pd.read_json("../old_output_with_functions.json")
df.columns.tolist()

['file',
 'pattern',
 'check_id',
 'pattern_detailes',
 'function_name',
 'function_scope',
 'mutants',
 'mutation_score']

**Dropping all rows that contain 'NA' values (e.g. a missing mutation_score)**

In [6]:
# Keep only complete rows: a row is removed if any of its columns is missing,
# which includes (but is not limited to) a missing mutation_score.
df = df.dropna(axis=0, how='any')

In [7]:
# Display the DataFrame (the bare last expression is rendered as the cell output)
df

Unnamed: 0,file,pattern,check_id,pattern_detailes,function_name,function_scope,mutants,mutation_score
1,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 21, 'coloffset': 0, 'linematch': 'd...",bubble_sort,21-38,"[{'name': 'Mutant #1', 'line': 24, 'descriptio...",70.588235
2,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 41, 'coloffset': 0, 'linematch': 'd...",insertion_sort,41-60,"[{'name': 'Mutant #18', 'line': 44, 'descripti...",44.444444
3,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 63, 'coloffset': 0, 'linematch': 'd...",merge,63-93,"[{'name': 'Mutant #36', 'line': 67, 'descripti...",62.500000
4,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 96, 'coloffset': 0, 'linematch': 'd...",merge_sort,96-101,"[{'name': 'Mutant #60', 'line': 98, 'descripti...",60.000000
5,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 104, 'coloffset': 0, 'linematch': '...",quick_sort,104-118,"[{'name': 'Mutant #65', 'line': 106, 'descript...",50.000000
...,...,...,...,...,...,...,...,...
186,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 64, 'coloffset': 4, 'linematch': 'a...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000
187,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 65, 'coloffset': 4, 'linematch': 'r...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000
188,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 68, 'coloffset': 4, 'linematch': 'd...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000
189,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 77, 'coloffset': 4, 'linematch': 't...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000


In [21]:
# Mean mutation score for each distinct pattern (result is indexed by pattern)
average_mutation_score = (
    df.groupby('pattern')['mutation_score']
    .mean()
)

In [18]:
# Tabulate the per-pattern averages. The index of average_mutation_score holds
# the pattern strings (the scores were grouped by 'pattern'), so the column is
# labelled 'pattern' rather than 'check_id'.
result_df = pd.DataFrame({'pattern': average_mutation_score.index, 'average_mutation_score': average_mutation_score.values})
print(result_df)

                                             check_id  average_mutation_score
0                                      .//FunctionDef               26.099799
1                             .//FunctionDef/body//If               57.975113
2       .//FunctionDef[not(contains(@name, "test_"))]               26.099799
3                 //Compare/ops/Is | //Compare/ops/Eq               48.333333
4                          //FunctionDef//For[.//For]               70.588235
5                    //FunctionDef//If/descendant::If               50.000000
6   //FunctionDef//If/following-sibling::If | //Fu...               62.500000
7   //FunctionDef[//(If/following-sibling::For | F...               48.471055
8   //FunctionDef[@type='str']/body/* | //Function...               24.411033
9      //FunctionDef[body//comprehension/target/Name]                0.000000
10  //FunctionDef[not(args/arg/annotation) or not(...               26.099799


## Data Preprocessing

* Label Encoding 'pattern'
* Categorize 'mutation_score'

In [8]:
# Fit a LabelEncoder on the 'pattern' column so that every distinct pattern
# string is assigned a label (only 'pattern' is fitted here)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df.loc[:, 'pattern'])

In [9]:
# Encode the whole 'pattern' column with a single vectorized transform instead
# of calling le.transform() once per row inside an iterrows() loop.
# The labels are cast to float so the stored values (0.0, 1.0, ...) are the same
# as those produced by the previous row-by-row .loc assignment.
df['pattern_encoded'] = le.transform(df['pattern']).astype(float)

**Unique patterns and their encoded labels**

In [10]:
# One row per distinct pattern, paired with the encoded label of its first occurrence
unique_patterns = df.groupby('pattern')['pattern_encoded'].first().reset_index()

# Print every pattern together with its label, one pair per line
for pattern, encoded_label in zip(unique_patterns['pattern'], unique_patterns['pattern_encoded']):
  print(f"{pattern}: {encoded_label}")

.//FunctionDef: 0.0
.//FunctionDef/body//If: 1.0
.//FunctionDef[not(contains(@name, "test_"))]: 2.0
//Compare/ops/Is | //Compare/ops/Eq: 3.0
//FunctionDef//For[.//For]: 4.0
//FunctionDef//If/descendant::If: 5.0
//FunctionDef//If/following-sibling::If | //FunctionDef//If/following-sibling::Elif | //FunctionDef//If/following-sibling::Else: 6.0
//FunctionDef[//(If/following-sibling::For | For/following-sibling::If)]: 7.0
//FunctionDef[@type='str']/body/* | //FunctionDef[@type='str']/body/Return: 8.0
//FunctionDef[body//comprehension/target/Name]: 9.0
//FunctionDef[not(args/arg/annotation) or not(returns)]: 10.0


In [11]:
# Categorize mutation scores: scores below low_threshold are "low", scores
# below medium_threshold are "medium", and everything else is "high".
low_threshold = 30
medium_threshold = 60

def map_to_category(score):
    """Return the category label ("low", "medium" or "high") for a mutation score.

    The score is compared with the module-level low_threshold and
    medium_threshold; a score that is below neither of them is "high".
    """
    if score < low_threshold:
        return "low"
    if score < medium_threshold:
        return "medium"
    return "high"

In [12]:
# Add a 'mutation_category' column holding the category label of each mutation score
df = df.assign(
    mutation_category=lambda frame: frame['mutation_score'].apply(map_to_category)
)

**Current Dataset**

In [13]:
# Display the DataFrame, which by this point also carries the added
# 'pattern_encoded' and 'mutation_category' columns
df

Unnamed: 0,file,pattern,check_id,pattern_detailes,function_name,function_scope,mutants,mutation_score,pattern_encoded,mutation_category
1,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 21, 'coloffset': 0, 'linematch': 'd...",bubble_sort,21-38,"[{'name': 'Mutant #1', 'line': 24, 'descriptio...",70.588235,0.0,high
2,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 41, 'coloffset': 0, 'linematch': 'd...",insertion_sort,41-60,"[{'name': 'Mutant #18', 'line': 44, 'descripti...",44.444444,0.0,medium
3,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 63, 'coloffset': 0, 'linematch': 'd...",merge,63-93,"[{'name': 'Mutant #36', 'line': 67, 'descripti...",62.500000,0.0,high
4,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 96, 'coloffset': 0, 'linematch': 'd...",merge_sort,96-101,"[{'name': 'Mutant #60', 'line': 98, 'descripti...",60.000000,0.0,high
5,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,.//FunctionDef,[F001],"{'lineno': 104, 'coloffset': 0, 'linematch': '...",quick_sort,104-118,"[{'name': 'Mutant #65', 'line': 106, 'descript...",50.000000,0.0,medium
...,...,...,...,...,...,...,...,...,...,...
186,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 64, 'coloffset': 4, 'linematch': 'a...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000,8.0,low
187,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 65, 'coloffset': 4, 'linematch': 'r...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000,8.0,low
188,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 68, 'coloffset': 4, 'linematch': 'd...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000,8.0,low
189,/Users/jaclynpham/AstuteSource/SEERS/scripts/a...,//FunctionDef[@type='str']/body/* | //Function...,[CML001],"{'lineno': 77, 'coloffset': 4, 'linematch': 't...",listsorting,45-78,"[{'name': 'Mutant #183', 'line': 46, 'descript...",0.000000,8.0,low


In [94]:
# Number of rows in each mutation score category
category_counts = df.loc[:, 'mutation_category'].value_counts()
print(category_counts)

mutation_category
low       62
high      60
medium    17
Name: count, dtype: int64


## Train Test Split

In [137]:
# Independent variable (the encoded pattern, kept as a one-column DataFrame)
# and dependent variable (the mutation score category)
X = df.loc[:, ['pattern_encoded']]
y = df.loc[:, 'mutation_category']

In [138]:
# Hold out 30% of the rows as the test set; random_state is fixed at 42
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.30,
)

## Training of Model

In [131]:
#importing random forest classifier from the ensemble module
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [143]:
# Random forest classifier with 100 trees and random_state fixed at 42
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)


In [144]:
# Train the classifier on the training split
clf.fit(X=X_train, y=y_train)

In [148]:
# Predict a category for every test sample and print the predicted labels
y_pred = clf.predict(X=X_test)
print(y_pred)

['low' 'high' 'high' 'low' 'medium' 'low' 'low' 'high' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'high' 'low' 'high' 'low' 'low' 'high' 'low'
 'low' 'low' 'low' 'high' 'low' 'high' 'low' 'low' 'low' 'low' 'low'
 'high' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'high']


In [147]:
# List each test sample's actual category next to the predicted one (numbered from 1)
print("All actual vs. predicted categories:")
for sample_number, (actual, predicted) in enumerate(zip(y_test, y_pred), start=1):
  print(f"Sample {sample_number}: Actual - {actual}, Predicted - {predicted}")

All actual vs. predicted categories:
Sample 1: Actual - low, Predicted - low
Sample 2: Actual - high, Predicted - high
Sample 3: Actual - high, Predicted - high
Sample 4: Actual - low, Predicted - low
Sample 5: Actual - medium, Predicted - medium
Sample 6: Actual - low, Predicted - low
Sample 7: Actual - low, Predicted - low
Sample 8: Actual - medium, Predicted - high
Sample 9: Actual - low, Predicted - low
Sample 10: Actual - low, Predicted - low
Sample 11: Actual - low, Predicted - low
Sample 12: Actual - low, Predicted - low
Sample 13: Actual - high, Predicted - low
Sample 14: Actual - low, Predicted - low
Sample 15: Actual - high, Predicted - low
Sample 16: Actual - medium, Predicted - high
Sample 17: Actual - low, Predicted - low
Sample 18: Actual - high, Predicted - high
Sample 19: Actual - low, Predicted - low
Sample 20: Actual - low, Predicted - low
Sample 21: Actual - medium, Predicted - high
Sample 22: Actual - low, Predicted - low
Sample 23: Actual - medium, Predicted - low


### Confusion matrix

How many times does the model predict accurately?

## Evaluating model performance

In [150]:
from sklearn.metrics import accuracy_score

In [151]:
# Fraction of test samples whose predicted category equals the actual one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model accuracy on test set: {accuracy:.4f}")

Model accuracy on test set: 0.5952


**Note:**
At the moment, the accuracy on the test set shows that the model does not predict consistently well across all data points. Some likely reasons:
* Limited Data
* Feature Selection
* Model Complexity

