In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Mount Google Drive so the dataset folder under /content/gdrive is reachable.
# NOTE(review): this import only resolves inside Google Colab; the recorded
# output of this cell shows it raising ModuleNotFoundError elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Root folder of the plagiarism dataset on the mounted Drive.
data_dir = '/content/gdrive/My Drive/datasets/plag_data/'
# Sub-folder that holds the individual answer/source .txt files.
plag_files_dir = os.path.join(data_dir, 'data')
# CSV index of the files (columns: File, Task, Category).
plag_file_summary = os.path.join(data_dir, 'data', 'file_information.csv')

In [3]:
# Positional row index of the sample whose processed text is printed later on.
sample_text_row = 2
# Value passed as `max_leaf_nodes` to the decision tree classifier.
dt_max_leaf_nodes = 2

In [None]:
# Work from the dataset root so the relative paths used by the shell commands
# in this notebook (`wget`, `unzip -o data`, `ls data`) resolve inside it.
os.chdir(data_dir)

In [None]:
def download_extract_plag_dataset():
  """Download the plagiarism dataset archive and unpack it.

  Uses IPython shell escapes (`!wget`, `!unzip`), so it only runs inside a
  notebook/IPython session. Both commands use relative paths, so the archive
  and the extracted files land in the current working directory.
  """
  print("Data not downloaded yet. Download process started...")
  # Fetch data.zip into the current working directory.
  !wget https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip
  print("Downoading completed successfully...")
  print("Extraction process started...")
  # `-o` overwrites already-extracted files without prompting.
  !unzip -o data
  print("Extraction process completed...")

In [None]:
# Download only when the dataset root is completely empty.
# NOTE(review): any existing entry in data_dir (not necessarily the dataset
# itself) makes this skip the download.
if not os.listdir(data_dir):
    download_extract_plag_dataset()
else:
    print("Data already downloaded. Skipping this step...")


Data already downloaded. Skipping this step...


In [None]:
!ls data

file_information.csv  g0pE_taska.txt  g2pA_taskb.txt  g3pA_taskc.txt  g4pC_taskd.txt
g0pA_taska.txt	      g0pE_taskb.txt  g2pA_taskc.txt  g3pA_taskd.txt  g4pC_taske.txt
g0pA_taskb.txt	      g0pE_taskc.txt  g2pA_taskd.txt  g3pA_taske.txt  g4pD_taska.txt
g0pA_taskc.txt	      g0pE_taskd.txt  g2pA_taske.txt  g3pB_taska.txt  g4pD_taskb.txt
g0pA_taskd.txt	      g0pE_taske.txt  g2pB_taska.txt  g3pB_taskb.txt  g4pD_taskc.txt
g0pA_taske.txt	      g1pA_taska.txt  g2pB_taskb.txt  g3pB_taskc.txt  g4pD_taskd.txt
g0pB_taska.txt	      g1pA_taskb.txt  g2pB_taskc.txt  g3pB_taskd.txt  g4pD_taske.txt
g0pB_taskb.txt	      g1pA_taskc.txt  g2pB_taskd.txt  g3pB_taske.txt  g4pE_taska.txt
g0pB_taskc.txt	      g1pA_taskd.txt  g2pB_taske.txt  g3pC_taska.txt  g4pE_taskb.txt
g0pB_taskd.txt	      g1pA_taske.txt  g2pC_taska.txt  g3pC_taskb.txt  g4pE_taskc.txt
g0pB_taske.txt	      g1pB_taska.txt  g2pC_taskb.txt  g3pC_taskc.txt  g4pE_taskd.txt
g0pC_taska.txt	      g1pB_taskb.txt  g2pC_taskc.txt  g3pC_taskd.txt  g4pE_t

In [None]:
plag_file_summary_df = pd.read_csv(plag_file_summary)
plag_file_summary_df.head(10)

Unnamed: 0,File,Task,Category
0,g0pA_taska.txt,a,non
1,g0pA_taskb.txt,b,cut
2,g0pA_taskc.txt,c,light
3,g0pA_taskd.txt,d,heavy
4,g0pA_taske.txt,e,non
5,g0pB_taska.txt,a,non
6,g0pB_taskb.txt,b,non
7,g0pB_taskc.txt,c,cut
8,g0pB_taskd.txt,d,light
9,g0pB_taske.txt,e,heavy


In [None]:
plag_file_summary_df.shape

(100, 3)

In [None]:
print('Number of files: ', plag_file_summary_df.shape[0])  # .shape[0] gives the rows
# .unique() gives unique items in a specified column
print('Number of unique tasks/question types (A-E): ', (len(plag_file_summary_df['Task'].unique())))
print('Unique plagiarism categories: ', (plag_file_summary_df['Category'].unique()))

Number of files:  100
Number of unique tasks/question types (A-E):  5
Unique plagiarism categories:  ['non' 'cut' 'light' 'heavy' 'orig']


### Data Preprocessing

#### First, we convert the categorical data to numerical data (task, category)

#### Below are the numerical labels for each category

1.   0 - non
2.   1 - heavy
3.   2 - light
4.   3 - cut
5.   -1 - orig, which indicates an original (source) file

In [None]:
def plag_numerical_dataframe(plag_categorical_df):
    """Return a copy of the file summary with numeric 'Category' and 'Class' columns.

    Category codes: 'non' -> 0, 'heavy' -> 1, 'light' -> 2, 'cut' -> 3, 'orig' -> -1.
    'Class' collapses every plagiarized category (code > 0) to 1 and keeps
    0 ('non') and -1 ('orig') unchanged.

    Args:
        plag_categorical_df: DataFrame with a 'Category' column holding the
            labels above. Values that are already integer codes are accepted
            and passed through, so the function can be re-applied safely.

    Returns:
        A new DataFrame with an integer 'Category' column and an added 'Class'
        column. The input frame is not modified.

    Raises:
        ValueError: if 'Category' contains a value that is neither a known
            label nor a known code.
    """
    category_mapping = {'non': 0, 'heavy': 1, 'light': 2, 'cut': 3, 'orig': -1}
    accepted_values = set(category_mapping) | set(category_mapping.values())

    # Fail early with a clear message instead of letting an unmapped label
    # reach the numeric comparison below.
    unknown = sorted({str(value) for value in plag_categorical_df['Category']
                      if value not in accepted_values})
    if unknown:
        raise ValueError(f"Unknown plagiarism categories: {unknown}")

    # Work on a copy so the caller's (possibly already displayed) frame stays intact.
    numeric_df = plag_categorical_df.copy()

    # Explicit element-wise mapping avoids the implicit dtype downcasting that
    # `Series.replace` performs when swapping strings for integers.
    numeric_df['Category'] = (
        numeric_df['Category']
        .map(lambda value: category_mapping.get(value, value))
        .astype(int)
    )

    # Codes are limited to {-1, 0, 1, 2, 3}, so capping at 1 yields the binary class.
    numeric_df['Class'] = numeric_df['Category'].clip(upper=1)

    return numeric_df

In [None]:
plag_numerical_df = plag_numerical_dataframe(plag_file_summary_df)
plag_numerical_df.head(10)

Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0
5,g0pB_taska.txt,a,0,0
6,g0pB_taskb.txt,b,0,0
7,g0pB_taskc.txt,c,3,1
8,g0pB_taskd.txt,d,2,1
9,g0pB_taske.txt,e,1,1


#### NLP Text Processing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer and stop words set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def process_text(file_content):
    """Normalize raw file text into a space-separated string of lemmas.

    Steps: lowercase, drop every character that is neither a word character
    nor whitespace, tokenize, discard English stop words, lemmatize the rest.
    Relies on the module-level `lemmatizer` and `stop_words` objects.
    """
    # Punctuation is deleted outright (not replaced by a space).
    cleaned = re.sub(r"[^\w\s]", "", file_content.lower())

    kept_lemmas = []
    for word in word_tokenize(cleaned):
        if word in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(kept_lemmas).strip()

#### Create the text column from the data files and create the final dataframe

In [None]:
def add_text_to_df(df, file_dir):
    """Add a 'Text' column holding the processed content of each listed file.

    Args:
        df: DataFrame with a 'File' column of file names. It is modified in
            place (the 'Text' column is added) and also returned.
        file_dir: Directory that contains the files named in 'File'.

    Returns:
        The same DataFrame object, now with a 'Text' column.

    Warns:
        UserWarning: if one or more files are missing. Their 'Text' is set to
            the placeholder "File not found", which would otherwise flow
            unnoticed into feature extraction.
    """
    import warnings  # local import keeps this block self-contained

    missing_files = []

    def read_and_process_file(filename):
        file_path = os.path.join(file_dir, filename)
        # Only the read itself is guarded, so errors raised while processing
        # the text are never mistaken for a missing file.
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                file_content = file.read()
        except FileNotFoundError:
            missing_files.append(filename)
            return "File not found"
        return process_text(file_content)

    df['Text'] = df['File'].apply(read_and_process_file)

    if missing_files:
        unique_missing = sorted(set(missing_files))
        warnings.warn(
            f"{len(unique_missing)} file(s) not found in {file_dir!r}; "
            f"their 'Text' is the placeholder 'File not found': {unique_missing}",
            stacklevel=2,
        )

    return df

In [None]:
plag_final_df = add_text_to_df(plag_numerical_df, plag_files_dir)
plag_final_df.head()

Unnamed: 0,File,Task,Category,Class,Text
0,g0pA_taska.txt,a,0,0,inheritance basic concept objectoriented progr...
1,g0pA_taskb.txt,b,3,1,pagerank link analysis algorithm used google i...
2,g0pA_taskc.txt,c,2,1,vector space model also called term vector mod...
3,g0pA_taskd.txt,d,1,1,bayes theorem name rev thomas bayes method use...
4,g0pA_taske.txt,e,0,0,dynamic programming algorithm design technique...


In [None]:
print('Sample processed text:\n\n', plag_numerical_df.iloc[sample_text_row]['Text'])

Sample processed text:

 vector space model also called term vector model algebraic model used represent text document well object general vector identifier used information retrieval first used smart information retrieval system document represented vector dimension corresponds separate term term appears document value vector nonzero many different way calculating value also known term weight developed one best known method called tfidf weighting definition term depends application generally term single word keywords longer phrase word chosen term dimensionality vector number word vocabulary number distinct word occurring corpus vector space model several disadvantage firstly long document represented badly poor similarity value secondly search keywords must accurately match document term substring word might result falsepositive match thirdly document similar context different term vocabulary associated resulting falsenegative match finally order term appear document lost vector spac

### Splitting the data into training and testing sets

In [None]:
def assign_data_usage(df, condition, train_ratio, seed):
    condition_indices = np.where(condition)[0]
    train_size = int(len(condition_indices) * train_ratio)

    data_usage_labels = np.array(['test'] * len(condition_indices), dtype='<U5')

    train_indices = np.random.RandomState(seed).choice(condition_indices, train_size, replace=False)

    position_map = {index: pos for pos, index in enumerate(condition_indices)}
    mapped_train_indices = [position_map[idx] for idx in train_indices]

    data_usage_labels[mapped_train_indices] = 'train'

    return data_usage_labels

In [None]:
def train_test_dataframe(clean_df, random_seed=100):
    """Return a copy of `clean_df` with a 'data_usage' column.

    Rows with Category > 0 (plagiarized) and rows with Category == 0
    (non-plagiarized) are each split 80/20 into 'train'/'test' using
    `random_seed`; all remaining rows keep the default label 'orig'.
    """
    labelled_df = clean_df.copy()
    labelled_df['data_usage'] = 'orig'  # default for original source texts

    plagiarized_mask = labelled_df['Category'] > 0
    non_plagiarized_mask = labelled_df['Category'] == 0

    # Each group is split on its own so both keep the same train/test ratio.
    for group_mask in (plagiarized_mask, non_plagiarized_mask):
        labelled_df.loc[group_mask, 'data_usage'] = assign_data_usage(
            labelled_df, group_mask, 0.8, random_seed
        )

    return labelled_df

In [None]:
plag_completed_df = train_test_dataframe(plag_final_df)
plag_completed_df.head(10)

Unnamed: 0,File,Task,Category,Class,Text,data_usage
0,g0pA_taska.txt,a,0,0,inheritance basic concept objectoriented progr...,train
1,g0pA_taskb.txt,b,3,1,pagerank link analysis algorithm used google i...,train
2,g0pA_taskc.txt,c,2,1,vector space model also called term vector mod...,train
3,g0pA_taskd.txt,d,1,1,bayes theorem name rev thomas bayes method use...,test
4,g0pA_taske.txt,e,0,0,dynamic programming algorithm design technique...,train
5,g0pB_taska.txt,a,0,0,inheritance basic concept object oriented prog...,train
6,g0pB_taskb.txt,b,0,0,pagerank pr refers concept google system used ...,test
7,g0pB_taskc.txt,c,3,1,vector space model algebraic model representin...,test
8,g0pB_taskd.txt,d,2,1,bayes theorem relates conditional marginal pro...,train
9,g0pB_taske.txt,e,1,1,dynamic programming method solving mathematica...,train


#### Feature Engineering

##### Calculation of containment features

In [None]:
def containment_calculation(df, n, answer_filename):
    """Compute the n-gram containment of an answer in its source text.

    Containment is the number of n-grams shared by answer and source (counted
    with multiplicity, taking the smaller count of each n-gram) divided by the
    number of n-grams in the answer.

    Args:
        df: DataFrame with 'File' and 'Text' columns.
        n: Size of the word n-grams.
        answer_filename: Name of the answer file. The source file name is
            derived by replacing its first four characters with 'orig'
            (e.g. 'g0pA_taska.txt' -> 'orig_taska.txt').

    Returns:
        The containment value, or 0.0 when the answer yields no n-grams of
        size `n` (which previously produced NaN through a 0/0 division).
    """
    # Extract answer and source texts
    answer_text = df.loc[df["File"] == answer_filename, 'Text'].values[0]
    orig_filename = 'orig' + answer_filename[4:]
    source_text = df.loc[df["File"] == orig_filename, 'Text'].values[0]

    # Row 0 holds the answer's n-gram counts, row 1 the source's.
    counts = CountVectorizer(analyzer='word', ngram_range=(n, n))
    ngrams = counts.fit_transform([answer_text, source_text]).toarray()

    answer_ngram_total = ngrams[0].sum()
    if answer_ngram_total == 0:
        # Answer shorter than n tokens: nothing can be contained.
        return 0.0

    shared_ngram_total = np.minimum(ngrams[0], ngrams[1]).sum()
    return shared_ngram_total / answer_ngram_total

##### Longest Common Subsequence

In [None]:
def lcs_norm_word(answer_text, source_text):
    """Return the word-level longest common subsequence length, normalized.

    Both texts are split on whitespace; the LCS length is divided by the
    number of words in the answer. An empty answer yields 0.0.
    """
    answer_words = answer_text.split()
    source_words = source_text.split()

    if not answer_words:
        return 0.0

    # Only the previous DP row is needed to build the next one.
    previous_row = [0] * (len(source_words) + 1)
    for answer_word in answer_words:
        current_row = [0]
        for col, source_word in enumerate(source_words, start=1):
            if answer_word == source_word:
                current_row.append(previous_row[col - 1] + 1)
            else:
                current_row.append(max(previous_row[col], current_row[col - 1]))
        previous_row = current_row

    return previous_row[-1] / len(answer_words)

##### Create containment features

In [None]:
def containment_features_creation(df, n, column_name=None):
    """Compute the n-gram containment feature for every row of `df`.

    Rows with Category > -1 get the value from `containment_calculation`;
    all other rows (the original source texts) get the sentinel -1.
    `column_name` defaults to 'c_<n>' and does not affect the returned values.

    Returns:
        A list with one containment value per row, in row order.
    """
    if column_name is None:
        column_name = f'c_{n}'

    containment_values = [
        containment_calculation(df, n, record['File']) if record['Category'] > -1 else -1
        for _, record in df.iterrows()
    ]

    print(f'{n}-gram containment features created!')
    return containment_values


##### LCS Features Creation

In [None]:
def lcs_features_creation(df, column_name='lcs_word'):
    """Compute the normalized word-level LCS feature for every row of `df`.

    Each row with Category > -1 is compared against the first row of the same
    Task whose Class is -1 (the original text); other rows get the sentinel -1.

    Returns:
        A list with one LCS value per row, in row order.
    """
    def _source_text_for(task):
        # First original (Class == -1) row belonging to the given task.
        originals = df[(df['Class'] == -1) & (df['Task'] == task)]
        return originals.iloc[0]['Text']

    def _lcs_for(record):
        if record['Category'] > -1:
            return lcs_norm_word(record['Text'], _source_text_for(record['Task']))
        return -1

    lcs_values = [_lcs_for(record) for _, record in df.iterrows()]

    print('LCS features created!')
    return lcs_values


In [None]:
# n-gram sizes to compute containment for: 1 through 16 inclusive.
ngram_range = range(1, 17)

# Column names, in the same order as the value lists in `all_features`.
features_list = []

# One list of per-row feature values per feature (feature-major layout).
all_features = []

# Containment feature for every n-gram size in the range.
for n in ngram_range:
    column_name = f'c_{n}'
    features_list.append(column_name)
    containment_values = containment_features_creation(plag_completed_df, n, column_name)
    all_features.append(containment_values)

# Normalized word-level LCS feature, appended as the last column.
lcs_values = lcs_features_creation(plag_completed_df)
features_list.append('lcs_word')
all_features.append(lcs_values)

# Transpose so each row is a file and each column a feature.
feature_ext_df = pd.DataFrame(np.transpose(all_features), columns=features_list)

# Print all features/columns
print('\n Created Features:', features_list, '\n')


1-gram containment features created!
2-gram containment features created!
3-gram containment features created!
4-gram containment features created!
5-gram containment features created!
6-gram containment features created!
7-gram containment features created!
8-gram containment features created!
9-gram containment features created!
10-gram containment features created!
11-gram containment features created!
12-gram containment features created!
13-gram containment features created!
14-gram containment features created!
15-gram containment features created!
16-gram containment features created!
LCS features created!

 Created Features: ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'c_8', 'c_9', 'c_10', 'c_11', 'c_12', 'c_13', 'c_14', 'c_15', 'c_16', 'lcs_word'] 



In [None]:
feature_ext_df.head(10)

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,lcs_word
0,0.241935,0.03252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137097
1,1.0,0.973684,0.938053,0.901786,0.864865,0.827273,0.788991,0.75,0.71028,0.669811,0.628571,0.586538,0.543689,0.5,0.455446,0.41,0.82906
2,0.837209,0.632812,0.472441,0.373016,0.304,0.258065,0.219512,0.188525,0.157025,0.133333,0.117647,0.101695,0.08547,0.077586,0.069565,0.061404,0.821705
3,0.495495,0.227273,0.110092,0.064815,0.046729,0.028302,0.009524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324324
4,0.447368,0.097345,0.008929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254237
5,0.206452,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083871
6,0.418605,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224806
7,0.784091,0.708571,0.649425,0.589595,0.52907,0.473684,0.429412,0.402367,0.375,0.353293,0.331325,0.309091,0.286585,0.269939,0.253086,0.236025,0.625
8,0.669811,0.419048,0.278846,0.184466,0.117647,0.059406,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.456897
9,0.825758,0.465649,0.284615,0.20155,0.140625,0.102362,0.071429,0.048,0.032258,0.01626,0.008197,0.0,0.0,0.0,0.0,0.0,0.6


##### Feature Correlation

In [None]:
# Pairwise correlation between all features, as absolute values rounded to
# two decimals (only the strength of the relationship is kept, not its sign).
features_correlation_matrix = feature_ext_df.corr().abs().round(2)
display(features_correlation_matrix)

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,lcs_word
c_1,1.0,0.96,0.93,0.91,0.89,0.88,0.88,0.87,0.87,0.87,0.86,0.86,0.86,0.86,0.86,0.86,0.98
c_2,0.96,1.0,0.99,0.98,0.96,0.95,0.94,0.93,0.92,0.91,0.9,0.89,0.89,0.88,0.88,0.87,0.98
c_3,0.93,0.99,1.0,1.0,0.99,0.98,0.97,0.96,0.95,0.94,0.93,0.93,0.92,0.91,0.9,0.9,0.96
c_4,0.91,0.98,1.0,1.0,1.0,0.99,0.99,0.98,0.97,0.96,0.96,0.95,0.94,0.93,0.93,0.92,0.94
c_5,0.89,0.96,0.99,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.97,0.97,0.96,0.95,0.95,0.94,0.92
c_6,0.88,0.95,0.98,0.99,1.0,1.0,1.0,1.0,0.99,0.99,0.98,0.98,0.97,0.96,0.96,0.95,0.91
c_7,0.88,0.94,0.97,0.99,0.99,1.0,1.0,1.0,1.0,0.99,0.99,0.98,0.98,0.97,0.97,0.96,0.9
c_8,0.87,0.93,0.96,0.98,0.99,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.98,0.97,0.9
c_9,0.87,0.92,0.95,0.97,0.99,0.99,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.98,0.89
c_10,0.87,0.91,0.94,0.96,0.98,0.99,0.99,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,0.88


#### Train and test data creation

In [None]:
def generate_train_test_data(complete_df, features_df, good_features):
    """Split the selected features and class labels into train and test arrays.

    Args:
        complete_df: DataFrame with 'Class' and 'data_usage' columns, one row
            per file.
        features_df: DataFrame of feature columns with rows in the same order
            as `complete_df`.
        good_features: List of feature column names to keep.

    Returns:
        ((x_train, y_train), (x_test, y_test)) as NumPy arrays. Rows whose
        'data_usage' is neither 'train' nor 'test' are left out.

    Raises:
        ValueError: if the two frames do not have the same number of rows.
    """
    if len(complete_df) != len(features_df):
        raise ValueError(
            f"complete_df has {len(complete_df)} rows but features_df has "
            f"{len(features_df)}; they must describe the same files."
        )

    # Use the arguments (not notebook globals) and align rows by position, so
    # a non-default index on either frame cannot misalign features and labels.
    df = pd.concat(
        [complete_df.reset_index(drop=True), features_df.reset_index(drop=True)],
        axis=1,
    )

    is_train = df['data_usage'] == 'train'
    is_test = df['data_usage'] == 'test'

    # Get the training features and labels
    x_train = df.loc[is_train, good_features].values
    y_train = df.loc[is_train, 'Class'].values

    # Get the test features and labels
    test_x = df.loc[is_test, good_features].values
    test_y = df.loc[is_test, 'Class'].values

    return (x_train, y_train), (test_x, test_y)


In [None]:
features_correlation_matrix

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,lcs_word
c_1,1.0,0.96,0.93,0.91,0.89,0.88,0.88,0.87,0.87,0.87,0.86,0.86,0.86,0.86,0.86,0.86,0.98
c_2,0.96,1.0,0.99,0.98,0.96,0.95,0.94,0.93,0.92,0.91,0.9,0.89,0.89,0.88,0.88,0.87,0.98
c_3,0.93,0.99,1.0,1.0,0.99,0.98,0.97,0.96,0.95,0.94,0.93,0.93,0.92,0.91,0.9,0.9,0.96
c_4,0.91,0.98,1.0,1.0,1.0,0.99,0.99,0.98,0.97,0.96,0.96,0.95,0.94,0.93,0.93,0.92,0.94
c_5,0.89,0.96,0.99,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.97,0.97,0.96,0.95,0.95,0.94,0.92
c_6,0.88,0.95,0.98,0.99,1.0,1.0,1.0,1.0,0.99,0.99,0.98,0.98,0.97,0.96,0.96,0.95,0.91
c_7,0.88,0.94,0.97,0.99,0.99,1.0,1.0,1.0,1.0,0.99,0.99,0.98,0.98,0.97,0.97,0.96,0.9
c_8,0.87,0.93,0.96,0.98,0.99,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.98,0.97,0.9
c_9,0.87,0.92,0.95,0.97,0.99,0.99,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.98,0.89
c_10,0.87,0.91,0.94,0.96,0.98,0.99,0.99,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,0.88


In [None]:
# Feature subset used for modelling: unigram containment, 16-gram containment
# and the normalized word-level LCS.
good_features = ['c_1', 'c_16', 'lcs_word']

(x_train, y_train), (x_test, y_test) = generate_train_test_data(plag_completed_df, feature_ext_df, good_features)

print('Training size: ', len(x_train))
print('Testing size: ', len(x_test))
print()
# Show the first ten training feature vectors, one per line.
print('\n'.join([str(sample) for sample in x_train[:10]]))

Training size:  75
Testing size:  20

[0.24193548 0.         0.13709677]
[1.         0.41       0.82905983]
[0.8372093  0.06140351 0.82170543]
[0.44736842 0.         0.25423729]
[0.20645161 0.         0.08387097]
[0.66981132 0.         0.45689655]
[0.82575758 0.         0.6       ]
[0.52427184 0.         0.42307692]
[0.35051546 0.         0.21649485]
[0.36046512 0.         0.1744186 ]


#### Model Creation

##### 1. Decision Tree

In [None]:
# Fixed random_state makes tie-breaking between equally good splits
# reproducible across runs; 100 matches the seed used for the train/test split.
dt_model = DecisionTreeClassifier(max_leaf_nodes=dt_max_leaf_nodes, random_state=100)

##### 2. Support Vector Machine (SVM)

In [None]:
# Support vector classifier with scikit-learn's default hyperparameters
# (no argument is overridden here).
svm_model = SVC()

##### 3. Random Forest

In [None]:
# 50 trees; the fixed random_state makes bootstrap sampling and feature
# selection reproducible, so reported accuracies do not change between runs.
rf_model = RandomForestClassifier(n_estimators=50, random_state=100)

##### Model Evaluation Engine

In [None]:
def evaluate_model(model, model_name, x_train, y_train, x_test, y_test):
    """Fit `model`, print its test-set metrics and return the accuracy.

    Prints, in order: the model name, the accuracy as a percentage, the
    classification report and the confusion matrix.

    Returns:
        The accuracy on the test set, as returned by `accuracy_score`.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    print(model_name)

    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy Score: {round(accuracy*100, 2)}%')

    print("Classification Report:")
    print(classification_report(y_test, predictions))

    matrix = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:", matrix, sep="\n")

    return accuracy

##### Evaluate individual models

In [None]:
from prettytable import PrettyTable
def display_model_results(model_accuracies):
    """Print a two-column table of model names and accuracy percentages.

    Args:
        model_accuracies: Iterable of (model name, accuracy) pairs; each
            accuracy is multiplied by 100 and rounded to two decimals.
    """
    summary = PrettyTable()
    summary.field_names = ["Model Name", "Accuracy"]

    for entry in model_accuracies:
        model_label, score = entry
        summary.add_row([model_label, f"{round(score * 100, 2)}%"])

    print(summary)

# (model name, accuracy) pairs collected in evaluation order.
model_accuracies = []
# Display name -> estimator instance for every model under comparison.
models = {
    'Decision Tree': dt_model,
    'Support Vector Machine': svm_model,
    'Random Forest': rf_model
}

# Train and score each model on the same train/test split.
for model_name, model in models.items():
    accuracy = evaluate_model(model, model_name, x_train, y_train, x_test, y_test)
    model_accuracies.append((model_name, accuracy))

# Side-by-side accuracy summary.
display_model_results(model_accuracies)

Decision Tree
Accuracy Score: 95.0%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.92      1.00      0.96        12

    accuracy                           0.95        20
   macro avg       0.96      0.94      0.95        20
weighted avg       0.95      0.95      0.95        20

Confusion Matrix:
[[ 7  1]
 [ 0 12]]
Support Vector Machine
Accuracy Score: 90.0%
Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       1.00      0.83      0.91        12

    accuracy                           0.90        20
   macro avg       0.90      0.92      0.90        20
weighted avg       0.92      0.90      0.90        20

Confusion Matrix:
[[ 8  0]
 [ 2 10]]
Random Forest
Accuracy Score: 95.0%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.