In [1]:
# Section 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
import pickle
import os

In [2]:
# Section 2: Load Extracted Features Dataset
# Read the pre-computed feature table (one row per document).
df = pd.read_csv('extracted_features.csv')

# Quick inspection: first rows followed by the summary statistics.
print(df.head(), df.describe(), sep='\n')

# Build the label vector and the feature matrix.
# 'label' is the target and 'filename' is dropped alongside it, so neither
# ends up in the feature matrix.
y = df['label'].values
X = df.drop(['label', 'filename'], axis=1).values

                                filename   label  bow_abandon  bow_aber  \
0  preprocessed_source-document00086.txt  source            0         0   
1  preprocessed_source-document00087.txt  source           21         0   
2  preprocessed_source-document00088.txt  source            2         0   
3  preprocessed_source-document00089.txt  source            2         0   
4  preprocessed_source-document00090.txt  source            0         0   

   bow_abide  bow_ability  bow_able  bow_abner  bow_abode  bow_abound  ...  \
0          0            0         0          0          0           0  ...   
1          6            1        11          0          2           3  ...   
2          0            0         3          0          0           2  ...   
3          1            2         9          0          0           0  ...   
4          0            1         1          0          0           0  ...   

   bert_374  bert_375  bert_376  bert_377  bert_378  bert_379  bert_380  \
0  0.

In [44]:
# Section 3: Initialize Random Forest Model
class RandomForestPlagiarismDetector:
    """Random Forest classifier paired with the feature scaler it was trained with.

    The scaler is fitted in ``train`` and re-applied by ``predict`` and
    ``predict_proba``, so callers always pass raw (unscaled) features.

    Attributes:
        model: the underlying ``RandomForestClassifier``.
        scaler: the ``StandardScaler`` applied to features before the model sees them.
    """

    def __init__(self, n_estimators=50, max_depth=10, min_samples_split=5,
                 min_samples_leaf=2, random_state=42):
        """Create an unfitted scaler and forest.

        Every argument is forwarded unchanged to ``RandomForestClassifier``.
        The defaults reproduce the configuration that used to be hard-coded
        here, so ``RandomForestPlagiarismDetector()`` behaves as before.

        Args:
            n_estimators: forwarded as ``n_estimators``.
            max_depth: forwarded as ``max_depth``.
            min_samples_split: forwarded as ``min_samples_split``.
            min_samples_leaf: forwarded as ``min_samples_leaf``.
            random_state: forwarded as ``random_state``.
        """
        self.model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )
        self.scaler = StandardScaler()

    def train(self, X_train, y_train):
        """Fit the scaler on ``X_train``, then fit the forest on the scaled features.

        Args:
            X_train: training feature matrix (raw, unscaled).
            y_train: training labels, one per row of ``X_train``.
        """
        # The scaling statistics are learned here, from the training data only;
        # predict() and predict_proba() reuse them through transform().
        X_train_scaled = self.scaler.fit_transform(X_train)

        # Train the Random Forest model
        self.model.fit(X_train_scaled, y_train)

    def predict(self, X):
        """Return the model's predicted labels for ``X`` after scaling it.

        ``train`` must have been called first so that the scaler and the
        model are fitted.
        """
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """Return the model's class-probability estimates for ``X`` after scaling it.

        ``train`` must have been called first so that the scaler and the
        model are fitted.
        """
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

In [45]:
# Section 4: Train the Model
# Stratified 80/20 hold-out split; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fitting uses the training split only; the validation split is reserved for scoring.
detector = RandomForestPlagiarismDetector()
detector.train(X_train, y_train)

# Score the held-out validation split.
val_preds = detector.predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=val_preds)
print(f"Validation Accuracy: {val_acc:.4f}")

Validation Accuracy: 0.6500


In [46]:
# Section 5: Evaluate Model Performance
# Predict on the validation split once and derive every metric from it.
preds = detector.predict(X_val)

# Per-class precision / recall / F1 table.
print(classification_report(y_true=y_val, y_pred=preds))

# Confusion matrix of true versus predicted labels.
cm = confusion_matrix(y_true=y_val, y_pred=preds)
print('Confusion Matrix:\n', cm)

# Weighted-average F1, kept as a single number for comparison.
f1_rf = f1_score(y_true=y_val, y_pred=preds, average='weighted')
print(f"Random Forest F1-score: {f1_rf:.4f}")

              precision    recall  f1-score   support

      source       0.65      0.65      0.65        20
  suspicious       0.65      0.65      0.65        20

    accuracy                           0.65        40
   macro avg       0.65      0.65      0.65        40
weighted avg       0.65      0.65      0.65        40

Confusion Matrix:
 [[13  7]
 [ 7 13]]
Random Forest F1-score: 0.6500
