# Decision Tree Classifier for a plagiarism detector

## Building the data dataframe

In [128]:
# Load modules and packages
import pandas as pd
import numpy as np

df = pd.read_csv("../data/train_scores.csv")

print(f"Plagiarized = {len(df[df['plagiarized'] == 1])}")
print(f"Non-plagiarized = {len(df[df['plagiarized'] == 0])}")
df.head()

Plagiarized = 685
Non-plagiarized = 331315


Unnamed: 0,src,sus,3-tok-lem-ngram,3-tok-lower-stop-alpha-ngram,3-tok-ngram,plagiarized
0,source-document00094.txt,suspicious-document00019.txt,0.0,0.0,0.0,0
1,source-document00029.txt,suspicious-document00019.txt,0.003139,0.0,0.002788,0
2,source-document00095.txt,suspicious-document00019.txt,0.003399,0.0,0.003399,0
3,source-document00081.txt,suspicious-document00019.txt,0.00154,0.0,0.001368,0
4,source-document00005.txt,suspicious-document00019.txt,0.001892,0.0,0.001734,0


In [2]:
from bokeh.plotting import *

# Tell bokeh where to output
output_notebook()

In [129]:
from sklearn.model_selection import train_test_split

# Balance dataframe by undersampling, we assume naively that they should be in equal
n = len(df[df['plagiarized'] == 1])
data = df[df['plagiarized'] == 1].append(df[df['plagiarized'] == 0].sample(n = n))
data.reset_index(inplace = True, drop = True)

# Select data to model
#features = ["jaccard", "containment", "dep"]
X = data.drop(["src", "sus", "plagiarized"], axis=1, errors="ignore")
y = data["plagiarized"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [121]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import numpy as np

def eval_model_table(X, y, clf):
    # Make folds of data for cross validation
    k_fold = StratifiedKFold(n_splits = 5, shuffle = True)

    results = []
    for train, test in k_fold.split(X, y):
        clf.fit(X.iloc[train], y.iloc[train])
        sc = score(y.iloc[test], clf.predict(X.iloc[test]), average = None, labels = [0, 1])
        results.append(np.stack(sc, axis = 0))

    avg_results = sum(results) / len(results)
    std_results = [np.abs(x - avg_results)**2 for x in results]
    std_results = np.sqrt(sum(std_results) / (len(std_results) * np.sqrt(len(std_results))))
    
    # Calculation of scores
    scores = pd.DataFrame(np.concatenate((avg_results, std_results), axis = 1))
    scores.columns = ["Avg. non-plagiarized", "Avg. plagiarized", "Std. non-plagiarized", "Std. plagiarized"]
    scores.insert(0, "Score type", ["precision", "recall", "F1", "support"])
    scores.set_index("Score type", inplace = True)
    
    return(scores)

# Init classifier, maybe SV could be useful?

pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', svm.SVC())])
grid = GridSearchCV(pipe, 
                    param_grid = {"clf__gamma" : np.linspace(0.1, 1, 100), "clf__C" : np.linspace(1, 200, 100), "clf__kernel": ["rbf"]},
                    cv = 5,
                    refit = True,
                    scoring = "f1")
grid.fit(X, y)
eval_model_table(X, y, grid.best_estimator_)
#pd.DataFrame(clf.cv_results)

Unnamed: 0_level_0,Avg. non-plagiarized,Avg. plagiarized,Std. non-plagiarized,Std. plagiarized
Score type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
precision,0.78042,0.900651,0.016964,0.018432
recall,0.916788,0.740146,0.01842,0.028058
F1,0.842466,0.811329,0.007676,0.013418
support,137.0,137.0,0.0,0.0


In [None]:
pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier())])

param_grid = {"clf__max_depth" : list(range(2, 20, 1)), 
              "clf__n_estimators" : list(range(50, 250, 5)), 
              "clf__min_samples_leaf": list(range(2, 20, 1)),
              "clf__min_samples_split": list(range(2, 20, 1))}

grid = GridSearchCV(pipe, 
                    param_grid = param_grid,
                    cv = 5,
                    refit = True,
                    scoring = "f1")
grid.fit(X, y)
eval_model_table(X, y, grid.best_estimator_)

In [None]:
import pickle
# We improved a bit, lets save it and work with this one
with open("../models/ngram_tok.model", "wb") as f:
    pickle.dump(clf, f)

In [None]:
clf.predict(X)

In [None]:
clf.predict_proba(X)