In [126]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [127]:
data = pd.read_excel("./code_plagiarism.xls")
data

Unnamed: 0,File Name 1,File Name 2,Per_Functions,Per_Loops,Per_Conditionals,Per_Arithmetic_Operations,Per_int_Declarations,Per_float_Declarations,Per_char_Declarations,Jaccard_Metric,Euclidean_Distance,label
0,000.c,001.c,0.518519,0.166667,0.250000,0.000000,0.166667,0,0.018182,0.818182,0.096436,0
1,000.c,002.c,0.928571,0.333333,0.500000,0.000000,0.250000,0,0.250000,0.818182,0.224555,0
2,000.c,003.c,0.875000,1.000000,1.000000,0.000000,0.500000,0,0.000000,0.777778,0.590838,0
3,000.c,004.c,0.785714,0.000000,0.500000,0.000000,0.000000,0,0.000000,0.555556,0.397426,0
4,000.c,005.c,0.714286,0.333333,0.500000,0.000000,0.200000,0,0.142857,0.666667,0.207626,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2998,074.c,076.c,0.262500,0.333333,0.225806,0.107143,0.333333,0,0.224138,1.000000,0.285272,0
2999,074.c,077.c,0.412500,0.222222,0.548387,0.285714,0.227273,0,0.149425,1.000000,0.268711,0
3000,075.c,076.c,0.552632,0.166667,0.500000,0.000000,0.909091,0,0.155172,0.916667,0.263410,0
3001,075.c,077.c,0.868421,0.111111,0.823529,0.000000,0.083333,0,0.103448,0.916667,0.258249,0


In [128]:
non_zero_cells_per_column = data.astype(bool).sum(axis=0)
percentages_non_zero_cells_per_column = (non_zero_cells_per_column * 100) / (data.shape[0])
percentages_non_zero_cells_per_column

File Name 1                  100.000000
File Name 2                  100.000000
Per_Functions                100.000000
Per_Loops                     73.626374
Per_Conditionals              87.512488
Per_Arithmetic_Operations     40.792541
Per_int_Declarations          67.132867
Per_float_Declarations         0.000000
Per_char_Declarations         53.146853
Jaccard_Metric               100.000000
Euclidean_Distance           100.000000
label                          0.865801
dtype: float64

In [129]:
# Obtaining the columns that have more than 90% of null values
columns_to_drop = percentages_non_zero_cells_per_column < 10
columns_to_drop = columns_to_drop[columns_to_drop]
columns_to_drop = columns_to_drop.index
columns_to_drop = columns_to_drop.drop("label")
columns_to_drop

Index(['Per_float_Declarations'], dtype='object')

In [130]:
clean_data = data.drop(columns_to_drop, axis = 1)
clean_data

Unnamed: 0,File Name 1,File Name 2,Per_Functions,Per_Loops,Per_Conditionals,Per_Arithmetic_Operations,Per_int_Declarations,Per_char_Declarations,Jaccard_Metric,Euclidean_Distance,label
0,000.c,001.c,0.518519,0.166667,0.250000,0.000000,0.166667,0.018182,0.818182,0.096436,0
1,000.c,002.c,0.928571,0.333333,0.500000,0.000000,0.250000,0.250000,0.818182,0.224555,0
2,000.c,003.c,0.875000,1.000000,1.000000,0.000000,0.500000,0.000000,0.777778,0.590838,0
3,000.c,004.c,0.785714,0.000000,0.500000,0.000000,0.000000,0.000000,0.555556,0.397426,0
4,000.c,005.c,0.714286,0.333333,0.500000,0.000000,0.200000,0.142857,0.666667,0.207626,0
...,...,...,...,...,...,...,...,...,...,...,...
2998,074.c,076.c,0.262500,0.333333,0.225806,0.107143,0.333333,0.224138,1.000000,0.285272,0
2999,074.c,077.c,0.412500,0.222222,0.548387,0.285714,0.227273,0.149425,1.000000,0.268711,0
3000,075.c,076.c,0.552632,0.166667,0.500000,0.000000,0.909091,0.155172,0.916667,0.263410,0
3001,075.c,077.c,0.868421,0.111111,0.823529,0.000000,0.083333,0.103448,0.916667,0.258249,0


In [131]:
X = clean_data.drop(["File Name 1", "File Name 2", "label"], axis = 1)
y = clean_data["label"]

In [132]:
over = SMOTE(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=0.35)

In [133]:
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [134]:
X, y = pipeline.fit_resample(X, y)

In [135]:
y.value_counts()

label
0    594
1    208
Name: count, dtype: int64

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [137]:
random_forest = RandomForestClassifier(max_depth=2, random_state=0)
random_forest.fit(X_train, y_train)

In [138]:
print("Score:")
print(random_forest.score(X_test, y_test))

Score:
0.8981132075471698


In [139]:
y_pred = random_forest.predict(X_test)


In [140]:
dump(random_forest, './random_forest.joblib')

['./random_forest.joblib']

In [141]:
y_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])

In [142]:
y_test.value_counts()

label
0    203
1     62
Name: count, dtype: int64