In [1]:
import pandas as pd
import pickle
import numpy as np
from imblearn import combine, over_sampling, under_sampling
from imblearn import pipeline
from sklearn import ensemble, metrics
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this is a global switch, so genuine chained-assignment mistakes
# anywhere below will also go unreported.
pd.options.mode.chained_assignment = None

In [2]:
#load the CodeT5 embedding dataset
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle as soon as the object is loaded.
with open("CodeT5_embeddings_LM.pkl", 'rb') as pickleFile:
    df = pickle.load(pickleFile)

In [3]:
#load the test set from the metrics experiment so that both models are compared on the same test set
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("../Dataset/Test_set_LM.pkl", 'rb') as pickleFile:
    df_test_metrics = pickle.load(pickleFile)

# keep only the part of the name before the first '.', so it can be compared
# (as an int) against the index of the metrics test set
df['name'] = df['name'].apply(lambda x: x.split('.')[0])

#split the CodeT5 embedding dataset using the metric test set ids
# A single boolean mask guarantees train and test are exact complements of each
# other, independent of whether df's index labels are unique.
is_test_row = df['name'].astype(int).isin(df_test_metrics.index)

# Drop rows without an embedding. dropna() returns new frames, so no in-place
# mutation of a slice of df is needed.
df_test = df.loc[is_test_row].dropna(subset=["embedding"])
df_train = df.loc[~is_test_row].dropna(subset=["embedding"])

In [4]:
# Turn the embedding column of each split into one array (one entry per row)
# and the label column into the matching target array.
X_train, y_train = np.array(list(df_train['embedding'])), np.array(df_train['label'])
X_test, y_test = np.array(list(df_test['embedding'])), np.array(df_test['label'])

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
std_scaler = StandardScaler()
std_scaler.fit(X_train)
sc = std_scaler  # fit() returns the estimator itself, so both names refer to the fitted scaler
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [6]:
# Random forest classifier with fixed hyper-parameters and a fixed seed.
rf = ensemble.RandomForestClassifier(
    n_estimators=140,
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for
    # classifiers it meant "sqrt", so this keeps the same model on every version.
    max_features="sqrt",
    min_samples_split=8,
    min_samples_leaf=1,
    bootstrap=False,
    criterion="gini",
    random_state=42,
    n_jobs=-1
)

# Combined resampling: SMOTE over-sampling followed by Edited Nearest
# Neighbours cleaning.
sampler = combine.SMOTEENN(
    random_state=42,
    # over-sample the minority class up to a 0.5 minority/majority ratio
    smote=over_sampling.SMOTE(
        sampling_strategy=0.5,
        random_state=42),
    # clean only the majority class, using the "mode" neighbour-selection rule
    enn=under_sampling.EditedNearestNeighbours(
        sampling_strategy="majority",
        kind_sel="mode"
    )
)

# imblearn pipeline: the sampler is applied during fit only; predict() passes
# the data straight to the classifier.
best_model = pipeline.make_pipeline(
    sampler,
    rf
)

best_model = best_model.fit(X_train,y_train)

In [7]:
# Predict labels for the test features with the fitted pipeline.
y_pred = best_model.predict(X_test)

# Print a header followed by the per-class precision / recall / f1 report.
print("\nTest metrics")
test_report = metrics.classification_report(y_test, y_pred)
print("Test report: \n", test_report)


Test metrics
Test report: 
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       382
           1       0.81      0.87      0.84       133

    accuracy                           0.91       515
   macro avg       0.88      0.90      0.89       515
weighted avg       0.92      0.91      0.91       515

