In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import ensemble_model.preprocesser as preprocesser 
import ensemble_model.combined_model as cm 
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [2]:
# Read the two per-project commit CSVs (the qemu file is decoded as UTF-8 with BOM).
neg = pd.read_csv(r'/root/reinforcement_commit/datasets/ffmpeg.csv')
pos = pd.read_csv(r'/root/reinforcement_commit/datasets/qemu.csv', encoding='utf_8_sig')

# Only these three columns are carried forward from each CSV.
raw_columns = ['commit_msg', 'patch', 'vulnerability']

# Stack both projects row-wise, replace missing values with empty strings and
# switch to the column names used by the rest of the notebook.
df = (
    pd.concat([neg[raw_columns], pos[raw_columns]], axis=0)
    .fillna('')
    .rename(columns={'vulnerability': 'label', 'patch': 'diff', 'commit_msg': 'message'})
)

# Optional shuffle, currently disabled (frac=1 means 100% of the rows).
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only the commits whose diff is shorter than 1024 characters.
df = df[df['diff'].str.len() < 1024]
df

Unnamed: 0,message,diff,label
4,avformat/hlsenc: Check that data is set If co...,diff --git a/libavformat/hlsenc.c b/libavforma...,0
5,libavcodec/h264_sei: Don't log random user da...,diff --git a/libavcodec/h264_sei.c b/libavcode...,0
7,avconv.c: fix calculation of input file durat...,diff --git a/avtools/avconv.c b/avtools/avconv...,0
8,avcodec/dca_core: always limit frame size to ...,diff --git a/libavcodec/dca_core.c b/libavcode...,0
9,Don't use _tzcnt instrinics with clang for wi...,diff --git a/libavutil/x86/intmath.h b/libavut...,0
...,...,...,...
11899,fix off-by-one generating vmdk backing files ...,diff --git a/block-vmdk.c b/block-vmdk.c\ninde...,1
11900,CRIS: Plug a few temp leaks. Signed-off-by: E...,diff --git a/target-cris/translate.c b/target-...,1
11905,Bugfix: PowerPC 64 slbia never invalidates th...,diff --git a/target-ppc/helper.c b/target-ppc/...,1
11907,"Fix crash in set registers in PPC gdb-stub, b...",diff --git a/target-ppc/helper.c b/target-ppc/...,1


In [3]:
# 70% train; the remaining 30% is halved into validation and test (15% each).
# The same seed is used for both splits so the partition is reproducible.
split_seed = 42
train, temp = train_test_split(df, test_size=0.3, random_state=split_seed)
val, test = train_test_split(temp, test_size=0.5, random_state=split_seed)

In [4]:
# len(df_dataset)

In [5]:
# df_dataset[0]

In [6]:
# Local checkpoint directories of the two pretrained encoders.
bert_dir = '/root/autodl-tmp/active-patcher/models/bert-base-uncased'
codebert_dir = '/root/autodl-tmp/active-patcher/models/codebert-base'

# BERT encoder and its tokenizer.
bert_model = BertModel.from_pretrained(bert_dir)
bert_tokenizer = BertTokenizer.from_pretrained(bert_dir)

# CodeBERT encoder and its tokenizer, loaded through the RoBERTa classes.
codebert_model = RobertaModel.from_pretrained(codebert_dir)
codebert_tokenizer = RobertaTokenizer.from_pretrained(codebert_dir)

In [None]:
# Give every split a fresh 0..n-1 index.
# Reassigning (instead of inplace=True) keeps this cell re-runnable: an in-place
# reset_index() inserts another 'index'/'level_0' column on every run and raises
# ValueError on the third one. The old index is dropped because it is not a usable
# key anyway (the two CSVs were concatenated without re-indexing, so labels repeat).
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)


def _make_dataset(frame):
    """Wrap one split in a SentencePairDataset using the notebook's column names."""
    return preprocesser.SentencePairDataset(
        frame, bert_tokenizer, codebert_tokenizer,
        message='message', command='diff', label='label',
    )


# Create Datasets and DataLoaders (only the training loader is shuffled).
train_dataset = _make_dataset(train)
val_dataset = _make_dataset(val)
test_dataset = _make_dataset(test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# One base model per pretrained encoder.
base_model1 = cm.BaseModel(bert_model)
base_model2 = cm.BaseModel(codebert_model)

# Create stacking model
combined_model = cm.CombinedModel(base_model1, base_model2)
# Train the model
combined_model.trainer(train_loader, val_loader, num_epochs=10, patience=3)

Epoch 1/10:   1%|▏         | 4/281 [00:04<04:49,  1.05s/batch]

In [None]:
# Rebuild the test dataset and wrap it in an unshuffled loader with batch size 8
# for evaluation.
test_dataset = preprocesser.SentencePairDataset(
    test,
    bert_tokenizer,
    codebert_tokenizer,
    message='message',
    command='diff',
    label='label',
)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [None]:
# Evaluate on the test loader, then unpack the five returned values.
eval_outputs = combined_model.evaluate(test_loader)
test_acc, test_labels, test_probabilities, test_embeddings, test_predictions = eval_outputs

In [None]:
import torch
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
# Work on a copy of the test split so that the evaluation outputs can be attached
# as extra columns without modifying `test` itself.
res = test.copy()

In [None]:
# Attach each evaluation output to `res` as a column of the same name.
eval_columns = {
    'test_labels': test_labels,
    'test_probabilities': test_probabilities,
    'test_embeddings': test_embeddings,
    'test_predictions': test_predictions,
}
for _column, _values in eval_columns.items():
    res[_column] = _values

In [None]:
def _scatter_tsne(df_tsne, hue, palette, title):
    """Draw one 8x6 scatter plot of the TSNE1/TSNE2 columns colored by ``hue``."""
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=df_tsne, x='TSNE1', y='TSNE2', hue=hue, palette=palette, ax=ax)
    # Put the legend outside the axes so it never covers points.
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    ax.set_title(title)
    ax.set_xlabel('TSNE1')
    ax.set_ylabel('TSNE2')
    ax.axis('equal')


def plot_tsne(embeddings, labels):
    """Plot a 2-D t-SNE projection of ``embeddings`` twice: by label and by KMeans cluster.

    Args:
        embeddings: Sequence of arrays accepted by ``np.vstack``; after stacking, every
            row is treated as one sample.
            NOTE(review): assumes CPU/NumPy-compatible values; CUDA tensors would need
            ``.cpu()`` first - confirm what ``combined_model.evaluate`` returns.
        labels: One label per stacked row; used as the hue of the first figure.

    Returns:
        None. Two matplotlib figures are created as a side effect.

    Raises:
        ValueError: If fewer than two samples are supplied (t-SNE and the 2-cluster
            KMeans both need at least two).
    """
    # Stack the per-item / per-batch embeddings into one (n_samples, dim) matrix.
    embeddings_np = np.vstack(embeddings)
    n_samples = embeddings_np.shape[0]
    if n_samples < 2:
        raise ValueError(f"plot_tsne needs at least 2 samples, got {n_samples}")

    # scikit-learn requires perplexity < n_samples; 30 is its default, so large
    # inputs behave exactly as before while small ones no longer fail.
    tsne = TSNE(
        n_components=2,
        random_state=42,
        perplexity=min(30.0, float(n_samples - 1)),
    )
    embeddings_2d = tsne.fit_transform(embeddings_np)

    df_tsne = pd.DataFrame(embeddings_2d, columns=['TSNE1', 'TSNE2'])
    df_tsne['Class Name'] = labels

    # The style must be set before the axes are created, otherwise the first
    # figure is drawn with the previous style.
    sns.set_style('darkgrid', {"grid.color": ".6", "grid.linestyle": ":"})

    _scatter_tsne(
        df_tsne, hue='Class Name', palette='hls',
        title='t-SNE projection of embeddings (colored by label)',
    )

    # Cluster in the original embedding space (not in the 2-D projection).
    # A single fit_predict is enough; fitting first and then calling fit_predict
    # would just run KMeans twice with the same seed.
    cluster_ids = KMeans(n_clusters=2, random_state=42, n_init='auto').fit_predict(embeddings_np)
    df_tsne['Cluster'] = cluster_ids

    _scatter_tsne(
        df_tsne, hue='Cluster', palette='magma',
        title='t-SNE projection of embeddings (colored by KMeans cluster)',
    )

In [None]:
# t-SNE / KMeans views of the test-set embeddings, colored by the true labels.
plot_tsne(embeddings=test_embeddings, labels=test_labels)

In [None]:
# PR curve for the test split, from the labels and probabilities returned by
# combined_model.evaluate(). plot_pr_curve lives in ensemble_model.combined_model
# and is not shown in this notebook.
combined_model.plot_pr_curve(test_labels, test_probabilities)
# NOTE(review): the t-SNE plot for this model is produced in the next cell.

In [None]:
# Visualize the test embeddings with the model's own t-SNE helper to judge the
# quality of the embeddings (the helper is defined in ensemble_model.combined_model
# and is not shown in this notebook).
combined_model.plot_tsne(test_embeddings, test_labels)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix on Dataset I', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on a new 8x6 figure.

    Args:
        cm: 2-D confusion matrix (array-like). Rows are labelled as the true classes.
        classes: Tick labels, one per row/column, in the same order as ``cm``.
        normalize: If True, each row is divided by its sum and cells are printed
            with two decimals; otherwise the raw integer counts are printed.
        title: Figure title.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        None. The figure is left as the current pyplot figure so the caller can
        show or save it.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of becoming NaN (0/0), which
        # would also turn the color threshold below into NaN.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    # The x-axis label is left disabled, as in the original.
    # plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Fix the class order explicitly. Series.unique() returns labels in order of
# appearance, while confusion_matrix orders rows/columns by sorted label, so
# unsorted names could put the wrong tick label on each row.
class_names = np.sort(test['label'].unique())

# Named `conf_matrix` (not `cm`) so the `ensemble_model.combined_model as cm`
# module alias is not overwritten and the training cell stays re-runnable.
conf_matrix = confusion_matrix(test_labels, test_predictions, labels=class_names)

# Plot row-normalized fractions; pass normalize=False for raw counts instead.
plot_confusion_matrix(conf_matrix, classes=class_names, normalize=True)
plt.savefig('Confusion Matrix on combined_model_Patch_DB')