https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset is later read from the relative path
# 'data/CVEFixes.csv' — confirm whether that file is expected to live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1 Import libraries

In [None]:
! pip install transformers torch



In [None]:
# Random seed passed as `random_state` to both train_test_split calls so the
# train/validation/test partition is reproducible.
SEED = 2023

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# Record the library versions used for this run alongside the results.
print(f"transformers {transformers.__version__}")
print(f"torch {torch.__version__}")

transformers 4.33.1
torch 2.0.1+cu118


# 2 Load data

In [None]:
# Load the dataset (columns: code, language, safety)
data = pd.read_csv('data/CVEFixes.csv')

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31194 entries, 0 to 31193
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   code      31160 non-null  object
 1   language  31194 non-null  object
 2   safety    31194 non-null  object
dtypes: object(3)
memory usage: 731.2+ KB
None


In [None]:
# Drop rows with missing values. Rebinding (instead of inplace=True) leaves the
# frame produced by the loading cell untouched.
data = data.dropna()

# Target encoding for the safety column: 1 = vulnerable, 0 = safe
label2id = {'vulnerable': 1, 'safe': 0}
# Derive the inverse mapping so the two dictionaries can never drift apart.
id2label = {idx: label for label, idx in label2id.items()}

# Fail loudly on labels outside the mapping; Series.map would otherwise turn
# them into NaN silently.
unknown_labels = set(data['safety'].unique()) - set(label2id)
if unknown_labels:
    raise KeyError(f"Unexpected safety labels: {unknown_labels}")

# Assign whole columns rather than `data.loc[:, col] = ...`: the .loc form
# tries to write into the existing object column in place and, depending on
# the pandas version, either warns or leaves `safety` as object dtype.
data = data.assign(
    safety=data['safety'].map(label2id).astype('int64'),  # explicit integer target
    code=data['code'].astype(str),                        # make sure every sample is a str
)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31160 entries, 0 to 31193
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   code      31160 non-null  object
 1   language  31160 non-null  object
 2   safety    31160 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 973.8+ KB


  data.loc[:,'safety'] = data['safety'].apply(lambda x:label2id[x])


## 2.1 EDA

In [None]:
# Character count of every code sample, stored as a new 'length' column
data['length'] = data['code'].map(len)

# Longest sample in the dataset
max_char = max(data['length'])
print(f"Max number of characters: {max_char}")

Max number of characters: 33429210


In [None]:
# Distinct language tags present in the dataset
# (variable name kept as spelled because the preprocessing cell reuses it)
langauges = set(data['language'].unique())
langauges

{'Other',
 'S',
 'as',
 'c',
 'cc',
 'cgi',
 'coffee',
 'conf',
 'cpp',
 'cs',
 'css',
 'ctp',
 'cxx',
 'go',
 'h',
 'hh',
 'htaccess',
 'htm',
 'html',
 'inc',
 'java',
 'jelly',
 'js',
 'json',
 'jsp',
 'lua',
 'm',
 'php',
 'phpt',
 'phtml',
 'pl',
 'pm',
 'py',
 'rb',
 'rs',
 'scala',
 'scss',
 'sh',
 'spec',
 'sql',
 't',
 'tpl',
 'ts',
 'tt',
 'vim',
 'xml',
 'yaml',
 'yml'}

In [None]:
# Visualise the class distribution per language

# Number of samples for every (language, safety) combination
viz_df = data.groupby(["language", "safety"]).size().reset_index(name="count")

# Swap the numeric class ids for readable labels. Replace the column outright:
# writing strings into the integer column through `.loc[:, 'safety']` is an
# in-place set with an incompatible dtype, which newer pandas versions reject.
viz_df["safety"] = viz_df["safety"].map(id2label)

# One facet per language, six facets per row
fig = px.bar(viz_df, x="safety", y="count", color="safety", facet_col="language",
             facet_col_wrap=6,
             title="Safe vs. vulnerable samples per language",
             height=1000)
fig.show()

In [None]:
# Visualise the class distribution for the TOP_N most frequent languages
TOP_N = 10

# value_counts() is already sorted by descending frequency, so the first
# TOP_N index entries are the most common languages.
top_lang = data['language'].value_counts().head(TOP_N).index.tolist()

# Number of samples for every (language, safety) combination, top languages only
viz_df = (
    data.loc[data['language'].isin(top_lang)]
    .groupby(["language", "safety"])
    .size()
    .reset_index(name="count")
)

# Swap the numeric class ids for readable labels. Replace the column outright:
# writing strings into the integer column through `.loc[:, 'safety']` is an
# in-place set with an incompatible dtype, which newer pandas versions reject.
viz_df["safety"] = viz_df["safety"].map(id2label)

# The title follows TOP_N so it stays correct when the constant is changed.
fig = px.bar(viz_df, x="safety", y="count", color="safety", facet_col="language",
             facet_col_wrap=5,
             title=f"Top {TOP_N} languages",
             height=500)
fig.show()


## 2.2 Data preprocessing
We intend to use **CodeBERT** for modelling. CodeBERT is a multi-programming-lingual model pre-trained on NL-PL (natural language–programming language) pairs in 6 programming languages (Python, Java, JavaScript, PHP, Ruby, Go).

In [None]:
# Restrict the dataset to the six languages CodeBERT was pre-trained on
langauges = ['py', 'java', 'js', 'php', 'rb', 'go']

# Boolean mask: True for rows written in one of the supported languages
is_codebert_language = data['language'].isin(langauges)
data = data[is_codebert_language]

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11461 entries, 0 to 31193
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   code      11461 non-null  object
 1   language  11461 non-null  object
 2   safety    11461 non-null  int64 
 3   length    11461 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 447.7+ KB


In [None]:
# Split into train, validation and test sets 80:10:10

# Stage 1: keep 80% for training and hold out 20%, stratified on the target.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    data['code'],
    data['safety'],
    test_size=0.2,
    random_state=SEED,
    stratify=data['safety'],
)

# Stage 2: cut the held-out 20% in half -> 10% validation, 10% test.
X_validation, X_test, y_validation, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.5,
    random_state=SEED,
    stratify=y_tmp,
)

# Report the size of each partition, one per line.
print(
    f"training set: {len(X_train)}",
    f"validation set: {len(X_validation)}",
    f"test set: {len(X_test)}",
    sep="\n",
)


training set: 9168
validation set: 1146
test set: 1147


In [None]:
# Peek at the training sample whose original row label is 1
# (label-based lookup on the index carried over from `data`, not "second row").
X_train.loc[1]

"<?php\n\n\n\n/**\n\n * ownCloud - user_ldap\n\n *\n\n * @author Dominik Schmidt\n\n * @copyright 2011 Dominik Schmidt dev@dominik-schmidt.de\n\n *\n\n * This library is free software; you can redistribute it and/or\n\n * modify it under the terms of the GNU AFFERO GENERAL PUBLIC LICENSE\n\n * License as published by the Free Software Foundation; either\n\n * version 3 of the License, or any later version.\n\n *\n\n * This library is distributed in the hope that it will be useful,\n\n * but WITHOUT ANY WARRANTY; without even the implied warranty of\n\n * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n\n * GNU AFFERO GENERAL PUBLIC LICENSE for more details.\n\n *\n\n * You should have received a copy of the GNU Affero General Public\n\n * License along with this library.  If not, see <http://www.gnu.org/licenses/>.\n\n *\n\n */\n\n$params = array('ldap_host', 'ldap_port', 'ldap_dn', 'ldap_agent_password', 'ldap_base', 'ldap_base_users', 'ldap_base_groups', 'ldap_userlist

# 3 CodeBERT

In [None]:
# Tokeniser and model come from the same fine-tuned CodeBERT checkpoint
MODEL_CHECKPOINT = 'mrm8488/codebert-base-finetuned-detect-insecure-code'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
codeBERT_classifier = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# Bundle both into a text-classification pipeline for inference
classifier = pipeline(
    task="text-classification",
    model=codeBERT_classifier,
    tokenizer=tokenizer,
)

In [None]:
# This sample tokenises to 1349 tokens, more than the 512 the model accepts,
# and the un-truncated call fails with a RuntimeError. Forward truncation
# settings to the tokenizer so over-long inputs are cut to the model limit.
# Note: only the first 512 tokens of the file are classified as a result.
classifier(X_train[1], truncation=True, max_length=512)

Token indices sequence length is longer than the specified maximum sequence length for this model (1349 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: ignored