# Data Import

In [172]:
import pandas as pd
import numpy as np

In [173]:
# Load the dataset from a CSV file resolved relative to the working directory.
df = pd.read_csv('CVEFixes_py.csv')

In [174]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,code,language,safety
0,2,#!/usr/bin/env python\n\nfrom __future__ impor...,py,safe
1,52,# This file is dual licensed under the terms o...,py,vulnerable
2,85,"# Copyright (c) 2010-2012 OpenStack, LLC.\n\n#...",py,vulnerable
3,125,# vim: ft=python fileencoding=utf-8 sts=4 sw=4...,py,safe
4,130,# Copyright 2020 The TensorFlow Authors. All R...,py,safe


# Data Cleaning

## Helper Functions

In [175]:
import re

In [178]:
# remove comments beginning with # or within ''' '''
def remove_comments(code):
    """Strip ``#`` line comments and ``'''...'''`` blocks from source text.

    The removal is purely textual (regex based, no tokenizing):

    * ``#`` and everything after it on the same line is removed, including
      a ``#`` that happens to sit inside a string literal.
    * Each ``'''...'''`` span is removed up to the nearest closing ``'''``.
      The span may cover several lines and may contain apostrophes
      (e.g. "don't"); the lazy ``[\\s\\S]*?`` is used instead of ``[^']*``
      so that a single quote inside the block does not prevent the match.
    * Blocks delimited by three double quotes are left untouched.

    Args:
        code: Source text to clean.

    Returns:
        The text with the matched spans replaced by the empty string.
        Line breaks outside removed blocks are kept.
    """
    # `.` does not match '\n', so the `#.*` branch stops at end of line, while
    # `[\s\S]` lets the triple-quote branch run across line breaks.
    return re.sub(r"#.*|'''[\s\S]*?'''", '', code)

# collapse every run of consecutive '\n' characters into a single '\n'
def combine_newlines(code):
    """Return ``code`` with each run of newline characters squeezed to one.

    Args:
        code: Text whose blank-line runs should be collapsed.

    Returns:
        The text with every sequence of one or more ``'\\n'`` replaced by a
        single ``'\\n'``; all other characters are left as they are.
    """
    newline_run = re.compile(r'\n+')
    return newline_run.sub('\n', code)

In [179]:
# The helpers above are only defined, not yet applied, so this preview still
# shows the frame as it was loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,code,language,safety
0,2,#!/usr/bin/env python\n\nfrom __future__ impor...,py,safe
1,52,# This file is dual licensed under the terms o...,py,vulnerable
2,85,"# Copyright (c) 2010-2012 OpenStack, LLC.\n\n#...",py,vulnerable
3,125,# vim: ft=python fileencoding=utf-8 sts=4 sw=4...,py,safe
4,130,# Copyright 2020 The TensorFlow Authors. All R...,py,safe


In [180]:
# replace safe, vulnerable with 1,0
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df['safety'] selection: with pandas
# Copy-on-Write that in-place call acts on a temporary object and leaves df
# unchanged. The explicit astype keeps the column int64 whether or not
# replace() downcasts the object column on its own.
df['safety'] = df['safety'].replace({'safe': 1, 'vulnerable': 0}).astype('int64')

In [181]:
# checking that all values under 'safety' are 1,0
# (any other entry listed here would be a label that was not mapped to 1 or 0)
df['safety'].unique()

array([1, 0], dtype=int64)

In [182]:
# drop original index
# drop language column since only python language is being studied
# `columns=` already selects the column axis, so no `axis` argument is needed;
# the result is rebound to df rather than mutating the frame in place.
# Re-running this cell raises KeyError for 'language', which also stops
# df.columns[0] from removing a second, unintended column.
df = df.drop(columns=[df.columns[0], 'language'])

In [183]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,code,safety
0,#!/usr/bin/env python\n\nfrom __future__ impor...,1
1,# This file is dual licensed under the terms o...,0
2,"# Copyright (c) 2010-2012 OpenStack, LLC.\n\n#...",0
3,# vim: ft=python fileencoding=utf-8 sts=4 sw=4...,1
4,# Copyright 2020 The TensorFlow Authors. All R...,1


In [184]:
# Clean every source string in two passes: strip comments first, then squeeze
# the newline runs that the removed comments leave behind.
df['code_cleaned'] = df['code'].apply(remove_comments).apply(combine_newlines)

In [185]:
# Compare the new code_cleaned column with the original code column.
df.head()

Unnamed: 0,code,safety,code_cleaned
0,#!/usr/bin/env python\n\nfrom __future__ impor...,1,"\nfrom __future__ import division, absolute_im..."
1,# This file is dual licensed under the terms o...,0,"\nfrom __future__ import absolute_import, divi..."
2,"# Copyright (c) 2010-2012 OpenStack, LLC.\n\n#...",0,"\n"""""" Tests for swift.common.utils """"""\nfrom _..."
3,# vim: ft=python fileencoding=utf-8 sts=4 sw=4...,1,\nimport os\nimport attr\nimport pytest\nimpor...
4,# Copyright 2020 The TensorFlow Authors. All R...,1,"\n""""""Tests for tensorflow.python.framework.con..."
