In [1]:
#Essential libraries
import numpy as np
import pandas as pd
import os
import pickle
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ankur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ankur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#load dataset
df=pd.read_csv('code_snippets_large_dataset.csv')
df.head()

Unnamed: 0,code_snippet,error_name,bug_present
0,a = 'hello'\nb = a + 5,TypeError,True
1,a = 'hello'\nb = a + 5,TypeError,True
2,"def add(a, b):\n return a + b\nprint(add(2,...",,False
3,import os\nos.remove('non_existent_file.txt'),FileNotFoundError,True
4,"lst = [1, 2, 3]\nlst.remove(5)",ValueError,True


In [4]:
df.shape

(5000, 3)

In [5]:
#data preprocessing 
def clean_code(text):
    #remove special charachter
    text=re.sub(r'[^a-zA-Z0-9_\s]',' ',text)
    #tokenization & convert into lower case
    tokens=word_tokenize(text.lower())
    tokens=[word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

In [6]:
df['cleaned_code']=df['code_snippet'].apply(clean_code)
df.head()

Unnamed: 0,code_snippet,error_name,bug_present,cleaned_code
0,a = 'hello'\nb = a + 5,TypeError,True,hello b 5
1,a = 'hello'\nb = a + 5,TypeError,True,hello b 5
2,"def add(a, b):\n return a + b\nprint(add(2,...",,False,def add b return b print add 2 3
3,import os\nos.remove('non_existent_file.txt'),FileNotFoundError,True,import os os remove non_existent_file txt
4,"lst = [1, 2, 3]\nlst.remove(5)",ValueError,True,lst 1 2 3 lst remove 5


In [7]:
X=df['cleaned_code']
Y=df['bug_present']

In [8]:
#feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer()
X_tfidf=vectorizer.fit_transform(X)

In [9]:
#train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test=train_test_split(X_tfidf, Y, test_size=0.2, random_state=45)

In [10]:
from sklearn.ensemble import RandomForestClassifier
rf_model=RandomForestClassifier(n_estimators=100, random_state=45)
rf_model.fit(X_train,Y_train)

In [11]:
y_pred=rf_model.predict(X_test)

In [12]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(Y_test, y_pred)
cm

array([[208,  51],
       [  0, 741]])

In [13]:
#save model and vectorizer
pickle.dump(rf_model,open('rf_model.pkl','wb'))
pickle.dump(vectorizer, open('tfidf_vectorizer.pkl','wb'))