In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from bs4 import BeautifulSoup

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import ShuffleSplit

In [2]:
# Mount Google Drive at /content/gdrive so files stored there can be read
# through ordinary file paths. `google.colab` is supplied by the Colab
# runtime, so this cell is expected to fail in other environments.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
def normalizedata(X_train):
    """Rescale the columns of `X_train` with a freshly fitted MinMaxScaler.

    The scaler is fitted on `X_train` itself and then applied to that same
    array; the transformed array is returned.
    """
    return MinMaxScaler().fit(X_train).transform(X_train)

def model1(trdata,tract,tsdata):
    """Logistic regression (lbfgs solver, up to 2000 iterations).

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    classifier = LogisticRegression(solver='lbfgs', max_iter=2000)
    classifier.fit(trdata, tract)
    return classifier.predict(tsdata)

def model2(trdata,tract,tsdata):
    """Multinomial naive Bayes with default settings.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    return MultinomialNB().fit(trdata, tract).predict(tsdata)

def model3(trdata,tract,tsdata):
    """Bernoulli naive Bayes with default settings.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    return BernoulliNB().fit(trdata, tract).predict(tsdata)

def model4(trdata,tract,tsdata):
    """Gaussian naive Bayes with default settings.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    return GaussianNB().fit(trdata, tract).predict(tsdata)

def model5(trdata,tract,tsdata):
    """Decision tree with default settings.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    tree = DecisionTreeClassifier()
    tree.fit(trdata, tract)
    return tree.predict(tsdata)

def model6(trdata,tract,tsdata):
    """Bagging ensemble of k-nearest-neighbour classifiers.

    The ensemble is built with max_samples=0.5 and max_features=0.5. Fits on
    (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    ensemble = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.5,
        max_features=0.5,
    )
    ensemble.fit(trdata, tract)
    return ensemble.predict(tsdata)

def model7(trdata,tract,tsdata):
    """Random forest with 10 trees.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    The forest is trained exactly once; a second, redundant `fit` call that
    simply retrained the same estimator has been removed.

    No `random_state` is passed, so predictions can differ between runs.
    """
    model = RandomForestClassifier(n_estimators=10)
    model.fit(trdata, tract)
    return model.predict(tsdata)

def model8(trdata,tract,tsdata):
    """Extremely-randomized-trees ensemble with 10 trees (random_state=0).

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    The ensemble is trained exactly once; a second, redundant `fit` call that
    rebuilt the identical seeded model has been removed.
    """
    model = ExtraTreesClassifier(
        n_estimators=10,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    )
    model.fit(trdata, tract)
    return model.predict(tsdata)

def model9(trdata,tract,tsdata):
    """AdaBoost with 10 estimators.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    booster = AdaBoostClassifier(n_estimators=10)
    booster.fit(trdata, tract)
    return booster.predict(tsdata)

def model10(trdata,tract,tsdata):
    """Gradient boosting: 10 depth-1 trees, learning rate 1.0, random_state=0.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    booster = GradientBoostingClassifier(
        n_estimators=10,
        learning_rate=1.0,
        max_depth=1,
        random_state=0,
    )
    booster.fit(trdata, tract)
    return booster.predict(tsdata)

def model11(trdata,tract,tsdata):
    """Support-vector classifier with a polynomial kernel.

    Fits on (`trdata`, `tract`) and returns the predicted labels for `tsdata`.
    """
    svm = SVC(kernel='poly')
    svm.fit(trdata, tract)
    return svm.predict(tsdata)

In [8]:
from sklearn.model_selection import KFold

# Use a true 10-fold partition: every row lands in exactly one test fold, so
# code that fills a prediction matrix via `test_index` gets one out-of-fold
# prediction per row. ShuffleSplit(n_splits=10, test_size=0.1) draws each test
# set independently, which leaves some rows never tested and others tested
# more than once.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

In [9]:
def gen_models(kf, fname, out, i):
    """Cross-validate all eleven classifiers on one feature file and save the predictions.

    Parameters
    ----------
    kf : splitter object
        Anything exposing ``split(X)`` that yields ``(train_index, test_index)``
        pairs of integer index arrays.
    fname : str
        Path to a comma-delimited numeric file. Every column except the last
        is used as a feature; the last column is ignored.
    out : numpy.ndarray
        Label for each row of the feature file (same length as the file has rows).
    i : int
        Identifier used to name the output file, ``'<i>.csv'``, which is
        written to the current working directory.

    Raises
    ------
    ValueError
        If ``out`` does not have one label per feature row.

    Notes
    -----
    The output has 12 columns: one per classifier (model1..model11, in that
    order) followed by ``out``. Rows that never appear in any test split keep
    the initial value 0.0 in the prediction columns.

    Min-max scaling is fitted on the training rows of each split only and then
    applied to both training and test rows, so no information from the test
    rows leaks into preprocessing.
    """
    data = np.genfromtxt(fname, delimiter=',')
    features = data[:, 0:-1]
    n_rows = features.shape[0]
    if len(out) != n_rows:
        raise ValueError(
            'out has %d labels but %s has %d rows' % (len(out), fname, n_rows)
        )

    classifiers = (model1, model2, model3, model4, model5, model6,
                   model7, model8, model9, model10, model11)
    predvalue = np.zeros((n_rows, len(classifiers)))

    for train_index, test_index in kf.split(features):
        # Fit the scaler on the training fold only, then reuse it for the test fold.
        scaler = MinMaxScaler().fit(features[train_index])
        trdata = scaler.transform(features[train_index])
        tsdata = scaler.transform(features[test_index])
        tract = out[train_index]

        for column, classifier in enumerate(classifiers):
            predvalue[test_index, column] = classifier(trdata, tract, tsdata)

    # Append the true labels as the final column.
    predvalue = np.column_stack((predvalue, out))

    out_name = str(i) + '.csv'
    np.savetxt(out_name, predvalue, delimiter = ',', fmt = '%f')

In [10]:
# Run gen_models for every dataset / feature-file combination.
# `i` is a running counter passed to gen_models, so the 3 x 6 combinations are
# numbered 1..18 in loop order (datasets outer, feature files inner).
i = 1
path = '/content/gdrive/My Drive/HPC_SSA/'
fn = ['AppReviews', 'JIRA', 'StackOverflow']
feature_files = ('cv.csv', 'tfidf.csv', 'cbow.csv', 'skg.csv', 'w2v.csv', 'glove.csv')

for k in range(len(fn)):
    dataset = fn[k]

    # The labels come from the 'oracle' column of the dataset's own CSV.
    fname = path + dataset + '.csv'
    df = pd.read_csv(fname, encoding='utf-8')
    out = df['oracle'].to_numpy()

    for feature_file in feature_files:
        gen_models(kf, path + dataset + feature_file, out, i)
        i += 1