# A Practical User Feedback Classifier for Software Quality Characteristics

This notebook demonstrates how to use the code available in this repository, which accompanies the article "*A Practical User Feedback Classifier for Software Quality Characteristics*" by Rubens dos Santos, Karina Villela, Diego Toralles Avila and Lucineia Heloisa Thom.

## Libraries

Importing the libraries needed.

In [None]:
import pandas as pd
from math import floor
from random import shuffle
import numpy as np
import time

from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier

from joblib import Parallel, delayed

## Global variables

These are the global variables used. 

In [None]:
# Running confusion-matrix counters (true/false positives/negatives), one set
# per classifier. Names without a suffix are for bag-of-words features, names
# ending in "tf" are for TF-IDF features.

# Multinomial Naive Bayes
TP_NB = TN_NB = FP_NB = FN_NB = 0
TP_NBtf = TN_NBtf = FP_NBtf = FN_NBtf = 0
# Decision Tree
TP_TREE = TN_TREE = FP_TREE = FN_TREE = 0
TP_TREEtf = TN_TREEtf = FP_TREEtf = FN_TREEtf = 0
# Linear SVM
TP_SVM = TN_SVM = FP_SVM = FN_SVM = 0
TP_SVMtf = TN_SVMtf = FP_SVMtf = FN_SVMtf = 0
# Logistic Regression
TP_LR = TN_LR = FP_LR = FN_LR = 0
TP_LRtf = TN_LRtf = FP_LRtf = FN_LRtf = 0
# Random Forest
TP_RF = TN_RF = FP_RF = FN_RF = 0
TP_RFtf = TN_RFtf = FP_RFtf = FN_RFtf = 0
# Gradient Boosting
TP_Grad = TN_Grad = FP_Grad = FN_Grad = 0
TP_Gradtf = TN_Gradtf = FP_Gradtf = FN_Gradtf = 0
#unigrams = []
# Predicted labels and the review texts they belong to (two distinct lists).
pred_list = []
text_list = []

## Functions

The code cells below define the functions used in this notebook.

A function that loads the dataset and returns a list of dataframes corresponding to each category and a list of classes.

In [None]:
def load_data():
    """Load ``reviews.csv`` and split it into one labelled dataframe per category.

    The file is read from the current working directory using ``¨`` as the
    field separator. The ``other``, ``Star rating`` and ``Product`` columns
    are discarded. In each category column the marker ``'x'`` is replaced by
    that category's label and every missing value in the frame is filled with
    ``'_Label_Zero'``.

    Returns:
        tuple: ``(dfs, columns)`` where ``dfs`` is a list of four dataframes
        (Functional, Performance, Compatibility, Usability, in that order),
        each holding the shared columns plus a single ``Label`` column, and
        ``columns`` is the list of remaining column names, in file order,
        with ``'Text'`` removed.
    """
    # Keyword arguments are required here: recent pandas releases no longer
    # accept ``sep`` / ``axis`` positionally. The Python engine is requested
    # explicitly because '¨' is more than one byte in UTF-8, which the C
    # parser does not support (pandas would otherwise fall back with a warning).
    df = pd.read_csv('reviews.csv', sep='¨', engine='python')
    df = df.drop(columns=['other', 'Star rating', 'Product'])

    # Category column -> label written wherever the review was marked with 'x'.
    category_labels = {
        'Functional': '_Label_Func',
        'Performance': '_Label_Perf',
        'Compatibility': '_Label_Comp',
        'Usability': '_Label_Usab',
    }
    for category, label in category_labels.items():
        # Re-assign instead of ``df[col].replace(..., inplace=True)``: the
        # chained in-place form does not update ``df`` under copy-on-write.
        df[category] = df[category].replace('x', label)
    df = df.fillna('_Label_Zero')

    columns = list(df.columns.values)
    columns.remove('Text')

    # One dataframe per category: drop the other three category columns and
    # expose the remaining one under the common name 'Label'.
    dfs = []
    for category in category_labels:
        other_categories = [name for name in category_labels if name != category]
        dfs.append(
            df.drop(columns=other_categories).rename(columns={category: 'Label'})
        )
    return dfs, columns

A function that creates two lists: the training indexes and the testing indexes of each fold. These lists are used in the 10-fold cross-validation with ***undersampling***.

In [None]:
def getIndexesUS(df):  # 10-fold cross-validation with under sampling
    """Build per-fold train/test index lists, splitting each class separately.

    Rows whose ``Label`` is ``'_Label_Zero'`` and all other ("relevant") rows
    are each cut into 10 consecutive folds. The relevant fold size is
    ``floor(len(relevant) / 10)``; the zero-label fold size is that value
    divided by the relevant/zero ratio of the whole dataframe. Rows beyond
    ``10 * fold_size`` of either class are used in neither list.

    Args:
        df: dataframe with a ``Label`` column.

    Returns:
        tuple: ``(training, testing)``, two lists of 10 index lists. A
        training list holds zero-label indexes followed by relevant ones; a
        testing list holds relevant indexes followed by zero-label ones.
    """
    n_folds = 10
    zero_ids = df.index[df['Label'] == '_Label_Zero'].tolist()
    relevant_ids = df.index[df['Label'] != '_Label_Zero'].tolist()
    # Shuffling is intentionally left disabled: folds follow dataframe order.
    class_ratio = len(relevant_ids) / len(zero_ids)

    relevant_step = floor(len(relevant_ids) / n_folds)
    zero_step = floor(relevant_step / class_ratio)
    relevant_end = n_folds * relevant_step
    zero_end = n_folds * zero_step

    train_folds = []
    test_folds = []
    for fold in range(n_folds):
        zero_lo, zero_hi = fold * zero_step, (fold + 1) * zero_step
        rel_lo, rel_hi = fold * relevant_step, (fold + 1) * relevant_step

        # Training data: everything outside the held-out window of each class.
        train_folds.append(
            zero_ids[:zero_lo]
            + zero_ids[zero_hi:zero_end]
            + relevant_ids[:rel_lo]
            + relevant_ids[rel_hi:relevant_end]
        )
        # Testing data: the held-out window, relevant rows first.
        test_folds.append(relevant_ids[rel_lo:rel_hi] + zero_ids[zero_lo:zero_hi])
    return train_folds, test_folds

A function that creates two lists: the training indexes and the testing indexes of each fold. These lists are used in the plain 10-fold cross-validation ***without undersampling***.

In [None]:
def getIndexesCV(df):  # Normal 10-fold cross-validation
    """Build positional train/test index lists for plain 10-fold cross-validation.

    Positions ``0 .. len(df) - 1`` are cut into 10 consecutive folds of
    ``floor(len(df) / 10)`` items. Any leftover positions at the end are part
    of every training list and of no testing list.

    Args:
        df: any sized object (only ``len(df)`` is used).

    Returns:
        tuple: ``(training, testing)``, two lists of 10 position lists.
    """
    positions = list(range(len(df)))
    # Shuffling is intentionally left disabled: folds follow the original order.
    step = floor(len(positions) / 10)  # e.g. 1500 / 10 = 150

    train_folds = []
    test_folds = []
    for fold in range(10):
        start = fold * step
        stop = (fold + 1) * step
        train_folds.append(positions[:start] + positions[stop:])
        test_folds.append(positions[start:stop])
    return train_folds, test_folds

A function that compares the predictions with the expected labels and returns the Confusion Matrix counts: true positives, true negatives, false positives and false negatives (in that order).

In [None]:
def confusionMatrix(predictions, real_labels):
    """Count confusion-matrix cells for a binary "relevant vs. zero" problem.

    ``'_Label_Zero'`` is the negative class; any other label is positive. A
    positive prediction counts as a true positive only when it equals the
    real label, otherwise as a false positive.

    Args:
        predictions: predicted labels; a list or any object with ``tolist()``
            (for example a NumPy array or a pandas Series).
        real_labels: ground-truth labels, same accepted types and same length.

    Returns:
        tuple: ``(TruePositives, TrueNegatives, FalsePositives, FalseNegatives)``.

    Raises:
        ValueError: if the two inputs do not have the same length.
    """
    def _as_list(values):
        # Both arguments are normalised the same way, so plain lists are
        # accepted for predictions as well as for real_labels.
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    predictions = _as_list(predictions)
    real_labels = _as_list(real_labels)
    if len(predictions) != len(real_labels):
        # Scoring misaligned sequences would silently give wrong counts.
        raise ValueError(
            "predictions and real_labels must have the same length "
            "(got %d and %d)" % (len(predictions), len(real_labels))
        )

    TruePositives = TrueNegatives = FalsePositives = FalseNegatives = 0
    for predicted, actual in zip(predictions, real_labels):
        is_correct = predicted == actual
        if predicted == "_Label_Zero":
            if is_correct:
                TrueNegatives += 1
            else:
                FalseNegatives += 1
        elif is_correct:
            TruePositives += 1
        else:
            FalsePositives += 1
    return TruePositives, TrueNegatives, FalsePositives, FalseNegatives

A function that calculates metrics: precision, recall, F1-measure and F2-measure.

In [None]:
def evaluate(TruePositives, TrueNegatives, FalsePositives, FalseNegatives):
    """Compute precision, recall, F1 and F2 from confusion-matrix counts.

    A metric whose denominator is zero is reported with the sentinel ``-1``
    instead of raising. ``TrueNegatives`` is accepted for signature symmetry
    but does not enter any of the formulas.

    Args:
        TruePositives: number of true positives.
        TrueNegatives: number of true negatives (unused).
        FalsePositives: number of false positives.
        FalseNegatives: number of false negatives.

    Returns:
        tuple: ``(precision, recall, f1measure, f2measure)``.
    """
    predicted_positives = TruePositives + FalsePositives
    if predicted_positives == 0:
        precision = -1
    else:
        precision = TruePositives / predicted_positives

    actual_positives = TruePositives + FalseNegatives
    if actual_positives == 0:
        recall = -1
    else:
        recall = TruePositives / actual_positives

    # precision + recall is zero only when both are 0 (no true positives but
    # some false positives and false negatives); F1 is 0 in that case.
    f1_denominator = precision + recall
    if f1_denominator == 0:
        f1measure = 0.0
    else:
        f1measure = 2 * precision * recall / f1_denominator

    # With all three counts at zero F2 is undefined; use the same -1 sentinel
    # as precision and recall.
    f2_denominator = 5 * TruePositives + 4 * FalseNegatives + FalsePositives
    if f2_denominator == 0:
        f2measure = -1
    else:
        f2measure = 5 * TruePositives / f2_denominator

    return precision, recall, f1measure, f2measure

A function that prints the Confusion Matrix. 

In [None]:
def printConfusionMatrix(TruePositives, TrueNegatives, FalsePositives, FalseNegatives):
    """Print the 2x2 confusion matrix.

    The first row shows true positives and false negatives, the second row
    false positives and true negatives; a blank line follows the matrix.
    """
    positive_row = f"\t{TruePositives}|{FalseNegatives}"
    negative_row = f"\t{FalsePositives}|{TrueNegatives}\n"
    print("Confusion Matrix:", positive_row, negative_row, sep="\n")

A function that prints the cost.

In [None]:
def printEvalCost(TruePositives, TrueNegatives, FalsePositives, FalseNegatives, ratio):
    """Print the misclassification cost followed by a blank line.

    Correct predictions cost nothing, each false positive costs 1 and each
    false negative costs ``1 / ratio``.
    """
    correct_cost = 0 * TruePositives + 0 * TrueNegatives
    false_positive_cost = 1 * FalsePositives
    false_negative_cost = FalseNegatives / ratio
    cost = correct_cost + false_positive_cost + false_negative_cost
    print(f"Cost: {cost}\n")

A function that trains the models/algorithms using or not the balancing techniques and makes the predictions, updating the parameters of the Confusion Matrix. This is done for each iteration of the Cross-Validation.

In [None]:
def _vectorize_fold(corpus_train, corpus_test):
    """Return bag-of-words and TF-IDF matrices fitted on the training texts only."""
    # Bag of Words
    vectorizer = CountVectorizer(stop_words='english')
    bow_train = vectorizer.fit_transform(corpus_train).toarray()
    bow_test = vectorizer.transform(corpus_test)
    # Term Frequency - Inverse Document Frequency
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf_train = transformer.fit_transform(bow_train)
    tfidf_test = transformer.transform(bow_test)
    return bow_train, bow_test, tfidf_train, tfidf_test


def _fit_and_count(model, train_features, train_labels, test_features, labels_test):
    """Fit ``model``, predict the test fold and return (predictions, TP, TN, FP, FN)."""
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    tp, tn, fp, fn = confusionMatrix(predictions, labels_test)
    return predictions, tp, tn, fp, fn


def thread_code(j, getIndexes, df, fold_size_0, fold_size_relevant, i, costLearningWeight):
    """Train and evaluate every classifier on fold ``j`` of one category.

    The confusion-matrix counts of each classifier are added to the
    module-level ``TP_*/TN_*/FP_*/FN_*`` counters. The test texts and the
    predictions of one model (TF-IDF Logistic Regression when class weights
    are used, bag-of-words Naive Bayes otherwise) are appended to
    ``text_list`` and ``pred_list``.

    Args:
        j: fold number, used to pick the held-out slice.
        getIndexes: function returning ``(training, testing)`` index lists for
            ``df``, or ``False`` to build the folds here and oversample the
            training data with SMOTE.
        df: dataframe with ``Text`` and ``Label`` columns.
        fold_size_0: size of a ``'_Label_Zero'`` fold (SMOTE path only).
        fold_size_relevant: size of a relevant-label fold (SMOTE path only).
        i: not used by this function.
        costLearningWeight: ``False`` to run the unweighted model suite (Naive
            Bayes, SVM, Logistic Regression, Random Forest, Gradient
            Boosting); ``"balanced"`` or a weight for the relevant class to
            run the class-weighted suite (Decision Tree, SVM, Logistic
            Regression).

    Returns:
        None.
    """
    global TP_NB, TN_NB, FP_NB, FN_NB, TP_NBtf, TN_NBtf, FP_NBtf, FN_NBtf
    global TP_TREE, TN_TREE, FP_TREE, FN_TREE, TP_TREEtf, TN_TREEtf, FP_TREEtf, FN_TREEtf
    global TP_SVM, TN_SVM, FP_SVM, FN_SVM, TP_SVMtf, TN_SVMtf, FP_SVMtf, FN_SVMtf
    global TP_LR, TN_LR, FP_LR, FN_LR, TP_LRtf, TN_LRtf, FP_LRtf, FN_LRtf
    global TP_RF, TN_RF, FP_RF, FN_RF, TP_RFtf, TN_RFtf, FP_RFtf, FN_RFtf
    global TP_Grad, TN_Grad, FP_Grad, FN_Grad, TP_Gradtf, TN_Gradtf, FP_Gradtf, FN_Gradtf
    global pred_list, text_list  # , unigrams

    label_0s = df.index[df['Label'] == '_Label_Zero'].tolist()  # indexes whose label is _Label_Zero
    label_relevants = df.index[df['Label'] != '_Label_Zero'].tolist()
    if (getIndexes == False):  # IF SMOTE:
        # Hold out slice j of each class; train on everything else.
        index_train_zero = label_0s[:j * fold_size_0]
        index_train_zero.extend(label_0s[(j + 1) * fold_size_0:])
        index_train_relevant = label_relevants[:j * fold_size_relevant]
        index_train_relevant.extend(label_relevants[(j + 1) * fold_size_relevant:])
        index_train = index_train_relevant
        index_train.extend(index_train_zero)

        test_zero = label_0s[j * fold_size_0:(j + 1) * fold_size_0]
        test_relevant = label_relevants[j * fold_size_relevant:(j + 1) * fold_size_relevant]
        index_test = test_zero
        index_test.extend(test_relevant)

        corpus = df['Text']
        corpus_train = [corpus[k] for k in index_train]
        corpus_test = [corpus[k] for k in index_test]
        labels = df['Label']
        labels_train = [labels[k] for k in index_train]
        labels_test = [labels[k] for k in index_test]

        bow_train, bow_test, tfidf_train, tfidf_test = _vectorize_fold(corpus_train, corpus_test)

        # Oversample the training data of each representation separately.
        smote = SMOTE(k_neighbors=3)
        bow_train, labels_train_bow = smote.fit_resample(bow_train, labels_train)
        tfidf_train, labels_train_tf = smote.fit_resample(tfidf_train, labels_train)
    else:
        trainll, testll = getIndexes(df)  # contains indexes for training and testing
        # NOTE(review): texts are selected with .loc and labels with .iloc;
        # the two agree only when df has a default 0..n-1 index - confirm.
        # select reviews
        corpus_train = df['Text'].loc[trainll[j]]
        corpus_test = df['Text'].loc[testll[j]]
        # select labels
        labels_train_bow = df['Label'].iloc[trainll[j]]
        labels_train_tf = labels_train_bow
        labels_test = df['Label'].iloc[testll[j]]

        bow_train, bow_test, tfidf_train, tfidf_test = _vectorize_fold(corpus_train, corpus_test)

        # Disabled: chi2 selection of the best correlated terms.
        # features_chi2 = chi2(bow_train, labels_train_bow)
        # indices = np.argsort(features_chi2[0])
        # feature_names = np.array(vectorizer.get_feature_names())[indices]
        # unigramst = [v for v in feature_names if len(v.split(' ')) == 1]
        # unigrams.extend(unigramst[-5:])

    # (train features, train labels, test features, test labels) per representation.
    bow = (bow_train, labels_train_bow, bow_test, labels_test)
    tfidf = (tfidf_train, labels_train_tf, tfidf_test, labels_test)

    if (costLearningWeight != False):
        if(str(costLearningWeight) == "balanced"):
            weights = 'balanced'
        else:
            weights = {'_Label_Zero': 1, df.iloc[label_relevants[0]]['Label']: costLearningWeight}

        # Decision Tree - BOW
        _, tp, tn, fp, fn = _fit_and_count(tree.DecisionTreeClassifier(class_weight=weights), *bow)
        TP_TREE += tp
        TN_TREE += tn
        FP_TREE += fp
        FN_TREE += fn

        # Decision Tree - TFIDF
        _, tp, tn, fp, fn = _fit_and_count(tree.DecisionTreeClassifier(class_weight=weights), *tfidf)
        TP_TREEtf += tp
        TN_TREEtf += tn
        FP_TREEtf += fp
        FN_TREEtf += fn

        # SVM - BOW
        _, tp, tn, fp, fn = _fit_and_count(LinearSVC(class_weight=weights), *bow)
        TP_SVM += tp
        TN_SVM += tn
        FP_SVM += fp
        FN_SVM += fn

        # SVM - TFIDF
        _, tp, tn, fp, fn = _fit_and_count(LinearSVC(class_weight=weights), *tfidf)
        TP_SVMtf += tp
        TN_SVMtf += tn
        FP_SVMtf += fp
        FN_SVMtf += fn

        # Logistic Regression - BOW
        _, tp, tn, fp, fn = _fit_and_count(LogisticRegression(random_state=0, class_weight=weights), *bow)
        TP_LR += tp
        TN_LR += tn
        FP_LR += fp
        FN_LR += fn

        # Logistic Regression - TFIDF (its predictions are the ones recorded)
        predictions, tp, tn, fp, fn = _fit_and_count(
            LogisticRegression(random_state=0, class_weight=weights), *tfidf)
        text_list.extend(corpus_test)
        pred_list.extend(predictions)
        TP_LRtf += tp
        TN_LRtf += tn
        FP_LRtf += fp
        FN_LRtf += fn
    else:
        # Multinomial Naive Bayes - BOW (its predictions are the ones recorded)
        predictions, tp, tn, fp, fn = _fit_and_count(MultinomialNB(), *bow)
        text_list.extend(corpus_test)
        pred_list.extend(predictions)
        TP_NB += tp
        TN_NB += tn
        FP_NB += fp
        FN_NB += fn

        # Multinomial Naive Bayes - TFIDF
        _, tp, tn, fp, fn = _fit_and_count(MultinomialNB(), *tfidf)
        TP_NBtf += tp
        TN_NBtf += tn
        FP_NBtf += fp
        FN_NBtf += fn

        # SVM - BOW
        _, tp, tn, fp, fn = _fit_and_count(LinearSVC(), *bow)
        TP_SVM += tp
        TN_SVM += tn
        FP_SVM += fp
        FN_SVM += fn

        # SVM - TFIDF
        _, tp, tn, fp, fn = _fit_and_count(LinearSVC(), *tfidf)
        TP_SVMtf += tp
        TN_SVMtf += tn
        FP_SVMtf += fp
        FN_SVMtf += fn

        # Logistic Regression - BOW
        _, tp, tn, fp, fn = _fit_and_count(LogisticRegression(random_state=0), *bow)
        TP_LR += tp
        TN_LR += tn
        FP_LR += fp
        FN_LR += fn

        # Logistic Regression - TFIDF
        _, tp, tn, fp, fn = _fit_and_count(LogisticRegression(random_state=0), *tfidf)
        TP_LRtf += tp
        TN_LRtf += tn
        FP_LRtf += fp
        FN_LRtf += fn

        # Random Forest - BOW
        _, tp, tn, fp, fn = _fit_and_count(
            RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), *bow)
        TP_RF += tp
        TN_RF += tn
        FP_RF += fp
        FN_RF += fn

        # Random Forest - TFIDF
        _, tp, tn, fp, fn = _fit_and_count(
            RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), *tfidf)
        TP_RFtf += tp
        TN_RFtf += tn
        FP_RFtf += fp
        FN_RFtf += fn

        # Gradient Boosting - BOW. The bag-of-words features are paired with
        # the bag-of-words labels (the TF-IDF labels were used here before).
        _, tp, tn, fp, fn = _fit_and_count(GradientBoostingClassifier(), *bow)
        TP_Grad += tp
        TN_Grad += tn
        FP_Grad += fp
        FN_Grad += fn

        # Gradient Boosting - TFIDF
        _, tp, tn, fp, fn = _fit_and_count(GradientBoostingClassifier(), *tfidf)
        TP_Gradtf += tp
        TN_Gradtf += tn
        FP_Gradtf += fp
        FN_Gradtf += fn

A function that organizes which dataset and data will be used for training and testing each time and prints all the results: metrics of each model and the words most correlated to each category.

In [None]:
def _report_model(name, tp, tn, fp, fn, ratio=None):
    """Print the metrics and the confusion matrix of one model/feature pair.

    name           -- heading printed before the metrics, e.g. "SVM - BOW"
    tp, tn, fp, fn -- confusion-matrix counts accumulated over the folds
    ratio          -- when not None, the cost is printed too (printEvalCost)
    """
    precision, recall, fmeasure, f2measure = evaluate(tp, tn, fp, fn)
    print(name + ":\n\tPrecision = " + str(precision) + "\n\tRecall = " + str(recall)
          + "\n\tF-Measure = " + str(fmeasure) + "\n\tF2-Measure = " + str(f2measure))
    printConfusionMatrix(tp, tn, fp, fn)
    if ratio is not None:
        printEvalCost(tp, tn, fp, fn, ratio)


def print_evaluation(dfs, classes, getIndexes, title, costLearningWeight):
    """Run the 10-fold evaluation for every category and print the results.

    dfs                -- list of dataframes, one per class; each needs a
                          'Label' column in which '_Label_Zero' marks the
                          rows that do not belong to the class
    classes            -- class names, index-aligned with dfs
    getIndexes         -- forwarded unchanged to thread_code
    title              -- heading printed once before the per-class results
    costLearningWeight -- forwarded to thread_code; any value that compares
                          unequal to False selects the cost-sensitive report
                          (correlated unigrams, SVM/TREE/LR metrics plus cost),
                          otherwise the NB/SVM/LR/RF/Grad report is printed

    For each class the module-level counters are reset, the ten folds are run
    through thread_code, and the texts/predictions found in the module-level
    text_list/pred_list are written to '<label>_predfile.csv' (';' separated).
    """
    global TP_NB, TN_NB, FP_NB, FN_NB, TP_NBtf, TN_NBtf, FP_NBtf, FN_NBtf
    global TP_TREE, TN_TREE, FP_TREE, FN_TREE, TP_TREEtf, TN_TREEtf, FP_TREEtf, FN_TREEtf
    global TP_SVM, TN_SVM, FP_SVM, FN_SVM, TP_SVMtf, TN_SVMtf, FP_SVMtf, FN_SVMtf
    global TP_LR, TN_LR, FP_LR, FN_LR, TP_LRtf, TN_LRtf, FP_LRtf, FN_LRtf
    global TP_RF, TN_RF, FP_RF, FN_RF, TP_RFtf, TN_RFtf, FP_RFtf, FN_RFtf
    global TP_Grad, TN_Grad, FP_Grad, FN_Grad, TP_Gradtf, TN_Gradtf, FP_Gradtf, FN_Gradtf
    global pred_list, text_list

    # Kept as an (in)equality test so the accepted values stay exactly the same.
    cost_sensitive = costLearningWeight != False

    print("============================================================================")
    print(title)
    for i, df in enumerate(dfs):  # each dataframe corresponds to a different class
        label = classes[i]

        label_0s = df.index[df['Label'] == '_Label_Zero'].tolist()
        label_relevants = df.index[df['Label'] != '_Label_Zero'].tolist()
        ratio = len(label_relevants) / len(label_0s)
        fold_size_0 = floor(len(label_0s) / 10)
        fold_size_relevant = floor(len(label_relevants) / 10)

        # Reset the shared counters before the folds of this class are run.
        TP_NB, TN_NB, FP_NB, FN_NB = 0, 0, 0, 0
        TP_NBtf, TN_NBtf, FP_NBtf, FN_NBtf = 0, 0, 0, 0
        TP_TREE, TN_TREE, FP_TREE, FN_TREE = 0, 0, 0, 0
        TP_TREEtf, TN_TREEtf, FP_TREEtf, FN_TREEtf = 0, 0, 0, 0
        TP_SVM, TN_SVM, FP_SVM, FN_SVM = 0, 0, 0, 0
        TP_SVMtf, TN_SVMtf, FP_SVMtf, FN_SVMtf = 0, 0, 0, 0
        TP_LR, TN_LR, FP_LR, FN_LR = 0, 0, 0, 0
        TP_LRtf, TN_LRtf, FP_LRtf, FN_LRtf = 0, 0, 0, 0
        TP_RF, TN_RF, FP_RF, FN_RF = 0, 0, 0, 0
        TP_RFtf, TN_RFtf, FP_RFtf, FN_RFtf = 0, 0, 0, 0
        TP_Grad, TN_Grad, FP_Grad, FN_Grad = 0, 0, 0, 0
        TP_Gradtf, TN_Gradtf, FP_Gradtf, FN_Gradtf = 0, 0, 0, 0
        pred_list = []  # contains the predicted labels
        text_list = []  # contains the predicted texts

        print("----------------------------------------------------------------------------")
        print("label: " + str(label))

        # One task per fold. 'sharedmem' asks joblib for a backend whose workers
        # share this process's memory.
        # NOTE(review): presumably required because thread_code accumulates into
        # the module-level counters and lists - confirm against thread_code.
        Parallel(n_jobs=-1, require='sharedmem')(
            delayed(thread_code)(j, getIndexes, df, fold_size_0, fold_size_relevant, i, costLearningWeight)
            for j in range(10))

        newdf = pd.DataFrame(list(zip(text_list, pred_list)),
                             columns=['Text', 'Label'])

        if cost_sensitive:
            # preparation for chi2
            corpus_pred = newdf['Text']
            labels_pred = newdf['Label']
            vectorizer = CountVectorizer(stop_words='english')
            bow_pred = vectorizer.fit_transform(corpus_pred)
            bow_pred = bow_pred.toarray()

            # chi2 to select the best correlated terms for the predicted labels
            features_chi2 = chi2(bow_pred, labels_pred)
            indices = np.argsort(features_chi2[0])
            # get_feature_names() was removed in scikit-learn 1.2; prefer its
            # replacement and fall back to the old name on older releases.
            if hasattr(vectorizer, 'get_feature_names_out'):
                vocabulary = vectorizer.get_feature_names_out()
            else:
                vocabulary = vectorizer.get_feature_names()
            feature_names = np.array(vocabulary)[indices]
            unigramst = [v for v in feature_names if len(v.split(' ')) == 1]
            # argsort is ascending, so the last five entries score highest.
            unigrams = list(dict.fromkeys(unigramst[-5:]))
            print("  . Most correlated unigrams:\n." + str(unigrams))

        newdf.to_csv(str(label) + '_predfile.csv', index=False, sep=';')

        # The counters are read only now, after every fold has finished.
        if cost_sensitive:
            reports = [
                ("SVM - BOW", TP_SVM, TN_SVM, FP_SVM, FN_SVM),
                ("SVM - TF-IDF", TP_SVMtf, TN_SVMtf, FP_SVMtf, FN_SVMtf),
                ("TREE - BOW", TP_TREE, TN_TREE, FP_TREE, FN_TREE),
                ("TREE - TF-IDF", TP_TREEtf, TN_TREEtf, FP_TREEtf, FN_TREEtf),
                ("LR - BOW", TP_LR, TN_LR, FP_LR, FN_LR),
                ("LR - TF-IDF", TP_LRtf, TN_LRtf, FP_LRtf, FN_LRtf),
            ]
            cost_ratio = ratio
        else:
            reports = [
                ("Naive Bayes - BOW", TP_NB, TN_NB, FP_NB, FN_NB),
                ("Naive Bayes - TF-IDF", TP_NBtf, TN_NBtf, FP_NBtf, FN_NBtf),
                ("SVM - BOW", TP_SVM, TN_SVM, FP_SVM, FN_SVM),
                ("SVM - TF-IDF", TP_SVMtf, TN_SVMtf, FP_SVMtf, FN_SVMtf),
                ("LR - BOW", TP_LR, TN_LR, FP_LR, FN_LR),
                ("LR - TF-IDF", TP_LRtf, TN_LRtf, FP_LRtf, FN_LRtf),
                ("RF - BOW", TP_RF, TN_RF, FP_RF, FN_RF),
                ("RF - TF-IDF", TP_RFtf, TN_RFtf, FP_RFtf, FN_RFtf),
                ("Grad - BOW", TP_Grad, TN_Grad, FP_Grad, FN_Grad),
                ("Grad - TF-IDF", TP_Gradtf, TN_Gradtf, FP_Gradtf, FN_Gradtf),
            ]
            cost_ratio = None  # the cost is only printed in cost-sensitive mode

        for name, tp, tn, fp, fn in reports:
            _report_model(name, tp, tn, fp, fn, cost_ratio)

## Visualizing the dataset

In [None]:
# Load the raw reviews file and show its first rows.
# The separator is passed by keyword: pandas deprecated positional arguments
# after the path and pandas 2.0 rejects them.
df = pd.read_csv('reviews.csv', sep='¨', engine='python')
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Text,Functional,Performance,Compatibility,Usability,other,Star rating,Product
0,Good,,,,,x,1,119
1,Aaj hum aur tum Hmm,,,,,x,1,119
2,It is not updating,x,,,,,1,119
3,lel,,,,,x,1,119
4,Documents disappear. Sharing them is difficult,x,,,x,,1,119


## Main

The code below is the entry point of the program: it runs every experiment configuration and prints the results. 

In [None]:
# Entry point: load the per-class dataframes, run every experiment
# configuration, and report the total wall-clock time.
start_time = time.time()
dfs, indexes = load_data()

print_evaluation(dfs, indexes, getIndexesCV, "Normal 10fold - Cross Validation", False) #last parameter stands for cost sensitive learning mode = off
print_evaluation(dfs, indexes, getIndexesUS, "10fold - Cross Validation with UnderSampling", False)
print_evaluation(dfs, indexes, False, "10fold - Cross Validation with SMOTE", False)    #"Get indexes = False" stands for "smote_mode = on"
print_evaluation(dfs, indexes, getIndexesCV, "Cost Sensitive Learning 'balanced' > 10fold - Cross Validation", "balanced")
print_evaluation(dfs, indexes, getIndexesCV, "Cost Sensitive Learning 1:2 > 10fold - Cross Validation", 2)
print_evaluation(dfs, indexes, getIndexesCV, "Cost Sensitive Learning 1:5 > 10fold - Cross Validation", 5)
print_evaluation(dfs, indexes, getIndexesCV, "Cost Sensitive Learning 1:10 > 10fold - Cross Validation", 10)


# Read the clock once so the minutes and the seconds describe the same
# elapsed interval (two separate reads can disagree across a minute boundary).
elapsed = time.time() - start_time
print("Execution Time: " + str(floor(elapsed / 60)) + "min " + str(elapsed % 60) + "sec")

  dfs, indexes = load_data()
  return func(*args, **kwargs)
  df = df.drop(['other', 'Star rating', 'Product'], 'columns')
  functional_df = df.drop(['Compatibility', 'Performance', 'Usability'], 'columns')
  performance_df = df.drop(['Compatibility', 'Functional', 'Usability'], 'columns')
  compatibility_df = df.drop(['Functional', 'Performance', 'Usability'], 'columns')
  usability_df = df.drop(['Compatibility', 'Performance', 'Functional'], 'columns')


Normal 10fold - Cross Validation
----------------------------------------------------------------------------
label: Functional
Naive Bayes - BOW:
	Precision = 0.5847750865051903
	Recall = 0.3976470588235294
	F-Measure = 0.4733893557422969
	F2-Measure = 0.42483660130718953
Confusion Matrix:
	169|256
	120|955

Naive Bayes - TF-IDF:
	Precision = 0.75
	Recall = 0.03529411764705882
	F-Measure = 0.06741573033707865
	F2-Measure = 0.0436046511627907
Confusion Matrix:
	15|410
	5|1070

SVM - BOW:
	Precision = 0.5611111111111111
	Recall = 0.4752941176470588
	F-Measure = 0.5146496815286624
	F2-Measure = 0.49029126213592233
Confusion Matrix:
	202|223
	158|917

SVM - TF-IDF:
	Precision = 0.5796178343949044
	Recall = 0.42823529411764705
	F-Measure = 0.49255751014884974
	F2-Measure = 0.45183714001986097
Confusion Matrix:
	182|243
	132|943

LR - BOW:
	Precision = 0.5884615384615385
	Recall = 0.36
	F-Measure = 0.4467153284671533
	F2-Measure = 0.3903061224489796
Confusion Matrix:
	153|272
	107|968

LR -



  . Most correlated unigrams:
.['let', 'sign', 'able', 'option', 'google']
SVM - BOW:
	Precision = 0.5454545454545454
	Recall = 0.49411764705882355
	F-Measure = 0.5185185185185186
	F2-Measure = 0.5035971223021583
Confusion Matrix:
	210|215
	175|900

Cost: 718.8235294117648

SVM - TF-IDF:
	Precision = 0.5368663594470046
	Recall = 0.548235294117647
	F-Measure = 0.5424912689173458
	F2-Measure = 0.5459231490159325
Confusion Matrix:
	233|192
	201|874

Cost: 686.6470588235295

TREE - BOW:
	Precision = 0.44017094017094016
	Recall = 0.48470588235294115
	F-Measure = 0.46136618141097424
	F2-Measure = 0.47509225092250923
Confusion Matrix:
	206|219
	262|813

Cost: 815.9411764705883

TREE - TF-IDF:
	Precision = 0.42444444444444446
	Recall = 0.44941176470588234
	F-Measure = 0.4365714285714286
	F2-Measure = 0.4441860465116279
Confusion Matrix:
	191|234
	259|816

Cost: 850.8823529411765

LR - BOW:
	Precision = 0.5461741424802111
	Recall = 0.48705882352941177
	F-Measure = 0.5149253731343284
	F2-Measure



  . Most correlated unigrams:
.['39', 'crashes', 'update', 'sync', 'time']
SVM - BOW:
	Precision = 0.5229357798165137
	Recall = 0.4578313253012048
	F-Measure = 0.48822269807280516
	F2-Measure = 0.4695222405271829
Confusion Matrix:
	114|135
	104|1147

Cost: 782.2530120481928

SVM - TF-IDF:
	Precision = 0.4957983193277311
	Recall = 0.4738955823293173
	F-Measure = 0.4845995893223819
	F2-Measure = 0.47811993517017826
Confusion Matrix:
	118|131
	120|1131

Cost: 778.156626506024

TREE - BOW:
	Precision = 0.39935064935064934
	Recall = 0.4939759036144578
	F-Measure = 0.4416517055655297
	F2-Measure = 0.47162576687116564
Confusion Matrix:
	123|126
	185|1066

Cost: 818.0361445783133

TREE - TF-IDF:
	Precision = 0.33134328358208953
	Recall = 0.4457831325301205
	F-Measure = 0.38013698630136983
	F2-Measure = 0.41697971450037563
Confusion Matrix:
	111|138
	224|1027

Cost: 917.3253012048192

LR - BOW:
	Precision = 0.5186721991701245
	Recall = 0.5020080321285141
	F-Measure = 0.5102040816326531
	F2-Meas



  . Most correlated unigrams:
.['ios', 'onenote', 'apple', 'ipad', 'iphone']
SVM - BOW:
	Precision = 0.5643564356435643
	Recall = 0.4634146341463415
	F-Measure = 0.5089285714285714
	F2-Measure = 0.4806070826306914
Confusion Matrix:
	57|66
	44|1333

Cost: 782.8780487804878

SVM - TF-IDF:
	Precision = 0.6296296296296297
	Recall = 0.4146341463414634
	F-Measure = 0.5
	F2-Measure = 0.44502617801047123
Confusion Matrix:
	51|72
	30|1347

Cost: 836.0487804878048

TREE - BOW:
	Precision = 0.47333333333333333
	Recall = 0.5772357723577236
	F-Measure = 0.5201465201465202
	F2-Measure = 0.5529595015576324
Confusion Matrix:
	71|52
	79|1298

Cost: 661.1463414634146

TREE - TF-IDF:
	Precision = 0.48484848484848486
	Recall = 0.6504065040650406
	F-Measure = 0.5555555555555556
	F2-Measure = 0.60882800608828
Confusion Matrix:
	80|43
	85|1292

Cost: 566.390243902439

LR - BOW:
	Precision = 0.5333333333333333
	Recall = 0.5203252032520326
	F-Measure = 0.5267489711934156
	F2-Measure = 0.5228758169934641
Confus



  . Most correlated unigrams:
.['navigate', 'new', 'user', 'friendly', 'easy']
SVM - BOW:
	Precision = 0.5
	Recall = 0.46496815286624205
	F-Measure = 0.48184818481848185
	F2-Measure = 0.4715762273901809
Confusion Matrix:
	73|84
	73|1270

Cost: 791.547770700637

SVM - TF-IDF:
	Precision = 0.5403225806451613
	Recall = 0.4267515923566879
	F-Measure = 0.47686832740213525
	F2-Measure = 0.4454787234042553
Confusion Matrix:
	67|90
	57|1286

Cost: 826.8726114649681

TREE - BOW:
	Precision = 0.3641025641025641
	Recall = 0.45222929936305734
	F-Measure = 0.4034090909090909
	F2-Measure = 0.4313487241798299
Confusion Matrix:
	71|86
	124|1219

Cost: 859.656050955414

TREE - TF-IDF:
	Precision = 0.3888888888888889
	Recall = 0.445859872611465
	F-Measure = 0.4154302670623145
	F2-Measure = 0.43316831683168316
Confusion Matrix:
	70|87
	110|1233

Cost: 854.2101910828026

LR - BOW:
	Precision = 0.5374149659863946
	Recall = 0.5031847133757962
	F-Measure = 0.5197368421052632
	F2-Measure = 0.5096774193548387




  . Most correlated unigrams:
.['search', 'let', 'able', 'google', 'option']
SVM - BOW:
	Precision = 0.5445026178010471
	Recall = 0.4894117647058824
	F-Measure = 0.5154894671623297
	F2-Measure = 0.49951969260326606
Confusion Matrix:
	208|217
	174|901

Cost: 722.8823529411765

SVM - TF-IDF:
	Precision = 0.5461346633416458
	Recall = 0.5152941176470588
	F-Measure = 0.5302663438256657
	F2-Measure = 0.5211803902903379
Confusion Matrix:
	219|206
	182|893

Cost: 703.0588235294118

TREE - BOW:
	Precision = 0.4485776805251641
	Recall = 0.4823529411764706
	F-Measure = 0.46485260770975056
	F2-Measure = 0.47519703291608717
Confusion Matrix:
	205|220
	252|823

Cost: 808.4705882352941

TREE - TF-IDF:
	Precision = 0.4396355353075171
	Recall = 0.4541176470588235
	F-Measure = 0.44675925925925924
	F2-Measure = 0.45114539504441326
Confusion Matrix:
	193|232
	246|829

Cost: 832.8235294117648

LR - BOW:
	Precision = 0.5484764542936288
	Recall = 0.46588235294117647
	F-Measure = 0.5038167938931297
	F2-Measur



  . Most correlated unigrams:
.['sync', 'drains', 'time', 'battery', 'crashes']
SVM - BOW:
	Precision = 0.5330188679245284
	Recall = 0.4538152610441767
	F-Measure = 0.49023861171366595
	F2-Measure = 0.46771523178807944
Confusion Matrix:
	113|136
	99|1152

Cost: 782.277108433735

SVM - TF-IDF:
	Precision = 0.5698324022346368
	Recall = 0.40963855421686746
	F-Measure = 0.4766355140186916
	F2-Measure = 0.4340425531914894
Confusion Matrix:
	102|147
	77|1174

Cost: 815.5421686746987

TREE - BOW:
	Precision = 0.42857142857142855
	Recall = 0.40963855421686746
	F-Measure = 0.4188911704312115
	F2-Measure = 0.413290113452188
Confusion Matrix:
	102|147
	136|1115

Cost: 874.5421686746987

TREE - TF-IDF:
	Precision = 0.4115384615384615
	Recall = 0.42971887550200805
	F-Measure = 0.4204322200392927
	F2-Measure = 0.42595541401273884
Confusion Matrix:
	107|142
	153|1098

Cost: 866.4216867469879

LR - BOW:
	Precision = 0.588957055214724
	Recall = 0.3855421686746988
	F-Measure = 0.4660194174757282
	F2-Mea



  . Most correlated unigrams:
.['comparable', 'apple', 'searchable', 'iphone', 'watch']
SVM - BOW:
	Precision = 0.5957446808510638
	Recall = 0.45528455284552843
	F-Measure = 0.5161290322580645
	F2-Measure = 0.4778156996587031
Confusion Matrix:
	56|67
	38|1339

Cost: 788.0731707317073

SVM - TF-IDF:
	Precision = 0.7
	Recall = 0.2845528455284553
	F-Measure = 0.40462427745664736
	F2-Measure = 0.32287822878228783
Confusion Matrix:
	35|88
	15|1362

Cost: 1000.170731707317

TREE - BOW:
	Precision = 0.5636363636363636
	Recall = 0.5040650406504065
	F-Measure = 0.5321888412017167
	F2-Measure = 0.5149501661129569
Confusion Matrix:
	62|61
	48|1329

Cost: 730.9024390243902

TREE - TF-IDF:
	Precision = 0.5948275862068966
	Recall = 0.5609756097560976
	F-Measure = 0.5774058577405858
	F2-Measure = 0.5674342105263158
Confusion Matrix:
	69|54
	47|1330

Cost: 651.5365853658536

LR - BOW:
	Precision = 0.6349206349206349
	Recall = 0.3252032520325203
	F-Measure = 0.43010752688172044
	F2-Measure = 0.36036036



  . Most correlated unigrams:
.['chauffeur', 'imho', 'user', 'friendly', 'easy']
SVM - BOW:
	Precision = 0.5289855072463768
	Recall = 0.46496815286624205
	F-Measure = 0.4949152542372881
	F2-Measure = 0.4765013054830287
Confusion Matrix:
	73|84
	65|1278

Cost: 783.547770700637

SVM - TF-IDF:
	Precision = 0.6455696202531646
	Recall = 0.3248407643312102
	F-Measure = 0.43220338983050843
	F2-Measure = 0.3606789250353607
Confusion Matrix:
	51|106
	28|1315

Cost: 934.7388535031847

TREE - BOW:
	Precision = 0.44375
	Recall = 0.45222929936305734
	F-Measure = 0.4479495268138801
	F2-Measure = 0.45050761421319796
Confusion Matrix:
	71|86
	89|1254

Cost: 824.656050955414

TREE - TF-IDF:
	Precision = 0.4691358024691358
	Recall = 0.4840764331210191
	F-Measure = 0.47648902821316613
	F2-Measure = 0.4810126582278481
Confusion Matrix:
	76|81
	86|1257

Cost: 778.8853503184714

LR - BOW:
	Precision = 0.6590909090909091
	Recall = 0.36942675159235666
	F-Measure = 0.473469387755102
	F2-Measure = 0.40502793296



  . Most correlated unigrams:
.['option', 'app', 'google', '39', 'good']
SVM - BOW:
	Precision = 0.5392405063291139
	Recall = 0.5011764705882353
	F-Measure = 0.5195121951219512
	F2-Measure = 0.5083532219570406
Confusion Matrix:
	213|212
	182|893

Cost: 718.2352941176471

SVM - TF-IDF:
	Precision = 0.5177453027139874
	Recall = 0.5835294117647059
	F-Measure = 0.5486725663716815
	F2-Measure = 0.5690683799908215
Confusion Matrix:
	248|177
	231|844

Cost: 678.7058823529412

TREE - BOW:
	Precision = 0.4148936170212766
	Recall = 0.4588235294117647
	F-Measure = 0.435754189944134
	F2-Measure = 0.44930875576036866
Confusion Matrix:
	195|230
	275|800

Cost: 856.7647058823529

TREE - TF-IDF:
	Precision = 0.4351648351648352
	Recall = 0.46588235294117647
	F-Measure = 0.45
	F2-Measure = 0.4593967517401392
Confusion Matrix:
	198|227
	257|818

Cost: 831.1764705882354

LR - BOW:
	Precision = 0.5251641137855579
	Recall = 0.5647058823529412
	F-Measure = 0.54421768707483
	F2-Measure = 0.5563282336578581
Co



  . Most correlated unigrams:
.['battery', 'crashes', 'update', 'sync', 'time']
SVM - BOW:
	Precision = 0.5110132158590308
	Recall = 0.46586345381526106
	F-Measure = 0.4873949579831933
	F2-Measure = 0.4742436631234669
Confusion Matrix:
	116|133
	111|1140

Cost: 779.2048192771084

SVM - TF-IDF:
	Precision = 0.5
	Recall = 0.4538152610441767
	F-Measure = 0.47578947368421054
	F2-Measure = 0.4623567921440262
Confusion Matrix:
	113|136
	113|1138

Cost: 796.277108433735

TREE - BOW:
	Precision = 0.3887147335423197
	Recall = 0.4979919678714859
	F-Measure = 0.4366197183098591
	F2-Measure = 0.4714828897338403
Confusion Matrix:
	124|125
	195|1056

Cost: 823.0120481927711

TREE - TF-IDF:
	Precision = 0.3188854489164087
	Recall = 0.41365461847389556
	F-Measure = 0.36013986013986016
	F2-Measure = 0.3904473085670963
Confusion Matrix:
	103|146
	220|1031

Cost: 953.5180722891566

LR - BOW:
	Precision = 0.5213675213675214
	Recall = 0.4899598393574297
	F-Measure = 0.505175983436853
	F2-Measure = 0.495934



  . Most correlated unigrams:
.['android', 'apple', 'ipad', 'watch', 'iphone']
SVM - BOW:
	Precision = 0.5894736842105263
	Recall = 0.45528455284552843
	F-Measure = 0.5137614678899083
	F2-Measure = 0.47700170357751276
Confusion Matrix:
	56|67
	39|1338

Cost: 789.0731707317073

SVM - TF-IDF:
	Precision = 0.6984126984126984
	Recall = 0.35772357723577236
	F-Measure = 0.4731182795698925
	F2-Measure = 0.3963963963963964
Confusion Matrix:
	44|79
	19|1358

Cost: 903.4146341463414

TREE - BOW:
	Precision = 0.5263157894736842
	Recall = 0.5691056910569106
	F-Measure = 0.5468749999999999
	F2-Measure = 0.56
Confusion Matrix:
	70|53
	63|1314

Cost: 656.3414634146341

TREE - TF-IDF:
	Precision = 0.5214285714285715
	Recall = 0.5934959349593496
	F-Measure = 0.5551330798479087
	F2-Measure = 0.5775316455696202
Confusion Matrix:
	73|50
	67|1310

Cost: 626.7560975609756

LR - BOW:
	Precision = 0.5698924731182796
	Recall = 0.43089430894308944
	F-Measure = 0.4907407407407407
	F2-Measure = 0.452991452991453




  . Most correlated unigrams:
.['navigate', 'confusing', 'friendly', 'user', 'easy']
SVM - BOW:
	Precision = 0.5
	Recall = 0.4713375796178344
	F-Measure = 0.4852459016393443
	F2-Measure = 0.47680412371134023
Confusion Matrix:
	74|83
	74|1269

Cost: 783.9936305732484

SVM - TF-IDF:
	Precision = 0.5555555555555556
	Recall = 0.3821656050955414
	F-Measure = 0.45283018867924535
	F2-Measure = 0.4076086956521739
Confusion Matrix:
	60|97
	48|1295

Cost: 877.751592356688

TREE - BOW:
	Precision = 0.4293785310734463
	Recall = 0.4840764331210191
	F-Measure = 0.4550898203592814
	F2-Measure = 0.4720496894409938
Confusion Matrix:
	76|81
	101|1242

Cost: 793.8853503184714

TREE - TF-IDF:
	Precision = 0.39344262295081966
	Recall = 0.4585987261146497
	F-Measure = 0.4235294117647059
	F2-Measure = 0.4438964241676942
Confusion Matrix:
	72|85
	111|1232

Cost: 838.1019108280256

LR - BOW:
	Precision = 0.5772357723577236
	Recall = 0.45222929936305734
	F-Measure = 0.5071428571428571
	F2-Measure = 0.4727030625



  . Most correlated unigrams:
.['app', 'google', '39', 'nice', 'good']
SVM - BOW:
	Precision = 0.5445544554455446
	Recall = 0.5176470588235295
	F-Measure = 0.5307599517490954
	F2-Measure = 0.5228136882129277
Confusion Matrix:
	220|205
	184|891

Cost: 702.5294117647059

SVM - TF-IDF:
	Precision = 0.5118577075098815
	Recall = 0.6094117647058823
	F-Measure = 0.5563909774436091
	F2-Measure = 0.5870353581142339
Confusion Matrix:
	259|166
	247|828

Cost: 666.8823529411765

TREE - BOW:
	Precision = 0.4025423728813559
	Recall = 0.4470588235294118
	F-Measure = 0.42363433667781497
	F2-Measure = 0.43738489871086556
Confusion Matrix:
	190|235
	282|793

Cost: 876.4117647058824

TREE - TF-IDF:
	Precision = 0.3856041131105398
	Recall = 0.35294117647058826
	F-Measure = 0.36855036855036855
	F2-Measure = 0.35902345619913834
Confusion Matrix:
	150|275
	239|836

Cost: 934.5882352941177

LR - BOW:
	Precision = 0.4990476190476191
	Recall = 0.6164705882352941
	F-Measure = 0.551578947368421
	F2-Measure = 0.58



  . Most correlated unigrams:
.['sync', 'location', 'app', 'update', 'time']
SVM - BOW:
	Precision = 0.5021834061135371
	Recall = 0.46184738955823296
	F-Measure = 0.48117154811715485
	F2-Measure = 0.46938775510204084
Confusion Matrix:
	115|134
	114|1137

Cost: 787.2289156626506

SVM - TF-IDF:
	Precision = 0.4854771784232365
	Recall = 0.46987951807228917
	F-Measure = 0.4775510204081633
	F2-Measure = 0.47291835084882783
Confusion Matrix:
	117|132
	124|1127

Cost: 787.1807228915662

TREE - BOW:
	Precision = 0.33519553072625696
	Recall = 0.4819277108433735
	F-Measure = 0.39538714991762763
	F2-Measure = 0.4431314623338257
Confusion Matrix:
	120|129
	238|1013

Cost: 886.1084337349397

TREE - TF-IDF:
	Precision = 0.27485380116959063
	Recall = 0.37751004016064255
	F-Measure = 0.31810490693739424
	F2-Measure = 0.351270553064275
Confusion Matrix:
	94|155
	248|1003

Cost: 1026.734939759036

LR - BOW:
	Precision = 0.47038327526132406
	Recall = 0.5421686746987951
	F-Measure = 0.5037313432835822
	F2



  . Most correlated unigrams:
.['watch', 'devices', 'apple', 'ipad', 'iphone']
SVM - BOW:
	Precision = 0.5913978494623656
	Recall = 0.44715447154471544
	F-Measure = 0.5092592592592593
	F2-Measure = 0.4700854700854701
Confusion Matrix:
	55|68
	38|1339

Cost: 799.2682926829268

SVM - TF-IDF:
	Precision = 0.6617647058823529
	Recall = 0.36585365853658536
	F-Measure = 0.4712041884816754
	F2-Measure = 0.4017857142857143
Confusion Matrix:
	45|78
	23|1354

Cost: 896.2195121951219

TREE - BOW:
	Precision = 0.48344370860927155
	Recall = 0.5934959349593496
	F-Measure = 0.5328467153284672
	F2-Measure = 0.567651632970451
Confusion Matrix:
	73|50
	78|1299

Cost: 637.7560975609756

TREE - TF-IDF:
	Precision = 0.47096774193548385
	Recall = 0.5934959349593496
	F-Measure = 0.5251798561151079
	F2-Measure = 0.5641421947449768
Confusion Matrix:
	73|50
	82|1295

Cost: 641.7560975609756

LR - BOW:
	Precision = 0.5391304347826087
	Recall = 0.5040650406504065
	F-Measure = 0.5210084033613446
	F2-Measure = 0.510

