In [1]:
def get_indexes_from_file(fileName):
    """Read one integer per line from a text file.

    Args:
        fileName: Path of the text file to read.

    Returns:
        list: The parsed integers, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line is not a valid integer literal.
    """
    with open(fileName) as f:
        # Whitespace-only lines (e.g. an empty last line) hold no value;
        # skipping them keeps int() from raising ValueError on "".
        # int() itself ignores the trailing newline and surrounding spaces.
        return [int(line) for line in f if line.strip()]

In [2]:
def find_common_elements(list1, list2):
    """Return the distinct values that occur in both inputs.

    The result is a list built from a set, so duplicates are removed and
    the order of the returned items is not specified.
    """
    shared = set(list1)
    shared.intersection_update(list2)
    return list(shared)

def find_common_elements_02(list1, list2, list3):
    """Return the distinct values that occur in all three inputs.

    The result is a list built from a set, so duplicates are removed and
    the order of the returned items is not specified.
    """
    shared = set(list1)
    for other in (list2, list3):
        shared &= set(other)
    return list(shared)

In [3]:
# SVM with Hashing vectorizer: overlap between the indexes stored for the
# oversampling run and the undersampling run.
svm_oversampling = get_indexes_from_file(
    "indexes/SVM_HashingVectorizer_Oversampling.txt"
)
svm_undersampling = get_indexes_from_file(
    "indexes/SVM_HashingVectorizer_Undersampling.txt"
)
common_elements = find_common_elements(svm_oversampling, svm_undersampling)

# Each percentage relates the shared (de-duplicated) count to the raw length
# of that run's own list.
print("Number of common elements: ", len(common_elements))
print(
    "Percent of common elements in oversampling with SVM: ",
    len(common_elements) / len(svm_oversampling) * 100,
)
print(
    "Percent of common elements in undersampling with SVM: ",
    len(common_elements) / len(svm_undersampling) * 100,
)

Number of common elements:  17284
Percent of common elements in oversampling with SVM:  74.30143581807239
Percent of common elements in undersampling with SVM:  61.02460897503795


In [4]:
# SVM with Hashing vectorizer: overlap between the indexes stored for the
# Lancaster-stemmer run and the Snowball-stemmer run.
svm_lancaster = get_indexes_from_file(
    "indexes/SVM_Hashing_Lancaster.txt"
)
svm_snowball = get_indexes_from_file(
    "indexes/SVM_Hashing_SnowballStemmer.txt"
)
common_elements = find_common_elements(svm_lancaster, svm_snowball)

# Each percentage relates the shared (de-duplicated) count to the raw length
# of that run's own list.
print("Number of common elements: ", len(common_elements))
print(
    "Percent of common elements in lancaster stemmer with SVM: ",
    len(common_elements) / len(svm_lancaster) * 100,
)
print(
    "Percent of common elements in snowball stemmer with SVM: ",
    len(common_elements) / len(svm_snowball) * 100,
)

Number of common elements:  22111
Percent of common elements in lancaster stemmer with SVM:  89.40600865310743
Percent of common elements in snowball stemmer with SVM:  90.67087673255146


In [5]:
# SVM with Hashing vectorizer: overlap between the indexes stored for the
# Vader run and the Textblob run.
svm_vader = get_indexes_from_file(
    "indexes/SVM_Hashing_Vader.txt"
)
svm_textblob = get_indexes_from_file(
    "indexes/SVM_Hashing_Textblob.txt"
)
common_elements = find_common_elements(svm_vader, svm_textblob)

# Each percentage relates the shared (de-duplicated) count to the raw length
# of that run's own list.
print("Number of common elements: ", len(common_elements))
print(
    "Percent of common elements in vader lexicon method with SVM: ",
    len(common_elements) / len(svm_vader) * 100,
)
print(
    "Percent of common elements in textblob lexicon method with SVM: ",
    len(common_elements) / len(svm_textblob) * 100,
)

Number of common elements:  17032
Percent of common elements in vader lexicon method with SVM:  73.48664624412132
Percent of common elements in textblob lexicon method with SVM:  62.698325050616596


In [6]:
# Naive Bayes with TfIdf vectorizer: overlap between the indexes stored for
# the oversampling run and the undersampling run.
nb_oversampling = get_indexes_from_file(
    "indexes/NaiveBayes_TfIdf_Oversampling.txt"
)
nb_undersampling = get_indexes_from_file(
    "indexes/NaiveBayes_TfIdf_Undersampling.txt"
)
common_elements = find_common_elements(nb_oversampling, nb_undersampling)

# Each percentage relates the shared (de-duplicated) count to the raw length
# of that run's own list.
print("Number of common elements: ", len(common_elements))
print(
    "Percent of common elements in oversampling with Naive Bayes: ",
    len(common_elements) / len(nb_oversampling) * 100,
)
print(
    "Percent of common elements in undersampling with Naive Bayes: ",
    len(common_elements) / len(nb_undersampling) * 100,
)

Number of common elements:  20737
Percent of common elements in oversampling with Naive Bayes:  83.90790644978556
Percent of common elements in undersampling with Naive Bayes:  80.03782469412172


In [7]:
# Naive Bayes with TfIdf vectorizer: overlap between the indexes stored for
# the Lancaster-stemmer run and the Snowball-stemmer run.
# (The files loaded and the labels printed below are the Naive Bayes ones,
# not SVM.)
nb_lancaster = get_indexes_from_file(
    "indexes/NaiveBayes_TfIdf_Lancaster.txt"
)
nb_snowball = get_indexes_from_file(
    "indexes/NaiveBayes_TfIdf_Snowball.txt"
)
common_elements = find_common_elements(nb_lancaster, nb_snowball)

# Each percentage relates the shared (de-duplicated) count to the raw length
# of that run's own list.
print("Number of common elements: ", len(common_elements))
print(
    "Percent of common elements in lancaster stemmer with Naive Bayes: ",
    len(common_elements) / len(nb_lancaster) * 100,
)
print(
    "Percent of common elements in snowball stemmer with Naive Bayes: ",
    len(common_elements) / len(nb_snowball) * 100,
)

Number of common elements:  21856
Percent of common elements in lancaster stemmer with Naive Bayes:  74.1585233441911
Percent of common elements in snowball stemmer with Naive Bayes:  85.59567635309783


In [8]:
# Naive Bayes with TfIdf vectorizer: overlap between the indexes stored for
# the Vader run and the Textblob run.
nb_vader = get_indexes_from_file(
    "indexes/NaiveBayes_TfIdf_Vader.txt"
)
nb_textblob = get_indexes_from_file(
    "indexes/NaiveBayes_TfIdf_Textblob.txt"
)
common_elements = find_common_elements(nb_vader, nb_textblob)

# Each percentage relates the shared (de-duplicated) count to the raw length
# of that run's own list.
# The labels say "Naive Bayes" because the data comes from the
# NaiveBayes_TfIdf_* files; they previously read "SVM" by mistake.
print("Number of common elements: ", len(common_elements))
print(
    "Percent of common elements in vader lexicon method with Naive Bayes: ",
    len(common_elements) / len(nb_vader) * 100,
)
print(
    "Percent of common elements in textblob lexicon method with Naive Bayes: ",
    len(common_elements) / len(nb_textblob) * 100,
)

Number of common elements:  16659
Percent of common elements in vader lexicon method with SVM:  67.45353686682593
Percent of common elements in textblob lexicon method with SVM:  52.360447573547894


In [9]:
# Lexicon-based methods: indexes shared by all three of the Vader, Afinn and
# Textblob files.
vader = get_indexes_from_file(
    "indexes/Lexicon_Based_Vader.txt"
)
afinn = get_indexes_from_file(
    "indexes/Lexicon_Based_Afinn.txt"
)
textblob = get_indexes_from_file(
    "indexes/Lexicon_based_Textblob.txt"
)
common_elements = find_common_elements_02(vader, afinn, textblob)

# Each percentage relates the three-way shared (de-duplicated) count to the
# raw length of that method's own list.
print("Number of common elements: ", len(common_elements))
print(
    "Percent of common elements in vader lexicon method: ",
    len(common_elements) / len(vader) * 100,
)
print(
    "Percent of common elements in afinn lexicon method: ",
    len(common_elements) / len(afinn) * 100,
)
print(
    "Percent of common elements in textblob lexicon method: ",
    len(common_elements) / len(textblob) * 100,
)

Number of common elements:  43999
Percent of common elements in vader lexicon method:  60.03984553034128
Percent of common elements in afinn lexicon method:  51.453597155954725
Percent of common elements in textblob lexicon method:  53.0767096517365
