In [1]:
import classifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re


#reviews that the replacement model is learned from
test_file = "./evaluation_examples.csv"
test_data = classifier.load_data(test_file)
test_text = test_data[0]

#the classifier's own predictions serve as labels
polarity_prediction = classifier.classify(test_text, './polarity_classifier.sav')
domain_prediction = classifier.classify(test_text, './domain_classifier.sav')

#one bucket of review texts per (domain, polarity) prediction
test_text_0_3, test_text_1_3 = [], []
test_text_0_4, test_text_1_4 = [], []

#route every review into its bucket; whatever is not (0, 3), (1, 3) or (0, 4) ends up in the 1_4 bucket
for idx in range(len(polarity_prediction)):
    domain = domain_prediction[idx]
    polarity = polarity_prediction[idx]
    if domain == 0 and polarity == 3:
        bucket = test_text_0_3
    elif domain == 1 and polarity == 3:
        bucket = test_text_1_3
    elif domain == 0 and polarity == 4:
        bucket = test_text_0_4
    else:
        bucket = test_text_1_4
    bucket.append(test_text[idx])



In [2]:
#create a tf-idf list for each set of reviews
#one shared vectorizer over unigrams, bigrams and trigrams; create_tf_idf_list refits it on every call
vectorizer = TfidfVectorizer(ngram_range=(1,3))
def create_tf_idf_list(documents):
    """Rank the ngrams of ``documents`` by their tf-idf score summed over all documents.

    The module-level ``vectorizer`` is refit on ``documents`` by this call.

    Parameters
    ----------
    documents : iterable of str
        The texts to analyse.

    Returns
    -------
    list of tuple
        ``(ngram, score)`` pairs, highest score first.
    """
    tf_idf_matrix = vectorizer.fit_transform(documents)

    #get_feature_names() was removed in scikit-learn 1.2; use its replacement where it
    #exists and keep the old accessor as a fallback for older releases
    if hasattr(vectorizer, 'get_feature_names_out'):
        ngrams = vectorizer.get_feature_names_out().tolist()
    else:
        ngrams = vectorizer.get_feature_names()

    #use the sum of tf-idf values across all documents as the overall score for each ngram
    totals = np.asarray(tf_idf_matrix.sum(axis=0)).ravel()

    #sort the list of ngrams by the previously calculated score
    return sorted(zip(ngrams, totals), key=lambda pair: pair[1], reverse=True)

#rank the ngrams of each (domain, polarity) bucket; the calls run in the order the buckets are listed
sorted_scores_0_3, sorted_scores_1_3, sorted_scores_0_4, sorted_scores_1_4 = (
    create_tf_idf_list(bucket)
    for bucket in (test_text_0_3, test_text_1_3, test_text_0_4, test_text_1_4)
)

In [3]:
#use the tf-idf scores to compute a measure on polarity for each ngram and filter them out of the lists accordingly
def filter_polarity(pos_list, neg_list, low=.2, high=5):
    """Remove ngrams from two scored lists depending on how their scores compare.

    For every entry of ``neg_list`` whose ngram occurs exactly once in
    ``pos_list`` the ratio ``q = pos_score / neg_score`` is computed:

    * ``q < low``  -- negative polarity: the ngram is removed from ``pos_list``
    * ``q > high`` -- positive polarity: the ngram is removed from ``neg_list``
    * otherwise    -- neutral: the ngram is removed from both lists

    Ngrams found in only one of the lists are left untouched.

    Parameters
    ----------
    pos_list, neg_list : list of (ngram, score) tuples
        Modified in place; the order of the surviving entries is kept.
        The ngrams have to be hashable.
    low, high : number, optional
        Ratio thresholds, defaulting to the previously hard-coded .2 and 5.

    Returns
    -------
    None
    """
    #only ngrams that occur exactly once in pos_list are compared; index them once so that
    #each lookup is a dictionary access instead of a scan over the whole list
    occurrences = {}
    for item in pos_list:
        occurrences[item[0]] = occurrences.get(item[0], 0) + 1
    comparable = {item[0]: (position, item)
                  for position, item in enumerate(pos_list)
                  if occurrences[item[0]] == 1}

    drop_from_pos = set()
    drop_from_neg = set()
    for neg_position, word in enumerate(neg_list):
        found = comparable.get(word[0])
        if found is None:
            continue
        pos_position, pos_item = found

        try:
            q = pos_item[1]/word[1]
        except ZeroDivisionError:
            #plain Python numbers raise on a zero score: treat x/0 as +-inf and 0/0 as nan
            #(nan fails both comparisons below and therefore counts as neutral)
            if pos_item[1] == 0:
                q = float('nan')
            elif pos_item[1] > 0:
                q = float('inf')
            else:
                q = float('-inf')

        if q < low: #the ngram has negative polarity
            leaves_pos, leaves_neg = True, False
        elif q > high: #the ngram has positive polarity
            leaves_pos, leaves_neg = False, True
        else: #the ngram is neutral
            leaves_pos, leaves_neg = True, True

        if leaves_pos:
            drop_from_pos.add(pos_position)
            #once removed, the ngram can no longer be matched by later entries of neg_list
            del comparable[word[0]]
        if leaves_neg:
            drop_from_neg.add(neg_position)

    #rebuild both lists in place so that the caller's list objects are the ones that change
    pos_list[:] = [item for position, item in enumerate(pos_list) if position not in drop_from_pos]
    neg_list[:] = [item for position, item in enumerate(neg_list) if position not in drop_from_neg]

#filter each domain separately: the polarity-3 list is passed as pos_list, the polarity-4 list as neg_list
for scores_3, scores_4 in ((sorted_scores_0_3, sorted_scores_0_4),
                           (sorted_scores_1_3, sorted_scores_1_4)):
    filter_polarity(scores_3, scores_4)

In [4]:
#remove tf-idf scores for all ngrams
def _strip_scores(entries):
    """Return the bare ngrams of a list of ``(ngram, score)`` tuples.

    Entries that are already plain strings are passed through unchanged, so
    re-running this cell does not cut every ngram down to its first character.
    """
    return [entry if isinstance(entry, str) else entry[0] for entry in entries]

sorted_scores_0_3 = _strip_scores(sorted_scores_0_3)
sorted_scores_1_3 = _strip_scores(sorted_scores_1_3)
sorted_scores_0_4 = _strip_scores(sorted_scores_0_4)
sorted_scores_1_4 = _strip_scores(sorted_scores_1_4)

#map all negative/positive ngrams of one domain to the most dominant token of opposite polarity in the same domain
#(the first entry of the opposite list; an IndexError is raised if that list is empty while this one is not)
replacement_dict_domain0 = {ngram: sorted_scores_0_4[0] for ngram in sorted_scores_0_3}
replacement_dict_domain0_tmp = {ngram: sorted_scores_0_3[0] for ngram in sorted_scores_0_4}
replacement_dict_domain1 = {ngram: sorted_scores_1_4[0] for ngram in sorted_scores_1_3}
replacement_dict_domain1_tmp = {ngram: sorted_scores_1_3[0] for ngram in sorted_scores_1_4}

#this mapping can be viewed as our learned model
replacement_dict_domain0.update(replacement_dict_domain0_tmp)
replacement_dict_domain1.update(replacement_dict_domain1_tmp)

In [5]:
#load the reviews that are going to be rewritten
test_file = "./evaluation_examples.csv"
test_data = classifier.load_data(test_file)
test_text = test_data[0]

#the predicted domain decides which dictionary is applied to a review
domain_prediction = classifier.classify(test_text, './domain_classifier.sav')

#positions predicted as domain 0 (electronics) and as domain 1 (kitchen)
electronics_indices = [i for i in range(len(domain_prediction)) if domain_prediction[i] == 0]
kitchen_indices = [i for i in range(len(domain_prediction)) if domain_prediction[i] == 1]

#reuse the preprocessing of the vectorizer the model was learned with
preprocessor = vectorizer.build_preprocessor()



In [6]:
#sort the ngrams by length so that longer ngrams get replaced first
rep_sorted_domain0 = sorted(replacement_dict_domain0, key=len, reverse=True)
alternatives_domain0 = '|'.join(r'\b%s\b' % re.escape(s) for s in rep_sorted_domain0)
#with an empty dictionary the alternation is the empty string, which matches at every position
#and makes the lookup below fail with KeyError(''); '(?!)' never matches, so in that case the
#reviews are only preprocessed
pattern_domain0 = re.compile(alternatives_domain0 or r'(?!)')

#apply the mapping defined by the dictionary to each review
for idx in electronics_indices:
    s = test_data.at[idx, 0]
    s = preprocessor(s)
    s = pattern_domain0.sub(lambda match: replacement_dict_domain0[match.group(0)], s)
    test_data.at[idx, 0] = s

In [7]:
#sort the ngrams by length so that longer ngrams get replaced first
rep_sorted_domain1 = sorted(replacement_dict_domain1, key=len, reverse=True)
alternatives_domain1 = '|'.join(r'\b%s\b' % re.escape(s) for s in rep_sorted_domain1)
#with an empty dictionary the alternation is the empty string, which matches at every position
#and makes the lookup below fail with KeyError(''); '(?!)' never matches, so in that case the
#reviews are only preprocessed
pattern_domain1 = re.compile(alternatives_domain1 or r'(?!)')

#apply the mapping defined by the dictionary to each review
for idx in kitchen_indices:
    s = test_data.at[idx, 0]
    s = preprocessor(s)
    s = pattern_domain1.sub(lambda match: replacement_dict_domain1[match.group(0)], s)
    test_data.at[idx, 0] = s

#write the processed data to a new csv file
test_data.to_csv('processed_evaluation_examples.csv', index=False, header=None)