In [1]:
import nltk
import string
import os
from collections import defaultdict
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering


In [2]:
stop_words = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.SnowballStemmer("english", ignore_stopwords = False)
# Folder path where corpus root should be
corpus_root = r"/Users/boykoborisov/Desktop/Uni/COMP34711/cw2/product_reviews"
# Folder path where the reverse token corpus should be stored
corpus_after_token_reversal = r"/Users/boykoborisov/Desktop/Uni/COMP34711/cw2/product_reviews_processed"
file_pattern = r".*"
original_corpus = nltk.corpus.PlaintextCorpusReader(corpus_root, file_pattern)
print(original_corpus.fileids())

['Canon_PowerShot_SD500.txt', 'Canon_S100.txt', 'Diaper_Champ.txt', 'Hitachi_router.txt', 'Linksys_Router.txt', 'MicroMP3.txt', 'Nokia_6600.txt', 'ipod.txt', 'norton.txt']


In [3]:
# Core utility function for document cleaning
# Works recursively, split the text into sentences/review, then for each 
# sentence/review perform cleaning 
def process_doc(text, remove_punctuation, case_fold, stem,
                remove_stopwords, remove_short_tokens, tokenize_by, stem_blacklist = [],
                remove_nonalphabetical = False):

  if (tokenize_by == "sentence"):
    sentences = nltk.RegexpTokenizer("##", gaps = True).tokenize(text)
    sentences = [process_doc(sentence, remove_punctuation, case_fold, stem, 
                             remove_stopwords, remove_short_tokens, "words", stem_blacklist) 
                  for sentence in sentences]
    return sentences
  if (tokenize_by == "reviews"):
    reviews = nltk.RegexpTokenizer("\[ t \]", gaps = True).tokenize(text)
    reviews = [process_doc(review, remove_punctuation, case_fold, stem, 
                              remove_stopwords, remove_short_tokens, "words", stem_blacklist)
                for review in reviews]
    return reviews
  if (tokenize_by == "words"):
    words = nltk.WordPunctTokenizer().tokenize(text)
    if (remove_punctuation):
      words = [w for w in words if w not in string.punctuation and w != "..." and w != "]##"]
      # words = [w.strip("") for w in words]
    if (case_fold):
      words = [w.lower() for w in words]
    if (remove_short_tokens):
      words = [w for w in words if len(w) > 2]
    if (stem):
      words = [w if w in stem_blacklist else stemmer.stem(w) for w in words]
    if (remove_stopwords):
      words = [w for w in words if w not in stop_words and w != "n't"]
    if (remove_punctuation):
      words = [w for w in words if w not in string.punctuation and w != "..." and w != "]##"]
    if (remove_nonalphabetical):
      words = [w for w in words if w.isalpha()]
    return words

def process_corpus(corpus, remove_punctuation:bool, case_fold:bool, stem:bool,
                  remove_stopwords:bool, remove_short_tokens, tokenize_by:str):
  docs = [word for fileid in corpus.fileids() 
            for word in process_doc(corpus.raw(fileid), remove_punctuation, case_fold,
                                    stem, remove_stopwords, remove_short_tokens, 
                                    tokenize_by)
         ]
  return docs

def most_frequent(words, n, should_print):
  freqDist = nltk.FreqDist(words)
  most_common = freqDist.most_common(n)
  if (should_print):
    i = 1
    for (w, count) in most_common:
      print(i , w , count)
      i += 1
  return most_common

# core function for generating corpus with reversed words
# the corpus of reversed words is stored as files in the path specified by the variable:
# corpus_after_token_reversal
def generate_corpus_half_tokens_reversed(corpus, token_tuple_list, override_folder):
  if not override_folder and os.path.exists(corpus_after_token_reversal):
    return
  if not os.path.exists(corpus_after_token_reversal):
    os.mkdir(corpus_after_token_reversal)
  # indecies_per_word = {word : list of 0s and 1s}
  # if indecies_per_word[word][i] == 1
  # the i-th occurrence of word needs to be reversed
  indecies_per_word = {}
  
  # pointers keeps track of how many occurrences of each word we have met
  pointers = {}
  for (word, frequency) in token_tuple_list:
    # construct an array with an equal number of 0-s and ones
    indecies = np.ones(frequency)
    indecies[:int(frequency/2)] = 0
    
    # shuffle it    
    np.random.shuffle(indecies)
    indecies_per_word[word] = indecies
    pointers[word] = -1
  fileids = corpus.fileids()
  for fileid in fileids:
    # tokenize the document
    tokens = process_doc(corpus.raw(fileid), False, True, False, False, False, "words")
    with_reversal = []
    for token in tokens:
      if (token in indecies_per_word):
        # update the number of occurrences of the token
        pointers[token] += 1
       # determine whether to reverse the token
        if (indecies_per_word[token][pointers[token]] == 1):
         token = token[::-1]
      with_reversal.append(token)
    doc = " ".join(with_reversal)
    
    f = open(os.path.join(corpus_after_token_reversal,fileid), "w")
    f.write(doc)
    f.close()
  
  # for (word, pointer) in pointers.items():
  #   print (word, len(indecies_per_word[word]) - pointer - 1)
  

In [4]:
print("Most frequent 50 tokens in corpus after document cleaning")
processed_corpus = process_corpus(original_corpus, True, True, False, True, True, "words")
most_frequent_tokens = most_frequent(processed_corpus, 50, True)
cluster_words = set()
for (word, freq) in most_frequent_tokens:
  cluster_words.add(word)
  cluster_words.add(word[::-1])
# generate_corpus_half_tokens_reversed(original_corpus, most_frequent_tokens, True)

Most frequent 50 tokens in corpus after document cleaning
1 use 353
2 phone 320
3 one 316
4 ipod 314
5 router 313
6 camera 292
7 player 269
8 get 252
9 battery 239
10 like 195
11 great 192
12 quality 176
13 good 176
14 zen 174
15 diaper 171
16 product 166
17 would 158
18 also 156
19 time 145
20 software 145
21 sound 144
22 well 138
23 really 136
24 micro 136
25 features 128
26 computer 128
27 easy 125
28 even 123
29 first 121
30 used 120
31 creative 118
32 much 115
33 better 114
34 champ 113
35 work 112
36 want 107
37 size 105
38 music 105
39 norton 104
40 little 101
41 need 100
42 pictures 99
43 works 99
44 still 97
45 buy 96
46 problem 96
47 mp3 96
48 price 91
49 life 91
50 using 91


In [5]:
# have a dictionary, where for each word we have its number of coappearences with other words
def generate_term_to_term_dict_single_doc(term_groups, context_window_length, key_words):
  term_to_term = defaultdict(lambda: defaultdict(int))
  for i in range(0, len(term_groups)):
    term_group = term_groups[i]
    for word in term_group:
      if word in key_words:
        min_window = max(0, i - context_window_length)
        max_window = min(len(term_groups), i + context_window_length + 1)
        for j in range(min_window, max_window):
          group = term_groups[j]
          for w in group:
            term_to_term[word][w] += 1
  return term_to_term

# have a dictionary, where for each word we have its number of coappearences with other words
def generate_term_to_term_matrix_single_doc_for_words(document, context_window_length, key_words): 
  term_to_term = defaultdict(lambda: defaultdict(int))
  for i in range(0, len(document)):
    if document[i] in key_words:
      key_word = document[i]
      min_window = max(0, i - context_window_length)
      max_window = min(len(document), i + context_window_length + 1)
      for j in range(min_window, max_window):
        if i != j:
          term_to_term[key_word][document[j]] += 1
  return term_to_term

def merge_term_to_term_dicts(term_to_term_dicts):
  term_to_term = defaultdict(lambda: defaultdict(int))
  for term_to_term_dict in term_to_term_dicts:
    for (key_word, freqs) in term_to_term_dict.items():
      for (value_word, freq) in freqs.items():
        term_to_term[key_word][value_word] += freq
  return term_to_term

def get_key_words_count(term_to_term_dict):
  counts = defaultdict(int)
  for (key, freqs) in term_to_term_dict.items():
    for (context, freq) in freqs.items():
      counts[key] += freq
  return counts

def get_context_words_count(term_to_term_dict):
  counts = defaultdict(int)
  for (key, freqs) in term_to_term_dict.items():
    for (context, freq) in freqs.items():
      counts[context] += freq
  return counts

def get_coocurrences_alpha(context_counts, alpha=0.75):
  sum = 0
  for context_count in context_counts.values():
    sum += pow(context_count, alpha)
  return sum

def generate_context_word_mapping(term_to_term_dict):
  i = 0
  mapping = {}
  for (key, freqs) in term_to_term_dict.items():
    for (context, freq) in freqs.items():
      if context not in mapping:
        mapping[context] = i
        i += 1
  return mapping

def calculate_all_coocurrences(count_dict):
  sum = 0
  for val in count_dict.values():
    sum += val
  return sum

In [6]:
def calculate_pmi_matrix(word_counts, context_counts, term_to_term_dict, 
                         context_mapping, coocurrence_count):
  pmi_matrix = []
  row_to_word = []
  for (word, word_count) in word_counts.items():
    row_to_word.append(word)
    p_word = word_count / coocurrence_count
    row = [0] * len(context_counts)
    for (context, index) in context_mapping.items():
      if context in term_to_term_dict[word]:
        p_context = context_counts[context] / coocurrence_count
        p_word_context = term_to_term_dict[word][context]
        row[index] = np.log2(p_word_context/(p_context * p_word))
    pmi_matrix.append(row)
  return (row_to_word, pmi_matrix)

def calculate_ppmi_matrix(word_counts, context_counts, term_to_term_dict, 
                         context_mapping, coocurrence_count):
  (row_to_word, pmi_matrix) = calculate_pmi_matrix(word_counts, context_counts, term_to_term_dict, 
                         context_mapping, coocurrence_count)
  ppmi_matrix = [[0 if x < 0 else x for x in row] for row in pmi_matrix]

  return (row_to_word, ppmi_matrix)

def calculate_ppmi_alpha_matrix(word_counts, context_counts, term_to_term_dict, 
                         context_mapping, coocurrence_count, context_count_alpha, alpha=0.75):
  pmi_matrix = []
  row_to_word = []
  for (word, word_count) in word_counts.items():
    row_to_word.append(word)
    p_word = word_count / coocurrence_count
    row = [0] * len(context_counts)
    for (context, index) in context_mapping.items():
      if context in term_to_term_dict[word]:
        p_context = pow(context_counts[context],alpha) / context_count_alpha
        p_word_context = term_to_term_dict[word][context]
        row[index] = max(np.log2(p_word_context/(p_context * p_word)), 0)
    pmi_matrix.append(row)
  return (row_to_word, pmi_matrix)

def clustering_get_accuracy(n_clusters, keys, matrix, cluster_method, flag_empty_clusters, print_cluster=False):   
  if (cluster_method == "kmeans"):                
    cluster_algo = KMeans(n_clusters)
  elif (cluster_method == "agglomerative"):
      cluster_algo = AgglomerativeClustering(
        n_clusters=n_clusters
      )
  elif (cluster_method == "agglomerative_complete"):
    cluster_algo = AgglomerativeClustering(
      n_clusters=n_clusters,
      linkage="complete",
      affinity="cosine"
    )
  cluster_algo.fit(matrix)
  clusters = []
  for i in range(50):
    clusters.append(set())
  i = 0
  for label in cluster_algo.labels_:
    clusters[label].add(keys[i])
    i += 1
  correct = 0
  for cluster in clusters:
    if (flag_empty_clusters and len(cluster) == 0):
      print("EMPTY CLUSTER DETECTED")
    if (print_cluster): 
      print(cluster)
    for word in cluster:
      if word[::-1] in cluster:
        correct += 1
  return correct / len(keys)


In [7]:
def run_experiment(generate_reverse_word_corpus, context_type, context_window, original_most_frequent_tokens, key_words, similiarity_measure, iterations, remove_stopwords, stem, cluster_method, verbose):
  accuracy = np.zeros(iterations)
  for i in range(iterations):
    if (generate_reverse_word_corpus):
      generate_corpus_half_tokens_reversed(original_corpus, original_most_frequent_tokens, True)
    corpus = nltk.corpus.PlaintextCorpusReader(corpus_after_token_reversal, file_pattern)
    # partition the corpus based on the context window type, a corpus can be partititoned into sentences, reviews or static word windows
    if (context_type == "sentence" or context_type == "reviews"):
      # document cleaning
      contexts = [process_doc(corpus.raw(fileid), True, True, stem, remove_stopwords, True, context_type, key_words, True) for fileid in corpus.fileids()]
      # contains term to term matricies for each context window
      term_to_term_dicts = [generate_term_to_term_dict_single_doc(context, context_window, key_words) for context in contexts]
    elif (context_type == "words"):
      # document cleaning
      contexts = [process_doc(corpus.raw(fileid), True, True, stem, remove_stopwords, True, context_type, key_words, True) for fileid in corpus.fileids()]
      # contains term to term matricies for each context window
      term_to_term_dicts = [generate_term_to_term_matrix_single_doc_for_words(context, context_window, key_words) for context in contexts]
    
    # merge the term to term matricies into one
    term_to_term_dict = merge_term_to_term_dicts(term_to_term_dicts)

    # used for the denominator probabilities in pmi and its derivables
    word_counts = get_key_words_count(term_to_term_dict)
    context_counts = get_context_words_count(term_to_term_dict)

    # until now we have worked with dictionaries for time and space efficiency,
    # for the following operations we will have to work with matricies

    # utility function - constructs a mapping for each key to an integer
    context_mapping = generate_context_word_mapping(term_to_term_dict)

    # used in the nominator in pmi
    # coocurrence_count = the number of overall coocorrence of any two tokens with the 
    # currently selected context window
    coocurrence_count = calculate_all_coocurrences(word_counts)

    # pmi, ppmi, ppmi with smoothing matricies construction
    if (similiarity_measure == "smooth_ppmi"):
      coocurrence_alpha = get_coocurrences_alpha(context_counts, 0.80)
      (keys, encoding) = calculate_ppmi_alpha_matrix(word_counts, context_counts, term_to_term_dict, context_mapping, coocurrence_count, coocurrence_alpha, 0.80)
    if (similiarity_measure == "ppmi"):
      (keys, encoding) = calculate_ppmi_matrix(word_counts, context_counts, term_to_term_dict, context_mapping, coocurrence_count)
    if (similiarity_measure == "pmi"):
      (keys, encoding) = calculate_pmi_matrix(word_counts, context_counts, term_to_term_dict, context_mapping, coocurrence_count)
    accuracy[i] = clustering_get_accuracy(50, keys, encoding, cluster_method, True, verbose)
    if (verbose):
      print(accuracy)
      
  return ("Average accuracy:", np.mean(accuracy), "Standard deviation:", np.std(accuracy))


In [8]:
print("Different context length experiments")

print("1 review:", run_experiment(generate_reverse_word_corpus = True, context_type = "reviews", 
              context_window = 0, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("Sentences:")
# The sentence with the word
print("1 Sentence:", run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 0, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))
# 1 sentence before the word, the sentence with the word and 1 sentence after the word
print("3 Sentences:", run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

# 2 sentences before the word, the sentence with the word and 2 sentences after the word
print("5 Sentences:", run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("7 Sentences:", run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 3, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("2 words:", run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))
print("4 words:", run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))
print("6 words:", run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 3, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))
print("10 words:", run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 5, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))
print("12 words:", run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 6, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))



Different context length experiments
1 review: ('Average accuracy:', 0.4870000000000001, 'Standard deviation:', 0.0330302891298275)
Sentences:
1 Sentence: ('Average accuracy:', 0.45899999999999996, 'Standard deviation:', 0.03713488925525428)
3 Sentences: ('Average accuracy:', 0.645, 'Standard deviation:', 0.04043513323831147)
5 Sentences: ('Average accuracy:', 0.6769999999999998, 'Standard deviation:', 0.03702701716314723)
7 Sentences: ('Average accuracy:', 0.692, 'Standard deviation:', 0.03969886648255841)
2 words: ('Average accuracy:', 0.8591836734693876, 'Standard deviation:', 0.03528901319549098)
4 words: ('Average accuracy:', 0.7744897959183674, 'Standard deviation:', 0.030595233368499907)
6 words: ('Average accuracy:', 0.7071428571428571, 'Standard deviation:', 0.034316676981224974)
10 words: ('Average accuracy:', 0.6744897959183673, 'Standard deviation:', 0.039507062617563135)
12 words: ('Average accuracy:', 0.6377551020408163, 'Standard deviation:', 0.029486088319999763)


In [9]:
print("Stopwords experiments:")

print("Performance when REMOVING stopwords with a large context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=True, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("Performance when KEEPING stopwords with a large context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("Performance when REMOVING stopwords with a small context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=True, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))

print("Performance when KEEPING stopwords with a small context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens,
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))


Stopwords experiments:
Performance when REMOVING stopwords with a large context window: ('Average accuracy:', 0.68, 'Standard deviation:', 0.03162277660168378)
Performance when KEEPING stopwords with a large context window: ('Average accuracy:', 0.7010000000000001, 'Standard deviation:', 0.02998332870112988)
Performance when REMOVING stopwords with a small context window: ('Average accuracy:', 0.7724489795918366, 'Standard deviation:', 0.03666376372086059)
Performance when KEEPING stopwords with a small context window: ('Average accuracy:', 0.8551020408163266, 'Standard deviation:', 0.0392028014536705)


In [10]:
print("Stemming experiments:")

print("Performance when stemming with a large context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=True, cluster_method="agglomerative", verbose=False
              ))

print("Performance when NOT stemming with a large context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("Performance when stemming with a small context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=True, cluster_method="agglomerative_complete", verbose=False
              ))

print("Performance when NOT stemming with a small context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens,
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))

Stemming experiments:
Performance when stemming with a large context window: ('Average accuracy:', 0.6359999999999999, 'Standard deviation:', 0.030724582991474434)
Performance when NOT stemming with a large context window: ('Average accuracy:', 0.669, 'Standard deviation:', 0.032542280190545954)
Performance when stemming with a small context window: ('Average accuracy:', 0.8418367346938774, 'Standard deviation:', 0.026510981748503214)
Performance when NOT stemming with a small context window: ('Average accuracy:', 0.8642857142857142, 'Standard deviation:', 0.02261073449608197)


In [11]:
print("PMI vs PPMI vs PPMI with smoothing experiments")

print("Performance with pmi stemming with a large context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="pmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("Performance with ppmi with a large context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("Performance with ppmi with smoothing with a large context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "sentence", 
              context_window = 2, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative", verbose=False
              ))

print("Performance with pmi with a small context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="pmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
            ))


print("Performance with ppmi with a small context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))


print("Performance with ppmi with smoothing with a small context window:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 20,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=False
              ))



PMI vs PPMI vs PPMI with smoothing experiments
Performance with pmi stemming with a large context window: ('Average accuracy:', 0.6700000000000002, 'Standard deviation:', 0.040249223594996206)
Performance with ppmi with a large context window: ('Average accuracy:', 0.701, 'Standard deviation:', 0.03064310689208911)
Performance with ppmi with smoothing with a large context window: ('Average accuracy:', 0.6779999999999998, 'Standard deviation:', 0.03280243893371344)
Performance with pmi with a small context window: ('Average accuracy:', 0.8561224489795919, 'Standard deviation:', 0.03059523336849989)
Performance with ppmi with a small context window: ('Average accuracy:', 0.8642857142857142, 'Standard deviation:', 0.03244512876649928)
Performance with ppmi with smoothing with a small context window: ('Average accuracy:', 0.8561224489795919, 'Standard deviation:', 0.027739341263404448)


In [12]:
# running for a single iteration so as to show how usually the clusters with this method look like
print("Clusters with best performing method:",
        run_experiment(generate_reverse_word_corpus = True, context_type = "words", 
              context_window = 1, original_most_frequent_tokens = most_frequent_tokens, 
              key_words = cluster_words, similiarity_measure="smooth_ppmi", iterations = 1,
              remove_stopwords=False, stem=False, cluster_method="agglomerative_complete", verbose=True
              ))



{'tnaw', 'want'}
{'tcudorp', 'ipod'}
{'computer', 'retupmoc'}
{'tsrif', 'first'}
{'esu', 'get', 'use', 'teg'}
{'much', 'hcum'}
{'works', 'skrow', 'work', 'krow'}
{'life', 'efil'}
{'neve', 'even'}
{'sound', 'dnuos'}
{'osla', 'also'}
{'evitaerc', 'creative'}
{'easy', 'ysae'}
{'llew', 'well'}
{'dluow', 'would'}
{'pictures', 'serutcip'}
{'erawtfos', 'software'}
{'diaper', 'repaid'}
{'ecirp', 'price'}
{'buy', 'yub'}
{'problem', 'melborp'}
{'size', 'ezis'}
{'dopi', 'reyalp', 'player'}
{'aremac', 'camera'}
{'notron', 'norton'}
{'really', 'yllaer'}
{'orcim'}
{'time', 'emit'}
{'ekil', 'like'}
{'yrettab', 'battery'}
{'deen', 'need'}
{'phone', 'enohp'}
{'cisum', 'music'}
{'good', 'doog', 'taerg', 'great'}
{'desu', 'used'}
{'zen', 'nez'}
{'llits'}
{'little', 'elttil'}
{'features', 'serutaef'}
{'ytilauq', 'quality'}
{'gnisu'}
{'product'}
{'router', 'retuor'}
{'using'}
{'still'}
{'better', 'retteb'}
{'eno', 'one'}
{'pmahc'}
{'champ'}
{'micro'}
[0.87755102]
Clusters with best performing method: ('Ave