In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from joblib import dump, load
from datasketch import MinHash, MinHashLSH
import hashlib
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [3]:
df_train = pd.read_csv("imdb_train.csv")
df_test = pd.read_csv("imdb_test_without_labels.csv")

In [4]:
def clean(reviews):
    
    """Initialize an empty list to hold the clean reviews"""
    clean_train_reviews = []

    # Loop over each review in the list
    for index, review in enumerate(reviews):
        # Call the pre processer for each review, and add the result to the list of clean reviews
        clean_train_reviews.append(preProcess(review))
    
    return clean_train_reviews
 
def preProcess(rawReview):

    """Function to convert a raw review to a string of words
        Takes in a raw movie review as a single string to output a preprocessed movie review as a single string"""

    text_only = re.sub('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', rawReview)
    #
    # 2. Remove Email IDs, URLs and numbers
    noEmail = re.sub(r'([\w\.-]+@[\w\.-]+\.\w+)','',text_only)
    
    noUrl = re.sub(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]| \
        [a-z0-9.\-]+[.][a-z]{2,4}/|[a-z0-9.\-]+[.][a-z])(?:[^\s()<>]+|\(([^\s()<>]+| \
        (\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))','', noEmail)
    
    #Emotional symbols may affect the meaning of the review
    smileys = """:-) :) :o) :D :-D :( :-( :o(""".split()
    smileyPattern = "|".join(map(re.escape, smileys))
    
    letters_only = re.sub("[^a-zA-Z" + smileyPattern + "]", " ", noUrl)
    #
    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()     
    #
    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words and also 3-letter words and Lemmatize the review
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = ''
    for word in words:
        if word not in stops and len(word) > 3:
        #if len(word) > 3:
            lemmatized_words += str(lemmatizer.lemmatize(word)) + ' '
    #
    # 6. Join the words back into one string separated by space and return the result.
    return lemmatized_words



In [5]:
clean_train_review = clean(df_train['review'])
clean_test_review = clean(df_test['review'])

In [6]:
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

In [7]:
df_train_clean = df_train 
df_train_clean['review'] = clean_train_review
df_train_clean['review'] = df_train_clean['review'].apply(remove_special_characters)
df_test_clean = df_test
df_test_clean['review'] = clean_test_review
df_test_clean['review'] = df_test_clean['review'].apply(remove_special_characters)

In [8]:
cnt_vectorizer = CountVectorizer(max_features=5000)

In [103]:
X_train_features = cnt_vectorizer.fit_transform(df_train_clean['review'][:10000]) #40000, 5000
X_test_features = cnt_vectorizer.transform(df_test_clean['review'])

<h3>Brute Force KNN</h3>

In [104]:
X_train_features = X_train_features.toarray()
X_test_features = X_test_features.toarray()

In [107]:
knn =  NearestNeighbors(n_neighbors=15,algorithm='brute', metric="jaccard")
knn_brute = knn.fit(X_train_features, df_train_clean['sentiment'].to_numpy())

In [108]:
import warnings
warnings.filterwarnings('ignore')

distances, indices = knn.kneighbors(X_test_features)

In [126]:
print(df_train.iloc[indices[5][14]]['id']) #2913

18368


Now in the results_dict variable we will store to each test id (key of dictionary) a list containing id's of the neighbors that predicted from the model

In [80]:
results_dict = {}

for i, idn in enumerate(df_test['id']):
  results_dict[idn] = [df_train.iloc[indices[i][j]]['id'] for j in range(15)]


<h3>LSH-MinHash</h3>

In [118]:
lsh = MinHashLSH(threshold=0.5, num_perm=50)

Create hashes for the train set

In [119]:
# building time between 45-50 minutes
train_hashes = {}
i = 0
for id, row in zip(df_train_clean['id'][:10000], X_train_features):
    m_temp = MinHash(num_perm=50)
    for val in row:
        m_temp.update(val)
    train_hashes[id] = [m_temp, df_train_clean.iloc[i]['review'], df_train_clean.iloc[i]['sentiment']]
    i += 1 

In [120]:
for key in train_hashes.keys():
    lsh.insert(key, train_hashes[key][0])

In [121]:
test_hashes = {}
i = 0
for id, row in zip(df_test_clean['id'], X_test_features):
    m_temp = MinHash(num_perm=50)
    for val in row:
        m_temp.update(val)
    test_hashes[id] = [m_temp, df_test_clean.iloc[i]['review']]
    i += 1

In [122]:
test_query = [lsh.query(test_hashes[hash][0]) for hash in test_hashes.keys()] # 151 non empty lists

<h4>Now we will find the closest neighbors to each test example</h4>

In [123]:
import time
import warnings
warnings.filterwarnings('ignore')

"""
Take each nested list from the test_query that contains the neighbor hashes(id's).
Take the reviews associated with each id that is neighbor to that row.
Transform the reviews to tfidf
"""
idx = 0
results = {} # key is the test id and values are the id's (hashes) predicted
for row in test_query:
    if len(row) == 0:
        results[df_test_clean.iloc[idx]['id']] = 0 
        idx += 1
        continue
    if len(row) >= 15:
        knn_temp = NearestNeighbors(n_neighbors=15,algorithm='brute', metric='jaccard')
        df_temp = df_train_clean.loc[df_train_clean['id'].isin(row)] # get rows with specific id's that lsh query returned for the specific example
        X_train_features_temp = cnt_vectorizer.transform(df_temp['review'])
        X_train_features_temp = X_train_features_temp.toarray()

        if X_train_features_temp.shape[1] < 5000:
           results[df_test_clean.iloc[idx]['id']] = 0
           idx += 1
           continue

        knn_temp.fit(X_train_features_temp, df_temp['sentiment'].to_numpy())
        distances, indices = knn_temp.kneighbors([X_test_features[idx]])
        if len(indices[0]) >= 15:
            results[df_test_clean.iloc[idx]['id']] = [df_train.iloc[indices[0][j]]['id'] for j in range(15)]
        else:
            results[df_test_clean.iloc[idx]['id']] = [df_train.iloc[indices[0][j]]['id'] for j in range(len(indices))]
        idx += 1

In [None]:
for key in results.keys():
    if results[key] != 0:
        print(f'key {key} result {results[key]}')
    break

key 33527 result [37355, 9022, 43936, 6752, 20693, 45687, 31907, 43190, 9333, 17819, 4185, 17399, 11796, 21506, 40878]


In [124]:

results_dict_ls = []
for key in results_dict.keys():
    if type(results_dict[key]) == int:
        continue
    for val in results_dict[key]:
        results_dict_ls.append(val)

results_ls = []
for key in results.keys():
    if type(results[key]) == int:
        continue
    for val in results[key]:
        results_ls.append(val)

In [125]:
res = len(set(results_ls) & set(results_dict_ls)) / float(len(set(results_ls) | set(results_dict_ls))) * 100
  
# printing result
print("Percentage similarity among lists is : " + str(res))

Percentage similarity among lists is : 19.07001441250095


<h3>Random projection</h3>

In [91]:
nbits = 4 # buckets 
d = 5000 # dimension 
plane_norms = np.random.rand(nbits, d) - .5 # hyperplaens centered around 0

In [92]:
train_matrix_dot_products = np.dot(X_train_features, plane_norms.T)
test_matrix_dot_products = np.dot(X_test_features, plane_norms.T)

In [93]:
train_matrix_dot_products = [row > 0 for row in train_matrix_dot_products]
test_matrix_dot_products = [row > 0 for row in test_matrix_dot_products]


In [94]:
not_false = [row for row in train_matrix_dot_products if True in row]
len(not_false)

38405

In [95]:
train_matrix_dot_products = [row.astype(int) for row in train_matrix_dot_products]
test_matrix_dot_products = [row.astype(int) for row in test_matrix_dot_products]

In [96]:
buckets_train = {}
for i in range(len(train_matrix_dot_products)):
    hash_str = ''.join(train_matrix_dot_products[i].astype(str))
    if hash_str not in buckets_train.keys():
        buckets_train[hash_str] = []
    # Μπαίνει το index των σειρών στην λίστα,    
    #οπότε οι σειρές αυτές που έχουν το ίδιο κλειδί hash είναι γείτονεσ 
    buckets_train[hash_str].append(i)  

In [97]:
buckets_test = {}
for i in range(len(test_matrix_dot_products)):
    hash_str = ''.join(test_matrix_dot_products[i].astype(str))
    if hash_str not in buckets_test.keys():
        buckets_test[hash_str] = []
    # Μπαίνει το index των σειρών στην λίστα,    
    #οπότε οι σειρές αυτές που έχουν το ίδιο κλειδί hash είναι γείτονεσ 
    buckets_test[hash_str].append(i) 

In [98]:

results_random = {}
for key in buckets_test.keys():
    try:
        # if the key also exists in train buckets get the index
        # of the rows that ended up in the same bucket
        indexes = buckets_train[key] 
    except:
        continue 

    df_temp = df_train_clean.iloc[indexes] # get the rows with the indexes list
    X_train_features_temp = cnt_vectorizer.transform(df_temp['review'])
    X_train_features_temp = X_train_features_temp.toarray()

    if X_train_features_temp.shape[1] < 5000:
        continue
    for idx in buckets_test[key]:
        knn_temp = NearestNeighbors(n_neighbors=len(indexes),algorithm='brute', metric='cosine')
        knn_temp.fit(X_train_features_temp, df_temp['sentiment'].to_numpy())
        distances, indices = knn_temp.kneighbors([X_test_features[idx]])
        if len(indices[0]) >= 15:
            results_random[df_test_clean.iloc[idx]['id']] = [df_train.iloc[indices[0][j]]['id'] for j in range(15)]
        else:
            results_random[df_test_clean.iloc[idx]['id']] = [df_train.iloc[indices[0][j]]['id'] for j in range(len(indices))]
        
    

In [101]:

results_dict_ls = []
for key in results_dict.keys():
    if type(results_dict[key]) == int:
        continue
    for val in results_dict[key]:
        results_dict_ls.append(val)

results_ls = []
for key in results_random.keys():
    if type(results_random[key]) == int:
        continue
    for val in results_random[key]:
        results_ls.append(val)

In [102]:
res = len(set(results_ls) & set(results_dict_ls)) / float(len(set(results_ls) | set(results_dict_ls))) * 100
  
# printing result
print("Percentage similarity among lists is : " + str(res))

Percentage similarity among lists is : 15.241081928655428
