In [36]:
# Text-cleaning helpers: strip punctuation, lowercase, remove stop words, and lemmatize.
from collections import Counter
from string import punctuation

import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Number of description files to read from the train / test directories.
# Files are addressed as '<dir>/<i>.txt' for i in range(N) by the loaders below.
num_train_file = 10000
num_test_file = 2000

def strip_punctuation(data):
    """Remove every ``string.punctuation`` character from each entry of ``data``.

    The sequence is modified in place (each slot is overwritten with its
    cleaned text) and the same object is returned.
    """
    for index, text in enumerate(data):
        data[index] = ''.join(char for char in text if char not in punctuation)
    return data

def lowercase(data):
    """Return a new list holding the lower-cased form of every entry in ``data``."""
    lowered = []
    for text in data:
        lowered.append(text.lower())
    return lowered

# English stop words from the NLTK corpus, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
def strip_stop_words(data):
    """Drop stop words from each entry of ``data``.

    Every entry is tokenized with ``word_tokenize``; tokens found in
    ``stop_words`` are discarded and the rest are re-joined with single
    spaces. The sequence is modified in place and returned.
    """
    for index, sentence in enumerate(data):
        kept_tokens = [token for token in word_tokenize(sentence) if token not in stop_words]
        data[index] = " ".join(kept_tokens)
    return data

def lemmatizer(data):
    """Lemmatize every space-separated word of each entry in ``data`` as a verb.

    Words are obtained with ``split(' ')``, passed through
    ``WordNetLemmatizer.lemmatize(word, 'v')`` and re-joined with single
    spaces. The sequence is modified in place and returned.
    """
    wordnet = WordNetLemmatizer()
    for index, sentence in enumerate(data):
        data[index] = " ".join(wordnet.lemmatize(token, 'v') for token in sentence.split(' '))
    return data

In [37]:
def generate_unique_words_dict(desc_train_path):
    """Build the vocabulary of the training descriptions.

    Reads ``<desc_train_path><i>.txt`` for ``i`` in ``range(num_train_file)``,
    cleans every line (lowercase, punctuation removal, stop-word removal,
    lemmatization) and collects the distinct whitespace-separated words.

    Args:
        desc_train_path: Directory prefix of the description files; the file
            index and ``'.txt'`` are appended directly, so it should end with
            a path separator.

    Returns:
        A list of the unique words, sorted so that the order (and therefore
        the column order of any bag-of-words vector built from it) is the
        same on every run.
    """
    vocabulary = set()
    for i in range(num_train_file):
        filename = desc_train_path + str(i) + '.txt'

        # Read the file; the context manager closes the handle on every iteration.
        with open(filename, "r") as file:
            contents = [line.rstrip("\n") for line in file]

        # Same cleaning pipeline as generate_bag_of_words.
        contents = lowercase(contents)
        contents = strip_punctuation(contents)
        contents = strip_stop_words(contents)
        contents = lemmatizer(contents)

        # Accumulate directly into a set instead of a list of every token.
        for content in contents:
            vocabulary.update(content.split())

    return sorted(vocabulary)
        
# Build the vocabulary from the training descriptions. generate_bag_of_words
# reads this module-level list to decide the order of its count vectors.
unique_words_dict = generate_unique_words_dict('./data/descriptions_train/')

In [38]:
def generate_bag_of_words(file_path, num_file):
    """Turn each description file into a word-count vector over the vocabulary.

    Reads ``<file_path><i>.txt`` for ``i`` in ``range(num_file)``, merges all
    lines of a file into one document, cleans it (lowercase, punctuation
    removal, stop-word removal, lemmatization) and counts how often every
    word of the module-level ``unique_words_dict`` occurs in it.

    Args:
        file_path: Directory prefix of the description files; the file index
            and ``'.txt'`` are appended directly.
        num_file: Number of files to process.

    Returns:
        A dict mapping the file's base name (the part after the last ``'/'``,
        e.g. ``'5.txt'``) to a list of counts aligned with
        ``unique_words_dict``.
    """
    bag_of_words = {}
    for i in range(num_file):
        filename = file_path + str(i) + '.txt'

        # Read the file; the context manager closes the handle on every iteration.
        with open(filename, "r") as file:
            lines = [line.rstrip("\n") for line in file]

        # Treat the whole file as a single document.
        contents = [' '.join(lines)]

        # Same cleaning pipeline as generate_unique_words_dict.
        contents = lowercase(contents)
        contents = strip_punctuation(contents)
        contents = strip_stop_words(contents)
        contents = lemmatizer(contents)

        # Count each token once, then look up every vocabulary word. A Counter
        # returns 0 for absent words, matching list.count without rescanning
        # the document for every vocabulary entry.
        word_counts = Counter(contents[0].split())
        counts = [word_counts[word] for word in unique_words_dict]

        new_file_name = filename.split('/')[-1]
        bag_of_words[new_file_name] = counts

        # Report progress after every 1000th file.
        if (i + 1) % 1000 == 0:
            print('Progress: ' + str(i) + '/' + str(num_file-1))

    return bag_of_words
        
# Vectorize the training and test descriptions against the same vocabulary
# (unique_words_dict), so both sets of count vectors share one column order.
training_bag_of_words = generate_bag_of_words('./data/descriptions_train/', num_train_file)
test_bag_of_words = generate_bag_of_words('./data/descriptions_test/', num_test_file)

Progress: 999/9999
Progress: 1999/9999
Progress: 2999/9999
Progress: 3999/9999
Progress: 4999/9999
Progress: 5999/9999
Progress: 6999/9999
Progress: 7999/9999
Progress: 8999/9999
Progress: 9999/9999
Progress: 999/1999
Progress: 1999/1999


In [None]:
# Read train/test features.
# Each CSV line is split on ',' into one row of strings; the first column is
# used downstream as the image path and the remaining columns as the feature
# values. The trailing newline is stripped so the last column is a clean
# numeric string, and the files are closed as soon as they have been read.
with open("./data/features_train/features_resnet1000_train.csv", "r") as train_feature_file:
    train_features = np.array([line.rstrip("\n").split(",") for line in train_feature_file])

with open("./data/features_test/features_resnet1000_test.csv", "r") as test_feature_file:
    test_features = np.array([line.rstrip("\n").split(",") for line in test_feature_file])

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def process_knn(desc_test_file):
    """Find the training description closest to a test description.

    A 1-nearest-neighbour classifier is fitted on the training bag-of-words
    vectors, using each training file name as its own label, and then asked
    to classify the vector of ``desc_test_file``. The classifier is refitted
    on every call.

    Args:
        desc_test_file: Key into ``test_bag_of_words`` (e.g. ``'5.txt'``).

    Returns:
        The file name of the predicted (nearest) training description.
    """
    # Labels and vectors are taken in the same dict iteration order.
    labels = list(training_bag_of_words)
    vectors = [training_bag_of_words[name] for name in labels]

    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(np.array(vectors), np.array(labels))

    # predict expects a 2-D array, so present the single query as one row.
    query = np.array(test_bag_of_words[desc_test_file]).reshape(1, -1)
    return model.predict(query)[0]

In [None]:
def get_train_feature(train_file_name):
    """Look up the image feature row that belongs to a training description.

    Args:
        train_file_name: Description file name such as ``'5373.txt'``; only
            the part before the first ``'.'`` is used as the lookup id.

    Returns:
        The feature columns (everything after the first column) of the
        matching row of ``train_features`` as a NumPy array.

    Raises:
        KeyError: If no row of ``train_features`` carries that id.
    """
    # Index every row by its image id, e.g. 'images_train/5373.jpg' -> '5373'.
    feature_by_id = {
        row[0].split('/')[1].split('.')[0]: np.array(row[1:])
        for row in train_features
    }

    wanted_id = train_file_name.split('.')[0]
    return feature_by_id[wanted_id]

In [None]:
# reference: https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
def get_euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two equally shaped vectors.

    Both inputs are converted to float arrays first, so sequences of numeric
    strings are accepted as well as numbers.
    """
    difference = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.linalg.norm(difference)

In [None]:
# reference: https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value
import operator
def get_top_20_images(desc_test_file):
    """Rank the test images for one test description and return the best 20.

    The description is first mapped to its nearest training description
    (``process_knn``); the image feature of that training item is then
    compared, by Euclidean distance, with the feature row of every test
    image.

    Args:
        desc_test_file: Key into ``test_bag_of_words`` (e.g. ``'5.txt'``).

    Returns:
        A NumPy array with the file names of up to 20 test images, ordered
        from smallest to largest distance. Fewer names are returned when
        fewer than 20 distinct test images exist.
    """
    train_file_name = process_knn(desc_test_file)
    selected_train_feature = get_train_feature(train_file_name)

    print('Train file name: ' + train_file_name)

    # Distance from the selected training feature to every test image,
    # keyed by the image file name (the part after the '/' in column 0).
    feature_dist = {}
    for test_feature in test_features:
        test_file_name = test_feature[0].split("/")[1]
        feature_dist[test_file_name] = get_euclidean_distance(selected_train_feature, test_feature[1:])

    sorted_feature_dist = sorted(feature_dist.items(), key=operator.itemgetter(1))

    # Slice rather than index 0..19 so a small test set cannot raise IndexError.
    top_20 = [file_name for file_name, _ in sorted_feature_dist[:20]]

    return np.array(top_20)

# Example query: rank the test images for the test description "5.txt".
# The result is kept at module level because display_top_20 reads it.
top_20 = get_top_20_images("5.txt")
print(top_20)

In [None]:
# Reference: https://stackoverflow.com/questions/42812230/why-plt-imshow-doesnt-display-the-image
import matplotlib.pyplot as plt
from PIL import Image

def display_top_20(image_names=None):
    """Show each ranked test image in its own matplotlib figure.

    Args:
        image_names: Iterable of image file names located under
            ``./data/images_test/``. Defaults to the module-level ``top_20``
            so that existing zero-argument calls keep working.
    """
    if image_names is None:
        image_names = top_20

    for image_name in image_names:
        # The context manager closes the image file once its pixels have been
        # copied into an array.
        with Image.open('./data/images_test/' + image_name, 'r') as image:
            plt.imshow(np.asarray(image))
        plt.show()
        
# Render the images currently listed in the module-level top_20.
display_top_20()