In [1]:
# Text-cleaning helpers: strip punctuation, lowercase, drop stop words, and lemmatize.
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

# Number of train/test files
num_train_file = 10000
num_test_file = 2000

def strip_punctuation(data):
    """Drop every character found in ``string.punctuation`` from each entry.

    ``data`` is a list of strings. The list is edited in place (each slot is
    overwritten with its cleaned text) and the very same list is returned.
    """
    for idx, text in enumerate(data):
        kept_chars = (ch for ch in text if ch not in punctuation)
        data[idx] = ''.join(kept_chars)
    return data

def lowercase(data):
    """Return a new list containing the lower-cased form of each item in ``data``.

    The input sequence itself is left untouched.
    """
    lowered = []
    for item in data:
        lowered.append(item.lower())
    return lowered

# English stop words, held in a set so each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))
def strip_stop_words(data):
    """Remove stop words from every string in ``data``.

    Each entry is split with ``word_tokenize``; tokens present in the
    module-level ``stop_words`` set are discarded and the survivors are
    re-joined with single spaces. The list is updated in place and returned.
    """
    for idx, text in enumerate(data):
        kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
        data[idx] = " ".join(kept_tokens)
    return data

def lemmatizer(data):
    """Lemmatize every space-separated token of every string in ``data``.

    Tokens are obtained with ``split(' ')`` and passed to
    ``WordNetLemmatizer.lemmatize`` with ``'v'`` as the second argument; the
    results are re-joined with single spaces. The list is updated in place
    and returned.
    """
    # Named differently from the enclosing function to avoid shadowing it.
    wordnet = WordNetLemmatizer()
    for idx, text in enumerate(data):
        lemmas = [wordnet.lemmatize(token, 'v') for token in text.split(' ')]
        data[idx] = " ".join(lemmas)
    return data

In [2]:
def generate_unique_words_dict(desc_train_path):
    """Build the vocabulary of the training descriptions.

    Reads the files ``<desc_train_path>0.txt`` through
    ``<desc_train_path><num_train_file - 1>.txt`` (``num_train_file`` is the
    notebook-level constant), pushes every line through ``lowercase``,
    ``strip_punctuation``, ``strip_stop_words`` and ``lemmatizer``, and
    collects the whitespace-separated tokens that remain.

    Args:
        desc_train_path: Directory prefix of the description files; it is
            concatenated directly with the file name, so it must end with a
            path separator.

    Returns:
        A sorted list of the distinct tokens. Sorting keeps the vocabulary
        order (and therefore the bag-of-words column order) identical across
        kernel restarts, which a bare ``list(set(...))`` does not guarantee
        for strings.
    """
    vocabulary = set()
    for i in range(num_train_file):
        filename = desc_train_path + str(i) + '.txt'

        # Read the file; the context manager closes the handle promptly
        # instead of leaking one descriptor per description file.
        with open(filename, "r") as file:
            contents = [line.rstrip("\n") for line in file]

        # Lowercase all of the words.
        contents = lowercase(contents)

        # Strip punctuation
        contents = strip_punctuation(contents)

        # Strip the stop words
        contents = strip_stop_words(contents)

        # Lemmatization of all the words
        contents = lemmatizer(contents)

        for content in contents:
            vocabulary.update(content.split())

    return sorted(vocabulary)

unique_words_dict = generate_unique_words_dict('./data/descriptions_train/')

In [3]:
def generate_bag_of_words(file_path, num_file):
    """Compute a bag-of-words count vector for each description file.

    For every ``i`` in ``range(num_file)`` the file ``<file_path><i>.txt`` is
    read, its lines are merged into one string, and that string is cleaned
    with ``lowercase``, ``strip_punctuation``, ``strip_stop_words`` and
    ``lemmatizer``. The cleaned tokens are then counted against the
    notebook-level vocabulary ``unique_words_dict``.

    Args:
        file_path: Directory prefix of the description files; concatenated
            directly with the file name, so it must end with a path separator.
        num_file: Number of consecutively numbered files to process.

    Returns:
        dict mapping the bare file name (e.g. ``'12.txt'``) to a list of
        counts, one per entry of ``unique_words_dict`` and in the same order.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    bag_of_words = {}
    for i in range(num_file):
        filename = file_path + str(i) + '.txt'

        # Read the file and release the handle immediately.
        with open(filename, "r") as file:
            lines = [line.rstrip("\n") for line in file]

        # Merge all lines into one string (each line prefixed by a space).
        contents = [''.join(' ' + line for line in lines)]

        # Lowercase all of the words.
        contents = lowercase(contents)

        # Strip punctuation
        contents = strip_punctuation(contents)

        # Strip the stop words
        contents = strip_stop_words(contents)

        # Lemmatization of all the words
        contents = lemmatizer(contents)

        new_file_name = filename.split('/')[-1]
        for content in contents:
            # One pass over the tokens, then a constant-time lookup per
            # vocabulary word; a Counter yields 0 for words that are absent.
            word_counts = Counter(content.split())
            bag_of_words[new_file_name] = [word_counts[word] for word in unique_words_dict]

        if i % 1000 == 0:
            print('Progress: ' + str(i) + '/' + str(num_file))

    return bag_of_words

training_bag_of_words = generate_bag_of_words('./data/descriptions_train/', num_train_file)
test_bag_of_words = generate_bag_of_words('./data/descriptions_test/', num_test_file)

Progress: 0/10000
Progress: 1000/10000
Progress: 2000/10000
Progress: 3000/10000
Progress: 4000/10000
Progress: 5000/10000
Progress: 6000/10000
Progress: 7000/10000
Progress: 8000/10000
Progress: 9000/10000
Progress: 0/2000
Progress: 1000/2000


In [4]:
# Read train/test features.
def _read_feature_rows(path):
    """Load a comma-separated feature file as an array of string fields.

    Each line becomes one row (split on ","). The trailing newline is removed
    first so the last field of every row is a clean token. The file is closed
    as soon as it has been read.
    """
    with open(path, "r") as feature_file:
        return np.array([line.rstrip("\n").split(",") for line in feature_file])

# Rows are used downstream as: image path in column 0, feature values after it.
train_features = _read_feature_rows("./data/features_train/features_resnet1000_train.csv")
test_features = _read_feature_rows("./data/features_test/features_resnet1000_test.csv")

In [5]:
# Create knn model for training descriptions
from sklearn.neighbors import KNeighborsClassifier

# Each row of X is one training description's bag-of-words vector and the
# matching entry of y is that description's file name, so a prediction
# returns the file name of the nearest training description.
X = np.array(list(training_bag_of_words.values()))
y = np.array(list(training_bag_of_words.keys()))

knn_classifier = KNeighborsClassifier(n_neighbors=1)
knn_classifier.fit(X, y)

# def process_knn(desc_test_file):

#     knn_classifier = KNeighborsClassifier(n_neighbors=1)
#     X, y = [], []
#     for filename in training_bag_of_words:
#         X.append(training_bag_of_words[filename])
#         y.append(filename)

#     X = np.array(X)
#     y = np.array(y)
#     knn_classifier.fit(X, y)
    
#     bag_of_word = np.array(test_bag_of_words[desc_test_file])
#     train_file_name = knn_classifier.predict(bag_of_word.reshape(1, -1))
    
#     return train_file_name[0]

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [6]:
# Create train feature dictionary: image stem (e.g. '5373') -> feature fields
train_feature_dict = {}
for feature_row in train_features:
    image_path = feature_row[0]  # 'images_train/5373.jpg'
    image_stem = image_path.split('/')[1].split('.')[0]  # 5373
    train_feature_dict[image_stem] = np.array(feature_row[1:])

# def get_train_feature(train_file_name):
    
#     train_file_name = train_file_name.split('.')[0]
    
#     # Return train feature with train file name
#     return train_feature_dict[train_file_name]

In [7]:
# reference: https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
def get_euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Both arguments are converted to float arrays first, so sequences of
    numeric strings are accepted as well as numbers.
    """
    difference = np.array(a, dtype=float) - np.array(b, dtype=float)
    return np.linalg.norm(difference)

In [8]:
# reference: https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value
import operator

def get_top_20_images(desc_test_file, top_k=20):
    """Rank test images for one test description and return the closest ones.

    Steps:
      1. Look up the description's bag-of-words vector in ``test_bag_of_words``.
      2. Ask ``knn_classifier`` for the nearest training description.
      3. Fetch that training item's feature vector from ``train_feature_dict``.
      4. Sort all rows of ``test_features`` by Euclidean distance to that
         vector.

    Args:
        desc_test_file: Key into ``test_bag_of_words``, e.g. ``'5.txt'``.
        top_k: How many image names to return (default 20). If fewer test
            images are available, all of them are returned rather than
            raising ``IndexError``.

    Returns:
        ``np.ndarray`` of test image file names, nearest first.

    Raises:
        ValueError: if ``top_k`` is negative.
        KeyError: if ``desc_test_file`` or the predicted training item is
            missing from the lookup dictionaries.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    bag_of_word = np.array(test_bag_of_words[desc_test_file])
    # The classifier was fitted with description file names as labels,
    # so the prediction looks like ['1234.txt'].
    train_file_name = knn_classifier.predict(bag_of_word.reshape(1, -1))
    selected_train_feature = train_feature_dict[train_file_name[0].split('.')[0]]

    feature_dist = {}
    for test_feature in test_features:
        # Column 0 holds a path with one directory component; keep the file name.
        test_file_name = test_feature[0].split("/")[1]
        feature_dist[test_file_name] = get_euclidean_distance(
            selected_train_feature, test_feature[1:]
        )

    sorted_feature_dist = sorted(feature_dist.items(), key=operator.itemgetter(1))
    # Slicing (unlike indexing 0..19) is safe when fewer than top_k rows exist.
    return np.array([name for name, _ in sorted_feature_dist[:top_k]])

# top_20 = get_top_20_images("5.txt")
# print(top_20)

In [9]:
# Reference: https://stackoverflow.com/questions/42812230/why-plt-imshow-doesnt-display-the-image
import matplotlib.pyplot as plt
from PIL import Image

def display_top_20(image_names=None):
    """Show each listed test image with matplotlib, one figure per image.

    Args:
        image_names: Iterable of file names located under
            ``./data/images_test/``. When omitted, the notebook-level
            variable ``top_20`` is used (the original behaviour); that
            variable must have been assigned beforehand or a ``NameError``
            is raised.
    """
    if image_names is None:
        image_names = top_20
    for image_name in image_names:
        # The context manager closes the underlying file once the pixels have
        # been handed to matplotlib.
        with Image.open('./data/images_test/' + image_name, 'r') as image:
            plt.imshow(np.asarray(image))
        plt.show()
        
# display_top_20()

In [10]:
import time
# dic = {"John": "john@example.com", "Mary": "mary@example.com"} #dictionary
# Maps each test description file name to a space-separated string of the
# image names returned by get_top_20_images for it.
result_dic = {}
start_time = time.time()

# Iterate over every test description; the count comes from the shared
# num_test_file constant rather than a repeated literal.
for i in range(num_test_file):
    desc_test_name = str(i) + '.txt'
    result_dic[desc_test_name] = " ".join(get_top_20_images(desc_test_name))

    if i % 50 == 0:
        print('Progress: ' + str(i) + '/' + str(num_test_file))

print(str(time.time() - start_time) + ' seconds')

Progress: 0/2000
Progress: 50/2000
Progress: 100/2000
Progress: 150/2000
Progress: 200/2000
Progress: 250/2000
Progress: 300/2000
Progress: 350/2000
Progress: 400/2000
Progress: 450/2000
Progress: 500/2000
Progress: 550/2000
Progress: 600/2000
Progress: 650/2000
Progress: 700/2000
Progress: 750/2000
Progress: 800/2000
Progress: 850/2000
Progress: 900/2000
Progress: 950/2000
Progress: 1000/2000
Progress: 1050/2000
Progress: 1100/2000
Progress: 1150/2000
Progress: 1200/2000
Progress: 1250/2000
Progress: 1300/2000
Progress: 1350/2000
Progress: 1400/2000
Progress: 1450/2000
Progress: 1500/2000
Progress: 1550/2000
Progress: 1600/2000
Progress: 1650/2000
Progress: 1700/2000
Progress: 1750/2000
Progress: 1800/2000
Progress: 1850/2000
Progress: 1900/2000
Progress: 1950/2000
12317.41633439064 seconds


In [46]:
# Reference: https://pythonspot.com/save-a-dictionary-to-a-file/
import csv

# Write one row per test description. The context manager guarantees the file
# is flushed and closed, so the CSV is complete on disk when the cell ends.
with open("submission1.csv", "w", newline='') as submission_file:
    w = csv.writer(submission_file)
    # NOTE(review): the header spelling is kept exactly as it was; whatever
    # consumes this file may expect this exact column name - confirm before
    # "fixing" it.
    w.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for key, val in result_dic.items():
        w.writerow([key, val])
    
# # Reference: http://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Writing_Data_to_a_CSV_With_Python.php
# import csv

# download_dir = "submission.csv" #where you want the file to be downloaded to 
# csv = open(download_dir, "w") #"w" indicates that you're writing strings to the file

# columnTitleRow = "Descritpion_ID,Top_20_Image_IDs\n"
# csv.write(columnTitleRow)

# for key in result_dic.keys():
#     desc_id = key
#     top_20_img_id = result_dic[key]
#     row = desc_id + "," + top_20_img_id + "\n"
#     csv.write(row)