In [4]:
import pandas as pd
import numpy as np

# Load the 1000-dimensional ResNet outputs for the training images.
# The file has no header row: column 0 is a path-like string, the rest are scores.
ResNet = pd.read_csv('./data/features_train/features_resnet1000_train.csv', header=None)
ResNet.columns = ['fnum'] + list(range(1000))
# Reduce the path-like first column to its integer file stem (text before the first '.').
ResNet['fnum'] = ResNet['fnum'].apply(lambda x: int(x.split('/')[-1].split('.')[0]))
# Chained (non in-place) form so re-running the cell from the read_csv line is idempotent.
ResNet = ResNet.sort_values('fnum').set_index('fnum')

# Manual implementation of softmax.
# Matrix of probabilities, one row per image (rows ordered by fnum):
#
# img num 1: [ P(class=1), P(class=2), ...]
# img num 2: [ P(class=1), P(class=2), ...]
#
# Subtracting each row's maximum before exponentiating leaves the result
# mathematically unchanged but prevents np.exp from overflowing to inf
# (which would turn the whole row into NaN after the division).
logits = ResNet.values
exp_logits = np.exp(logits - logits.max(axis=1, keepdims=True))
probabilities = exp_logits / exp_logits.sum(axis=1, keepdims=True)

In [5]:
from glob import glob
from functools import reduce
from collections import defaultdict, Counter

def counter_to_df_row(counter, index):
    """Turn a mapping of counts into a single-row DataFrame labelled ``index``.

    Keys of ``counter`` become the columns and its values the row's cells.
    An empty mapping yields a one-row frame that has no columns at all.
    """
    # Guard clause: nothing to spread across columns.
    if not counter:
        return pd.DataFrame(index=[index])

    # from_dict(orient='index') builds one column with the keys as row labels;
    # transposing flips that into one row with the keys as column labels.
    frame = pd.DataFrame.from_dict(counter, orient='index').T
    frame.index = [index]
    return frame

def get_tags_from(fname):
    """Read one tag file and count its categories and subcategories.

    Every non-blank line is split on ``':'``; the first piece is counted as
    the category and the second piece as the subcategory. Blank lines are
    skipped so a trailing empty line does not abort the whole load.

    Parameters
    ----------
    fname : str
        Path to the tag file. The part of its base name before the first
        ``'.'`` must parse as an integer; it becomes the row label.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(cat_row, subcat_row)``: one-row frames of category counts and
        subcategory counts, both labelled with the file number.

    Raises
    ------
    ValueError
        If the file's base name does not start with an integer stem.
    IndexError
        If a non-blank line contains no ``':'`` separator.
    """
    import os  # local import: the notebook does not import os at cell level

    with open(fname) as f:
        # Keep only lines that carry content; blank lines have no tag to parse.
        tags = [line for line in f.read().splitlines() if line.strip()]

    # basename handles the platform's path separators instead of assuming '/'.
    fnum = int(os.path.basename(fname).split('.')[0])

    pieces = [tag.split(':') for tag in tags]
    categories = Counter(parts[0] for parts in pieces)
    subcategories = Counter(parts[1] for parts in pieces)

    cat_row = counter_to_df_row(categories, fnum)
    subcat_row = counter_to_df_row(subcategories, fnum)

    return cat_row, subcat_row
        

# Parse every training tag file into (category row, subcategory row) pairs,
# then regroup them into one tuple of category rows and one of subcategory rows.
files = glob('./data/tags_train/*')
all_tags = [get_tags_from(file) for file in files]
cats, subcats = tuple(zip(*all_tags))

# Stack all rows in a single pd.concat call. The previous
# reduce(lambda x, y: x.append(y), ...) copied the growing frame on every step
# and relies on DataFrame.append, which no longer exists in pandas >= 2.0.
# Tags absent from a file come out as NaN after stacking and are set to 0.
cats = pd.concat(list(cats)).fillna(0).sort_index()
subcats = pd.concat(list(subcats)).fillna(0).sort_index()

In [20]:
# Load the training token matrix and use its 'Unnamed: 0' column as the row index
# (presumably the index column written out when the CSV was saved; confirm).
descriptions = pd.read_csv("tokens.csv").set_index('Unnamed: 0')


TypeError: unhashable type: 'slice'

In [17]:
# Load the test token matrix and use its 'Unnamed: 0' column as the row index
# (presumably the index column written out when the CSV was saved; confirm).
descriptions_test = pd.read_csv("./tokens_test.csv").set_index('Unnamed: 0')

In [18]:
# Align the test token matrix with the training vocabulary in one step:
#   * tokens that only occur in training become all-zero columns,
#   * tokens that only occur in the test set are dropped,
#   * column order matches `descriptions` exactly.
# A single reindex avoids inserting thousands of columns one at a time, which
# fragments the frame and triggers pandas' PerformanceWarning.
descriptions_test = descriptions_test.reindex(columns=descriptions.columns, fill_value=0)
descriptions_test.shape

(2000, 5814)

In [21]:
from sklearn.naive_bayes import MultinomialNB

def get_probability_matrix(descriptions_matrix, category_df):
    """Fit one binary Multinomial Naive Bayes model per tag and score every token.

    For each column of ``category_df`` the training target is whether the
    value is greater than zero. Each fitted model is then queried with the
    rows of an identity matrix (one one-hot vector per token) and the
    probability of the positive class is stored.

    Parameters
    ----------
    descriptions_matrix : pandas.DataFrame or 2-D array
        Training features, one column per token.
    category_df : pandas.DataFrame
        Tag values, one column per tag. Rows are assumed to line up with the
        rows of ``descriptions_matrix`` - TODO confirm at the call site.

    Returns
    -------
    category_probabilities : numpy.ndarray, shape (n_tokens, n_tags)
        Entry ``[t, c]`` is the positive-class probability model ``c`` gives
        to the one-hot vector of token ``t``.
    classifiers : list of MultinomialNB
        The fitted models, in the column order of ``category_df``.
    """
    # shape[1] works for frames and arrays alike and does not need a first row.
    n_tokens = descriptions_matrix.shape[1]
    n_categories = len(category_df.columns)

    category_probabilities = np.zeros((n_tokens, n_categories))
    token_vectors = np.eye(n_tokens)
    classifiers = []

    for i in range(n_categories):
        # Positional access stays a Series even if two columns share a name.
        col = category_df.iloc[:, i]
        flattened = np.array((col > 0).astype(int))

        clf = MultinomialNB().fit(descriptions_matrix, flattened)
        classifiers.append(clf)
        predictions = clf.predict_proba(token_vectors)

        # predict_proba has one column per class seen during fit. When a tag
        # is present in every sample (or in none) only one class exists, so
        # blindly taking column 1 would raise IndexError. Look the positive
        # class up explicitly; if it was never seen the column stays at 0.
        positive = np.flatnonzero(clf.classes_ == 1)
        if positive.size:
            category_probabilities[:, i] = predictions[:, positive[0]]

    return category_probabilities, classifiers

# Run the per-tag model fitting twice on the same description matrix:
# first against the category frame, then against the subcategory frame.
(cat_probs, cat_clfs), (subcat_probs, subcat_clfs) = (
    get_probability_matrix(descriptions, tag_frame)
    for tag_frame in (cats, subcats)
)

In [27]:
from sklearn.preprocessing import normalize

# Row-normalise the test token matrix once (normalize's default norm) and reuse
# it for both projections instead of recomputing the same matrix twice.
descriptions_test_normalized = normalize(descriptions_test.values)

# Project every test description onto the per-token tag probabilities.
cat_probs_per_token = descriptions_test_normalized.dot(cat_probs)
subcat_probs_per_token = descriptions_test_normalized.dot(subcat_probs)

In [28]:
from glob import glob
from functools import reduce
from collections import defaultdict, Counter

# Calculate similarities to the image tag and subtag vectors.
# First build the tag count vectors for the test images.

files = glob('./data/tags_test/*')
all_tags = [get_tags_from(file) for file in files]
cats, subcats = tuple(zip(*all_tags))

# Stack all rows with a single pd.concat call (DataFrame.append is quadratic
# when chained through reduce and was removed in pandas 2.0), set tags missing
# from a file to 0, order rows by file number, then row-normalise.
# After this cell `cats` / `subcats` are plain arrays describing the TEST images.
# NOTE(review): the columns follow the order in which tags are first seen in
# the test files; the similarity step downstream presumably needs them to match
# the training tag columns one-to-one - confirm the two column sets are aligned.
cats = normalize(pd.concat(list(cats)).fillna(0).sort_index())
subcats = normalize(pd.concat(list(subcats)).fillna(0).sort_index())

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances


# For each test description, find the 20 most similar images.
# similarities[i, j]: row i is a test description, column j is an image.
similarities = cosine_similarity(cat_probs_per_token, cats)

# argsort is ascending, so sort the negated scores to put the best match first.
# sort_indices[i, r] is then the image column ranked r-th for description i.
# (The previous np.where(sort_indices[i] == j) lookup returned the rank of
# images 0..19 rather than the 20 top-ranked images.)
# kind='stable' makes tie-breaking deterministic (lower column index first).
sort_indices = (-similarities).argsort(axis=1, kind='stable')

# Create submission: one row per description, image ids joined as "<id>.jpg".
# Column positions are used as image ids, which assumes the image rows are
# ordered by file number starting at 0 - TODO confirm.
# The 'Descritpion_ID' spelling is kept byte-for-byte as the header the
# existing submissions use.
results = pd.DataFrame({
    'Descritpion_ID': [str(i) + '.txt' for i in range(len(sort_indices))],
    'Top_20_Image_IDs': [
        '.jpg '.join(str(image_idx) for image_idx in ranked[:20]) + '.jpg'
        for ranked in sort_indices
    ],
})
results.to_csv('submission2.csv', index=False)

## Subcats

In [125]:
from sklearn.metrics.pairwise import cosine_similarity

# For each test description, find the 20 most similar images (subcategory view).
# similarities[i, j]: row i is a test description, column j is an image.
similarities = cosine_similarity(subcat_probs_per_token, subcats)

# argsort is ascending, so sort the negated scores to put the best match first.
# sort_indices[i, r] is then the image column ranked r-th for description i.
# (The previous np.where(sort_indices[i] == j) lookup returned the rank of
# images 0..19 rather than the 20 top-ranked images.)
# kind='stable' makes tie-breaking deterministic (lower column index first).
sort_indices = (-similarities).argsort(axis=1, kind='stable')

# Create submission: one row per description, image ids joined as "<id>.jpg".
# Column positions are used as image ids, which assumes the image rows are
# ordered by file number starting at 0 - TODO confirm.
# The 'Descritpion_ID' spelling is kept byte-for-byte as the header the
# existing submissions use.
results = pd.DataFrame({
    'Descritpion_ID': [str(i) + '.txt' for i in range(len(sort_indices))],
    'Top_20_Image_IDs': [
        '.jpg '.join(str(image_idx) for image_idx in ranked[:20]) + '.jpg'
        for ranked in sort_indices
    ],
})
results.to_csv('submission.csv', index=False)

In [1]:
def similarity_to_results(similarities_matrix, fname, top_k=20):
    """Write the best-matching image ids for every description to a CSV file.

    Parameters
    ----------
    similarities_matrix : numpy.ndarray, 2-D
        Entry ``[i, j]`` scores description ``i`` against image ``j``;
        larger means more similar.
    fname : str
        Destination path of the CSV file.
    top_k : int, default 20
        How many image ids to list per description. Fewer are written when
        the matrix has fewer columns.

    Returns
    -------
    None
        The result is only written to ``fname``. The file has the columns
        ``Descritpion_ID`` (``"<row>.txt"``) and ``Top_20_Image_IDs``
        (space-separated ``"<column>.jpg"`` entries, best match first).
    """
    # argsort is ascending, so sort the negated scores: ranked[i, r] is the
    # image column in r-th place for description i. kind='stable' keeps the
    # lower column index first on ties, making the output deterministic.
    ranked = (-similarities_matrix).argsort(axis=1, kind='stable')

    # Take the first top_k entries of each ranking directly. (Looking up
    # where image j sits in the ranking, as before, yields the rank of
    # images 0..19 instead of the top-ranked images.) Iterating over the
    # rows also removes the hard-coded count of 2000 descriptions.
    description_ids = [str(i) + '.txt' for i in range(len(ranked))]
    image_lists = [
        '.jpg '.join(str(image_idx) for image_idx in row[:top_k]) + '.jpg'
        for row in ranked
    ]

    # Create submission. The 'Descritpion_ID' spelling is kept byte-for-byte
    # as the header the existing submissions use.
    submission = pd.DataFrame({
        'Descritpion_ID': description_ids,
        'Top_20_Image_IDs': image_lists,
    })
    submission.to_csv(fname, index=False)

In [141]:
# Place the subcategory scores and the category scores side by side
# (subcategory columns first) for the descriptions and for the images alike.
description_categories = np.hstack((subcat_probs_per_token, cat_probs_per_token))
image_categories = np.hstack((subcats, cats))

# Compare every description row with every image row on the combined features.
similarities = cosine_similarity(description_categories, image_categories)

In [143]:
# Write the submission built from the combined category + subcategory similarities.
similarity_to_results(similarities_matrix=similarities, fname='submission.csv')

In [None]:


def sort(matrix):
    """Sort every row of ``matrix`` in place along axis 1.

    The in-place ``sort`` method returns ``None``, so there is no result worth
    binding to a local; the caller's ``matrix`` object itself is reordered
    and this function returns ``None``.
    """
    matrix.sort(axis=1)
        
