In [2]:
import os
from collections import defaultdict, Counter
from functools import reduce
from glob import glob

import pandas as pd

def counter_to_df_row(counter, index):
    """Turn a mapping of label -> count into a one-row DataFrame.

    Parameters
    ----------
    counter : mapping (e.g. ``collections.Counter``)
        Labels and their counts; each label becomes a column.
    index : hashable
        Label given to the single row that is returned.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``index``. When ``counter`` is empty the row has
        no columns at all.
    """
    if not counter:
        # Nothing was counted: keep the row label but no columns.
        return pd.DataFrame(index=[index])

    # from_dict(orient='index') puts the labels on the row axis of a
    # one-column frame; flipping it gives one row with a column per label.
    single_row = pd.DataFrame.from_dict(counter, orient='index').T
    single_row.index = [index]
    return single_row

def get_tags_from(fname):
    """Count the categories and subcategories listed in one tag file.

    Every non-blank line of the file is split on ``':'``; the first piece
    is counted as a category and the second as a subcategory. The part of
    the file's base name before the first ``'.'`` is parsed as an integer
    and used as the row label of both returned frames.

    Parameters
    ----------
    fname : str
        Path to a tag file whose base name starts with an integer
        (e.g. ``.../12.txt``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(cat_row, subcat_row)``: one-row count frames built by
        ``counter_to_df_row``.

    Raises
    ------
    ValueError
        If the base name does not start with an integer.
    IndexError
        If a non-blank line contains no ``':'``.
    """
    # os.path.basename understands the platform's separators, so paths that
    # glob returns with backslashes on Windows are parsed correctly too.
    fnum = int(os.path.basename(fname).split('.')[0])

    with open(fname) as f:
        # Blank lines (such as a stray empty line at the end of the file)
        # carry no tag and would break the ':' split below, so drop them.
        tags = [line for line in f.read().splitlines() if line.strip()]

    pieces = [tag.split(':') for tag in tags]
    categories = Counter(parts[0] for parts in pieces)
    subcategories = Counter(parts[1] for parts in pieces)

    cat_row = counter_to_df_row(categories, fnum)
    subcat_row = counter_to_df_row(subcategories, fnum)

    return cat_row, subcat_row
        

# Parse every training tag file into a (category row, subcategory row) pair.
files = glob('./data/tags_train/*')
all_tags = [get_tags_from(file) for file in files]
cat_rows, subcat_rows = tuple(zip(*all_tags))

# Stack the per-file rows in a single pd.concat call: DataFrame.append was
# removed in pandas 2.0, and appending pairwise re-copies the growing frame
# for every file. Labels a file never mentions come out as NaN, i.e. a count
# of zero. Rows are finally ordered by file number.
cats = pd.concat(list(cat_rows), sort=False).fillna(0).sort_index()
subcats = pd.concat(list(subcat_rows), sort=False).fillna(0).sort_index()

In [3]:
# Token tables for the train and test descriptions; the unnamed first CSV
# column is used as the row index.
descriptions = pd.read_csv("./pipeline/data/tokens_train.csv").set_index('Unnamed: 0')
descriptions_test = pd.read_csv("./pipeline/data/tokens_test.csv").set_index('Unnamed: 0')

# Convert the test table into vectors over the training token columns:
# columns that only exist in training are added filled with 0, columns that
# only exist in test are dropped, and the column order follows the training
# table. A single reindex does this without inserting columns one at a time,
# which fragments the frame when many columns are missing.
descriptions_test = descriptions_test.reindex(columns=descriptions.columns, fill_value=0)

In [5]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize

def get_probability_matrix(descriptions_matrix, category_df):
    """Estimate, for every token column, the probability of each label.

    One binary ``MultinomialNB`` is fitted per column of ``category_df``
    (target is 1 where the column value is > 0, else 0). Each classifier
    is then asked for the positive-class probability of every unit vector,
    i.e. of an input in which exactly one token column is set to 1.

    Parameters
    ----------
    descriptions_matrix : 2-D array, shape (n_samples, n_tokens)
        Training features passed to ``MultinomialNB.fit``.
    category_df : pandas.DataFrame
        One column per label; must have one row per row of
        ``descriptions_matrix`` (required by ``fit``).

    Returns
    -------
    (numpy.ndarray, list)
        ``category_probabilities`` of shape (n_tokens, n_labels) and the
        list of fitted classifiers, in column order.
    """
    n_tokens = descriptions_matrix.shape[1]
    label_names = category_df.columns

    category_probabilities = np.zeros((n_tokens, len(label_names)))
    # Row k of the identity matrix is the input with only token k present.
    token_vectors = np.eye(n_tokens)

    classifiers = []
    for i, label in enumerate(label_names):
        is_present = np.array((category_df[label] > 0).astype(int))
        clf = MultinomialNB().fit(descriptions_matrix, is_present)
        classifiers.append(clf)

        predictions = clf.predict_proba(token_vectors)
        # predict_proba has one column per entry of clf.classes_. When the
        # label is present in every sample (or in none) there is only one
        # class, so a fixed column index of 1 would be out of range; look
        # the positive class up instead. If class 1 was never seen the
        # column keeps its initial value of 0.
        positive = np.flatnonzero(clf.classes_ == 1)
        if positive.size:
            category_probabilities[:, i] = predictions[:, positive[0]]

    return category_probabilities, classifiers

# Per-token probabilities for categories and subcategories, learned from the
# training descriptions.
cat_probs, cat_clfs = get_probability_matrix(descriptions.values, cats)
subcat_probs, subcat_clfs = get_probability_matrix(descriptions.values, subcats)

# Normalise the test token vectors once and reuse the result for both
# projections instead of recomputing the same matrix twice.
descriptions_test_normalized = normalize(descriptions_test.values)
cat_probs_per_token = descriptions_test_normalized.dot(cat_probs)
subcat_probs_per_token = descriptions_test_normalized.dot(subcat_probs)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Image side: observed subcategory counts followed by category counts.
image_categories = np.concatenate((subcats, cats), axis=1)

# Description side: predicted scores laid out in the same column order
# (subcategories first, then categories) so the two are comparable.
description_categories = np.concatenate(
    (subcat_probs_per_token, cat_probs_per_token),
    axis=1,
)

# Rows are test descriptions, columns are training images.
similarities = cosine_similarity(X=description_categories, Y=image_categories)

In [19]:
def reverse(matrix):
    """Return, for each row, the column indices ordered by descending value.

    Parameters
    ----------
    matrix : 2-D numpy.ndarray
        Values to rank row by row.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``matrix``; ``result[i, 0]`` is
        the column index of the largest value in row ``i``. The float dtype
        is kept for compatibility with the previous implementation, which
        filled an ``np.zeros`` array.
    """
    # Rank the argument itself; the earlier version read the notebook-level
    # `similarities` variable and ignored `matrix`.
    descending = np.argsort(matrix, axis=1)[:, ::-1]
    return descending.astype(float)

def similarity_to_results(similarities_matrix, fname, top_k=20):
    """Write the most similar column indices of every row to a CSV file.

    For row ``i`` the ``top_k`` columns with the highest similarity are
    listed, best first, as ``"<col>.jpg"`` names separated by spaces, under
    the row id ``"<i>.txt"``.

    Parameters
    ----------
    similarities_matrix : 2-D array, shape (n_rows, n_columns)
        Higher values mean more similar.
    fname : str
        Destination path of the CSV file.
    top_k : int, default 20
        Number of columns listed per row; fewer are written when the
        matrix has fewer columns.

    Notes
    -----
    Column positions are written as image ids, so column ``j`` is assumed
    to correspond to ``j.jpg`` -- TODO confirm against how the image rows
    were ordered.
    """
    scores = np.asarray(similarities_matrix)

    # Sort by descending similarity. Negating lets a stable ascending sort
    # put the best match first and break ties by the lower column index.
    # (The previous lookup returned the rank position of columns 0..19
    # rather than the best-ranked columns.)
    ranked = np.argsort(-scores, axis=1, kind='stable')[:, :top_k]

    # One entry per row of the matrix instead of a fixed count of 2000.
    description_ids = [str(i) + '.txt' for i in range(ranked.shape[0])]
    image_lists = [
        '.jpg '.join(str(j) for j in row) + '.jpg' for row in ranked
    ]

    # Create Submission
    # NOTE(review): the header spelling is reproduced exactly as before;
    # presumably it has to match the expected submission format -- confirm
    # before "correcting" it.
    results = pd.DataFrame({
        'Descritpion_ID': description_ids,
        'Top_20_Image_IDs': image_lists,
    })
    results.to_csv(fname, index=False, index_label=False)

# Write the ranking for every test description to the submission file.
similarity_to_results(
    similarities_matrix=similarities,
    fname='submission.csv',
)