In [1]:
import pandas as pd
import numpy as np

# Read the header-less ResNet feature CSV: the first column holds the image
# path, the remaining 1000 columns are labelled 0..999.
ResNet = pd.read_csv('./data/features_train/features_resnet1000_train.csv', header=None)
ResNet.columns = ['fnum'] + list(range(1000))

# Reduce each path to its integer file number (base name up to the first '.').
ResNet['fnum'] = [int(path.split('/')[-1].split('.')[0]) for path in ResNet['fnum']]

# Order the rows by file number and use that number as the row index.
ResNet = ResNet.sort_values('fnum').set_index('fnum')

# Manual implementation of softmax, applied independently to every row.
def _row_softmax(scores):
    """Return the row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating. This does
    not change the mathematical result (the common factor cancels in the
    ratio) but keeps np.exp from overflowing to inf on large scores, which
    would otherwise turn whole rows into nan.
    """
    shifted = scores - scores.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)  # computed once and reused for the row sums
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)


# Matrix of probabilities, one row per image (rows follow ResNet's index order)
# and one column per ResNet output column:
# img num 1: [ P(class=1), P(class=2), ...]
# img num 2: [ P(class=1), P(class=2), ...]
probabilities = _row_softmax(ResNet.values)

In [2]:
from glob import glob
from functools import reduce
from collections import defaultdict, Counter

def counter_to_df_row(counter, index):
    """Turn a mapping of counts into a one-row DataFrame.

    Parameters
    ----------
    counter : Counter or dict
        Maps a label to its count; each label becomes a column.
    index : hashable
        Index label given to the single row.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``index`` with one column per key of ``counter``.
        When ``counter`` is empty the frame has that row and no columns.
    """
    # Guard clause: nothing counted -> a row with no columns.
    if not counter:
        return pd.DataFrame(index=[index])

    # A single record (label -> count) yields exactly one row.
    return pd.DataFrame([dict(counter)], index=[index])

def get_tags_from(fname):
    """Parse one tag file into single-row category and subcategory count frames.

    Every non-blank line of the file is split on ``:``; the first piece is
    counted as the category and the second piece as the subcategory. The
    file number is the part of the file's base name before the first ``.``
    and labels the single row of both returned frames.

    Parameters
    ----------
    fname : str
        Path to a tag file whose base name starts with an integer
        (for example ``.../123.txt``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(cat_row, subcat_row)``: one-row frames holding the category counts
        and the subcategory counts, both indexed by the file number.

    Raises
    ------
    ValueError
        If the base name does not start with an integer.
    IndexError
        If a non-blank line contains no ``:`` separator.
    """
    import os  # local import: only needed here for portable path handling

    # os.path.basename understands the platform's path separators, so paths
    # that glob returns with backslashes on Windows are handled too.
    fnum = int(os.path.basename(fname).split('.')[0])

    with open(fname) as f:
        # Blank lines (e.g. a trailing empty line) carry no tag and would
        # otherwise fail the subcategory lookup below, so they are dropped.
        tags = [line for line in f.read().splitlines() if line.strip()]

    # Split each tag once and reuse the pieces for both counters.
    pieces = [tag.split(':') for tag in tags]
    categories = Counter(parts[0] for parts in pieces)
    subcategories = Counter(parts[1] for parts in pieces)

    cat_row = counter_to_df_row(categories, fnum)
    subcat_row = counter_to_df_row(subcategories, fnum)

    return cat_row, subcat_row
        

# Parse every training tag file into a (category_row, subcategory_row) pair.
files = glob('./data/tags_train/*')
all_tags = [get_tags_from(file) for file in files]
cat_rows, subcat_rows = zip(*all_tags)

# Stack the per-file rows with a single concat per tag level.
# DataFrame.append was removed in pandas 2.0, and appending inside reduce
# re-copied the accumulated frame on every step.
# Tags missing from a file come out of the column union as NaN and are
# replaced by a count of 0; rows are then ordered by file number.
cats = pd.concat(cat_rows, sort=False).fillna(0).sort_index()
subcats = pd.concat(subcat_rows, sort=False).fillna(0).sort_index()

In [4]:
# Load the comma-separated token matrix, skipping the header row.
# The path is passed directly (instead of an open file object) so numpy opens
# and closes the file itself and no handle is leaked. ndmin=2 keeps the result
# two-dimensional even if the file holds a single data row.
# NOTE(review): this cell reads from ./pipeline/data/ while the other cells
# read from ./data/ -- confirm the path is intentional.
descriptions = np.loadtxt("./pipeline/data/tokens_train.csv", delimiter=",", skiprows=1, ndmin=2)

# Drop the first column; only the remaining columns are used downstream.
descriptions = descriptions[:, 1:]

In [53]:
from sklearn.naive_bayes import MultinomialNB

def get_probability_matrix(descriptions_matrix, category_df):
    """Fit one MultinomialNB per column of ``category_df`` and score each feature.

    For every column, the target is 1 where the column value is greater than
    0 and 0 otherwise. The fitted classifier is then evaluated on the
    identity matrix, i.e. on one vector per column of ``descriptions_matrix``
    with a single 1 in that position, and the predicted probability of the
    positive class (1) is stored.

    Parameters
    ----------
    descriptions_matrix : 2-D array
        Training features, one row per sample.
    category_df : pandas.DataFrame
        One column per category, with the same number of rows as
        ``descriptions_matrix``.

    Returns
    -------
    category_probabilities : numpy.ndarray
        Array of shape ``(descriptions_matrix.shape[1], n_columns)``; entry
        ``[j, i]`` is the positive-class probability that classifier ``i``
        assigns to the j-th one-hot vector.
    classifiers : list of MultinomialNB
        The fitted classifiers, in column order.
    """
    n_features = descriptions_matrix.shape[1]
    n_categories = len(category_df.columns)

    classifiers = []
    category_probabilities = np.zeros((n_features, n_categories))
    token_vectors = np.eye(n_features)  # one one-hot row per feature

    for i in range(n_categories):
        # Positional access keeps this a single column even when column
        # labels are duplicated.
        positive_mask = np.asarray(category_df.iloc[:, i] > 0).astype(int)

        clf = MultinomialNB().fit(descriptions_matrix, positive_mask)
        classifiers.append(clf)

        # predict_proba has one output column per class seen during fit, in
        # the order of clf.classes_. When a column is all-positive or
        # all-negative only one class exists, so the positive class is looked
        # up by label instead of assuming it sits at position 1.
        positive_pos = np.flatnonzero(clf.classes_ == 1)
        if positive_pos.size:
            predictions = clf.predict_proba(token_vectors)
            category_probabilities[:, i] = predictions[:, positive_pos[0]]
        # Otherwise the positive class never occurred in training and the
        # column keeps its initial probability of 0.

    return category_probabilities, classifiers

# Fit the per-column classifiers twice against the same description matrix:
# once for the category columns and once for the subcategory columns.
cat_probs, cat_clfs = get_probability_matrix(
    descriptions_matrix=descriptions,
    category_df=cats,
)
subcat_probs, subcat_clfs = get_probability_matrix(
    descriptions_matrix=descriptions,
    category_df=subcats,
)