In [1]:
import pandas as pd
import numpy as np

# Load the ResNet output table: the first CSV column holds an image path, the
# remaining 1000 columns hold one raw score per ResNet class.
ResNet = pd.read_csv('./data/features_train/features_resnet1000_train.csv', header=None)
ResNet.columns = ['fnum'] + list(range(1000))

# Reduce each path to its numeric file stem ("some/dir/123.jpg" -> 123) so the
# rows can be ordered and looked up by image number.
ResNet['fnum'] = ResNet['fnum'].apply(lambda x: int(x.split('/')[-1].split('.')[0]))
ResNet = ResNet.sort_values('fnum').set_index('fnum')

# Manual implementation of Softmax
# Matrix of probabilities

# img num 1: [ P(class=1), P(class=2), ...]
# img num 2: [ P(class=1), P(class=2), ...]
#
# The row-wise maximum is subtracted before exponentiating. Softmax is
# invariant to adding a constant to every score in a row, so the result is
# mathematically the same, but the largest exponent becomes exp(0) == 1 and
# np.exp can no longer overflow to inf (which would produce NaN rows).
logits = ResNet.values
exp_shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
probabilities = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)

In [2]:
from glob import glob
from functools import reduce
from collections import defaultdict, Counter

def counter_to_df_row(counter, index):
    """Turn a mapping of label -> count into a single-row DataFrame.

    Parameters
    ----------
    counter : Counter or dict
        Counts keyed by label; each key becomes a column.
    index : hashable
        Label used for the single row of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``index`` with one column per key of ``counter``.
        An empty ``counter`` yields a one-row frame with no columns.
    """
    # Guard clause: nothing was counted, so emit a column-less placeholder row.
    if not counter:
        return pd.DataFrame(index=[index])

    # from_dict(orient='index') puts the keys on the rows; transposing moves
    # them onto the columns, leaving a single row that is then relabelled.
    single_row = pd.DataFrame.from_dict(counter, orient='index').transpose()
    single_row.index = [index]
    return single_row

def get_tags_from(fname):
    """Read one tag file and count its categories and subcategories.

    Each non-blank line of the file is split on ':'; the first piece is
    counted as the category and the second piece as the subcategory.

    Parameters
    ----------
    fname : str
        Path to a tag file whose base name starts with an integer followed
        by a '.', e.g. ``.../123.txt``. That integer becomes the row label.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(cat_row, subcat_row)``: one-row frames indexed by the file number,
        with one column per distinct category / subcategory and the number
        of occurrences as values. A file with no tags yields column-less rows.

    Raises
    ------
    ValueError
        If the file's base name does not start with an integer.
    IndexError
        If a non-blank line contains no ':' separator.
    """
    import os  # local import so this definition does not depend on another cell

    # os.path.basename understands the platform's path separators, so the
    # backslash-separated paths glob returns on Windows are handled as well.
    fnum = int(os.path.basename(fname).split('.')[0])

    # Only the read needs the file handle; release it before processing.
    with open(fname) as f:
        # Blank lines (e.g. a trailing empty line) carry no tag; skipping them
        # avoids an IndexError on the missing ':' below.
        tags = [line for line in f.read().splitlines() if line.strip()]

    pieces = [tag.split(':') for tag in tags]
    categories = Counter(piece[0] for piece in pieces)
    subcategories = Counter(piece[1] for piece in pieces)

    cat_row = counter_to_df_row(categories, fnum)
    subcat_row = counter_to_df_row(subcategories, fnum)

    return cat_row, subcat_row
        

# Sorting makes the column order of `cats` / `subcats` (first-seen order across
# files) reproducible; the order glob returns on its own is filesystem-dependent.
files = sorted(glob('./data/tags_train/*'))
if not files:
    raise FileNotFoundError("no tag files matched './data/tags_train/*'")

# One (category_row, subcategory_row) pair per file, regrouped into two
# parallel sequences of one-row frames.
all_tags = [get_tags_from(file) for file in files]
cat_rows, subcat_rows = zip(*all_tags)

# A single concat replaces the pairwise DataFrame.append chain: append was
# removed in pandas 2.0, and re-copying the accumulated frame for every file
# is quadratic in the number of files. Labels absent from a file come out as
# NaN and are filled with a count of 0; rows are ordered by file number.
cats = pd.concat(list(cat_rows), sort=False).fillna(0).sort_index()
subcats = pd.concat(list(subcat_rows), sort=False).fillna(0).sort_index()

In [8]:
# Training token table: read the CSV and promote its 'Unnamed: 0' column
# to the row index in a single chained expression.
descriptions = pd.read_csv("./pipeline/data/tokens_train.csv").set_index('Unnamed: 0')

In [9]:
# Test token table: read the CSV and promote its 'Unnamed: 0' column
# to the row index in a single chained expression.
descriptions_test = pd.read_csv("./pipeline/data/tokens_test.csv").set_index('Unnamed: 0')

Unnamed: 0_level_0,venue,neon,rug,scatter,taster,con,darkly,remote,crab,buffalo,...,batter,reachable,wed,counter,wing,umbella,carouse,danger,merry,rock
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Goal: convert the test descriptions into token vectors that line up with the
# training data. As a first step, tally in how many of the two token tables
# (test first, then train) each column name occurs: a count of 2 means the
# token is present in both, 1 means it is present in only one of them.
a = Counter(descriptions_test.columns)
a.update(descriptions.columns)
[]
# len(set(descriptions.columns) - set(descriptions_test.columns))

Counter({'venue': 2,
         'neon': 2,
         'rug': 2,
         'scatter': 2,
         'taster': 1,
         'con': 2,
         'darkly': 2,
         'remote': 2,
         'crab': 2,
         'buffalo': 2,
         'surfer': 2,
         'meat': 2,
         'appear': 2,
         'vertical': 2,
         'care': 2,
         'color': 2,
         'reservoir': 1,
         'compact': 2,
         'medal': 2,
         'string': 2,
         'tiny': 2,
         'gloomy': 1,
         'badminton': 2,
         'casual': 2,
         'dup': 2,
         'sill': 2,
         'shiny': 2,
         'try': 2,
         'guard': 2,
         'unable': 2,
         'swing': 2,
         'late': 2,
         'box': 2,
         'pizza': 2,
         'blossom': 2,
         'money': 2,
         'cardboard': 2,
         'owner': 2,
         'promote': 2,
         'stonework': 1,
         'stroll': 2,
         'piano': 2,
         'front': 2,
         'ur': 2,
         'male': 2,
         'device': 2,
         'breed

In [4]:
from sklearn.naive_bayes import MultinomialNB

def get_probability_matrix(descriptions_matrix, category_df):
    """Fit one MultinomialNB per category and score every single token.

    For each column of ``category_df`` a binary target is built (1 where the
    value is > 0, else 0) and a ``MultinomialNB`` is fitted on
    ``descriptions_matrix``. Each classifier is then evaluated on the identity
    matrix, i.e. on one-hot vectors containing exactly one token, and the
    predicted probability of the positive class is stored.

    Parameters
    ----------
    descriptions_matrix : 2-D dense array-like (numpy.ndarray or DataFrame)
        One row per sample, one column per token.
    category_df : pandas.DataFrame
        One row per sample (same order as ``descriptions_matrix``), one column
        per category.

    Returns
    -------
    (numpy.ndarray, list)
        ``category_probabilities`` with shape ``(n_tokens, n_categories)``,
        where entry ``[t, c]`` is the positive-class probability category
        ``c``'s classifier assigns to a description made of token ``t`` alone,
        and the list of fitted classifiers in column order.

    Raises
    ------
    ValueError
        If ``descriptions_matrix`` is not two-dimensional.
    """
    # Normalise to a plain ndarray: positional indexing such as matrix[0, :]
    # is not supported by DataFrames, and fitting on an ndarray keeps the
    # training input the same kind of object as the ndarray scored below.
    features = np.asarray(descriptions_matrix)
    if features.ndim != 2:
        raise ValueError("descriptions_matrix must be 2-D (samples x tokens)")

    n_tokens = features.shape[1]
    n_categories = len(category_df.columns)

    classifiers = list()
    category_probabilities = np.zeros((n_tokens, n_categories))
    # Identity matrix: row t is the one-hot vector for token t.
    token_vectors = np.eye(n_tokens)

    for i in range(n_categories):
        # Positional column access stays a Series even if names are duplicated.
        col = category_df.iloc[:, i]
        flattened = np.array((col > 0).astype(int))

        clf = MultinomialNB().fit(features, flattened)
        classifiers.append(clf)
        predictions = clf.predict_proba(token_vectors)

        # predict_proba has one column per class seen during fit. Look the
        # positive class up explicitly instead of assuming it is column 1:
        # if every label was 0 there is no such column and the probability
        # stays at the initial 0; if every label was 1 it is the only column.
        positive = np.flatnonzero(clf.classes_ == 1)
        if positive.size:
            category_probabilities[:, i] = predictions[:, positive[0]]

    return category_probabilities, classifiers

# Pass the raw token array rather than the DataFrame itself:
# get_probability_matrix treats its first argument as a 2-D NumPy matrix.
description_counts = descriptions.values
cat_probs, cat_clfs = get_probability_matrix(description_counts, cats)
subcat_probs, subcat_clfs = get_probability_matrix(description_counts, subcats)