In [10]:
from glob import glob
from functools import reduce
from collections import defaultdict, Counter
from sklearn.preprocessing import normalize
import pandas as pd

def counter_to_df_row(counter, index):
    if bool(counter):
        row = pd.DataFrame.from_dict(counter, orient='index').transpose()
        row.index = [index]
        return row
    else:
        return pd.DataFrame(index=[index])

def get_tags_from(fname):
    with open(fname) as f:
        fnum = int(fname.split('/')[-1].split('.')[0])
        tags = f.read().splitlines()
        categories = Counter([tag.split(':')[0] for tag in tags])
        subcategories = Counter([tag.split(':')[1] for tag in tags])
        
        cat_row = counter_to_df_row(categories, fnum)
        subcat_row = counter_to_df_row(subcategories, fnum)
        
    return cat_row, subcat_row
        
# First get tag vector data

files = glob('./data/tags_test/*')
all_tags = [get_tags_from(file) for file in files]
cats, subcats = tuple(zip(*all_tags))

cats = reduce(lambda x, y: x.append(y), cats)
cats.fillna(0, inplace=True)
cats = cats.sort_index()

subcats = reduce(lambda x, y: x.append(y), subcats)
subcats.fillna(0, inplace=True)
subcats = subcats.sort_index()

In [13]:
tags = list(cats.columns) + list(subcats.columns)

In [141]:
from sklearn.preprocessing import normalize
import numpy as np 

test_tag_matrix = np.zeros((2000,91))
files = glob('./data/descriptions_test/*')
for fname in files:
    fnum = int(fname.split('/')[-1].split('.')[0])
    with open(fname,'r') as f:
        text = f.read()
        test_tag_matrix[fnum,:] = np.array([tag in text for tag in tags]).astype(int)

test_tag_matrix = normalize(test_tag_matrix,axis=1)


In [142]:
def reverse(matrix):
    print("Reversing matrix ")
    A = np.zeros(matrix.shape)
    for i in range(len(matrix[:,0])):
        row_order_indices = matrix[i].argsort() 
        
        A[i] = row_order_indices[::-1]
        print(A[i])
    return A

def similarity_to_results(similarities_matrix, fname):
    sort_indices = reverse(similarities_matrix)

    results = {}
    for i in range(2000): # Each test
        top_20 = [
            str(np.where(sort_indices[i,:] == j)[0][0]) for j in range(1,21)]
        results[str(i)+'.txt'] = '.jpg '.join(top_20) + '.jpg'

    # Create Submission
    results = pd.DataFrame.from_dict(results, orient='index').reset_index()
    results.columns = ['Descritpion_ID','Top_20_Image_IDs']
    results.to_csv(fname, index=False,index_label=False)

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

#similarities = cosine_similarity(test_tag_matrix,test_tag_matrix)
#similarity_to_results(similarities, 'asdf.csv')

In [164]:
ResNet = dict()
for ty in ['test', 'train']:
    ResNet[ty] = pd.read_csv('./data/features_'+ty+'/features_resnet1000_'+ty+'.csv', header=None)
    ResNet[ty].columns = ['fnum'] + list(range(1000))
    ResNet[ty]['fnum'] = ResNet[ty]['fnum'].apply(lambda x: int(x.split('/')[-1].split('.')[0]))
    ResNet[ty].sort_values('fnum', inplace = True)
    ResNet[ty].set_index('fnum', inplace = True)


In [165]:
from sklearn.metrics.pairwise import euclidean_distances
similarities = euclidean_distances(ResNet['test'],ResNet['train'])

In [166]:
similarities

array([[  75.04344853,   94.1626282 ,   71.33721759, ...,   50.51155197,
          85.01679191,   94.45533879],
       [  77.45618823,   95.86880389,   74.72753812, ...,   87.09991256,
          93.79964856,   69.92481871],
       [  66.52308561,   86.16232339,   76.55427887, ...,   61.88877713,
          95.98391486,   97.90111006],
       ..., 
       [  81.52096313,  112.42863549,   75.42993525, ...,   91.17031008,
          76.36326334,   76.93333049],
       [  66.82019831,   97.68562696,   63.69096466, ...,   87.4590323 ,
          79.41847042,   58.55328624],
       [ 111.04941265,   60.98849448,  107.35137114, ...,   99.56504761,
         121.57378782,  114.41006284]])

In [167]:
token_vectors = pd.read_csv('tokens.csv')
token_vectors.set_index('Unnamed: 0', inplace=True)
test_token_vectors = pd.read_csv("./tokens_test.csv")
test_token_vectors.set_index('Unnamed: 0', inplace=True)

argsort_similarities = similarities.argsort()



In [168]:
first_elements = [i[0] for i in argsort_similarities]
first_elements = np.asarray(first_elements).astype(int)
first_elements

array([ 962, 6673, 4401, ..., 2852, 8491,  333])

In [169]:
test_train1N_matrix = token_vectors.iloc[first_elements]
test_train1N_matrix

Unnamed: 0_level_0,occupancy,normal,june,slump,memorial,sturdy,demolition,fogy,return,horrible,...,bedside,contemplate,higher,peep,onward,pretty,hopi,iron,mark,black
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
def cols_standard(train_vectors, test_vectors):
    diff_cols = [col for col in train_vectors.columns if col not in test_vectors.columns]
    for col in diff_cols:
        test_vectors[col] = 0

    test_vectors = test_vectors[train_vectors.columns]
    return test_vectors


#test_vector_similarities = cosine_similarity(test_token_vectors,test_train1N_matrix)
test_token_vectors = cols_standard(test_train1N_matrix,test_token_vectors)
test_token_vectors
test_train1N_matrix

Unnamed: 0_level_0,occupancy,normal,june,slump,memorial,sturdy,demolition,fogy,return,horrible,...,bedside,contemplate,higher,peep,onward,pretty,hopi,iron,mark,black
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [162]:
from sklearn.metrics.pairwise import euclidean_distances
test_vector_similarities = euclidean_distances(test_train1N_matrix,test_token_vectors)


similarity_to_results(test_vector_similarities,"basetest_euclidean.csv")


Reversing matrix 
[  483.  1289.   219. ...,  1255.  1257.     0.]
[  180.  1596.  1399. ...,  1127.  1128.     0.]
[ 1481.   979.   341. ...,   517.   516.  1401.]
[ 1128.  1611.   375. ...,  1101.  1104.     0.]
[  540.   277.   414. ...,  1289.   597.  1999.]
[  633.  1656.  1499. ...,  1150.  1153.     0.]
[   26.   768.  1531. ...,  1164.  1167.     0.]
[  964.  1473.   100. ...,   897.  1624.   999.]
[ 1926.   794.  1379. ...,  1286.  1287.  1999.]
[  445.   660.  1149. ...,   681.  1393.   999.]
[  984.   889.  1816. ...,  1041.  1038.     0.]
[ 1260.    81.  1037. ...,  1363.  1369.     0.]
[ 1835.  1843.  1512. ...,  1233.  1234.   999.]
[  120.   205.  1764. ...,  1741.   364.  1154.]
[  828.  1157.  1371. ...,  1211.   552.     0.]
[ 1837.  1085.  1664. ...,  1071.  1072.   999.]
[ 1061.  1188.   126. ...,  1147.  1149.     0.]
[ 1679.   300.  1068. ...,  1452.   764.  1569.]
[  540.  1933.   660. ...,   614.   613.  1548.]
[  236.   476.  1496. ...,   462.   903.     0.]
[ 

[ 1264.   984.   558. ...,   686.   682.   999.]
[  254.   117.   201. ...,  1132.  1133.     0.]
[  531.  1583.  1655. ...,   739.   738.   999.]
[  665.  1760.  1156. ...,  1122.  1123.     0.]
[  430.  1381.   585. ...,  1060.  1059.     0.]
[  599.     5.  1087. ...,  1452.   660.   999.]
[ 1332.  1136.  1749. ...,  1007.  1008.   999.]
[ 1148.  1637.   696. ...,  1094.  1095.     0.]
[ 1163.   474.   695. ...,  1229.   491.     0.]
[  683.  1765.   439. ...,   805.   803.  1444.]
[ 1146.  1230.   430. ...,  1341.   601.     0.]
[  518.  1909.  1821. ...,  1405.   696.     0.]
[ 1140.  1914.   201. ...,   619.   618.     0.]
[  760.   374.   477. ...,  1706.  1426.  1288.]
[ 1554.   976.  1062. ...,  1126.  1127.   999.]
[ 1062.  1554.  1896. ...,  1288.   535.   999.]
[  752.  1464.   740. ...,   644.  1038.  1402.]
[  600.  1349.  1397. ...,   763.   761.     0.]
[ 1272.  1778.  1637. ...,  1144.  1146.     0.]
[  447.  1943.  1859. ...,  1203.  1205.   999.]
[ 1912.   135.  1919

[  333.  1832.   495. ...,  1121.  1123.     0.]
[ 1028.   648.    93. ...,  1461.   641.   546.]
[  616.  1667.   966. ...,  1660.   926.   587.]
[  955.   677.   303. ...,  1560.   837.     0.]
[  290.  1024.  1628. ...,  1052.  1053.   999.]
[ 1681.  1967.   566. ...,  1199.   821.   720.]
[  206.  1485.  1902. ...,  1162.  1167.     0.]
[  4.87000000e+02   4.97000000e+02   1.00000000e+00 ...,   1.24300000e+03
   5.12000000e+02   0.00000000e+00]
[  117.  1972.    23. ...,  1275.  1280.     0.]
[ 1694.   844.   728. ...,   539.   537.   751.]
[ 1922.  1160.  1436. ...,  1831.  1070.   999.]
[ 1786.   538.    85. ...,   976.  1824.   999.]
[   96.    48.  1372. ...,  1420.   692.  1999.]
[ 1815.   826.  1389. ...,  1754.   817.   999.]
[ 1802.   552.    95. ...,  1364.   640.   999.]
[ 1984.   682.  1512. ...,  1850.   623.   999.]
[ 1333.   280.  1016. ...,   573.  1386.   999.]
[ 1085.   685.   240. ...,  1480.  1482.   512.]
[    3.   374.   434. ...,  1129.  1130.   999.]
[  981. 

[ 1624.  1383.  1598. ...,  1504.   659.   999.]
[ 1083.  1282.  1693. ...,  1026.  1024.   472.]
[  570.  1938.  1876. ...,   807.   798.     0.]
[ 1689.  1597.  1572. ...,   651.  1630.   999.]
[ 1681.  1967.   566. ...,  1199.   821.   720.]
[  151.  1512.   627. ...,  1306.   542.     0.]
[ 1583.  1777.  1129. ...,  1352.   638.     0.]
[  414.   277.   652. ...,  1074.  1076.     0.]
[  820.  1290.   822. ...,  1866.   640.   999.]
[ 1918.  1834.   282. ...,  1869.   847.     0.]
[ 1630.   873.  1260. ...,  1560.   516.  1555.]
[ 1042.   118.  1749. ...,  1636.   710.   999.]
[  139.  1473.   944. ...,  1177.  1179.   999.]
[  859.  1035.   259. ...,  1201.   433.  1022.]
[  221.  1955.    39. ...,   662.  1529.   999.]
[  168.  1252.  1225. ...,  1194.  1195.     0.]
[  506.   812.   492. ...,  1337.  1339.     0.]
[  652.   755.  1862. ...,  1130.  1132.     0.]
[  702.  1421.  1807. ...,  1239.  1241.   999.]
[ 1297.   687.   873. ...,   996.   991.     0.]
[  65.  275.  410. .

[ 287.  412.  737. ...,  338.  339.  454.]
[ 1817.  1654.  1792. ...,  1282.  1286.   999.]
[  237.  1426.   563. ...,  1371.   508.   845.]
[ 1082.   966.  1959. ...,  1190.  1191.     0.]
[  721.   626.  1512. ...,   603.   601.  1567.]
[ 1808.    36.   859. ...,  1303.  1806.  1528.]
[ 1806.  1019.   895. ...,   657.   656.   733.]
[ 1102.  1349.   600. ...,   497.   496.  1385.]
[  341.    40.   116. ...,  1197.  1199.  1331.]
[ 1426.  1678.   509. ...,   985.   984.     0.]
[ 1349.   500.   689. ...,   501.  1235.   999.]
[ 1058.  1484.  1111. ...,  1655.  1659.     0.]
[  666.  1443.   835. ...,   449.   448.     0.]
[  231.   125.  1720. ...,  1466.   755.   592.]
[ 1003.   295.    86. ...,  1364.   365.   698.]
[ 1543.   479.   189. ...,   996.   991.   999.]
[ 1507.    81.   517. ...,  1035.  1034.     0.]
[ 1676.  1739.    20. ...,  1148.  1150.   999.]
[ 1532.  1664.  1830. ...,  1549.   757.   631.]
[ 1408.   690.  1702. ...,   398.   399.     0.]
[ 868.  329.  435. ...,  7

[ 1967.  1263.  1938. ...,  1013.  1012.     0.]
[  430.   585.  1381. ...,  1636.  1040.  1682.]
[  277.  1149.  1129. ...,   605.   604.     0.]
[  509.  1901.  1006. ...,  1212.  1213.   999.]
[  826.   124.    80. ...,  1090.  1091.     0.]
[ 1058.  1484.  1366. ...,  1702.   544.  1306.]
[   96.    48.  1372. ...,  1420.   692.  1999.]
[ 984.  120.  193. ...,  482.  872.  999.]
[ 1376.  1349.  1687. ...,   795.   792.     0.]
[  917.   675.   505. ...,   522.  1296.  1412.]
[  271.    26.   627. ...,  1095.  1096.     0.]
[  387.  1298.   923. ...,   869.   870.   999.]
[ 1570.  1876.   758. ...,  1152.  1153.     0.]
[ 1996.   146.   245. ...,   647.  1390.   999.]
[  137.   955.   437. ...,  1301.   476.   999.]
[ 1872.  1972.   117. ...,  1118.  1119.     0.]
[  176.   396.   512. ...,   414.  1518.   656.]
[ 1487.  1688.  1901. ...,  1167.  1169.     0.]
[  631.  1336.  1544. ...,  1146.  1149.   999.]
[ 1087.   128.  1358. ...,   581.   579.   999.]
[  884.  1730.  1600. ...,