In [119]:
import matplotlib.pyplot as plt
import numpy as np

from skimage.feature import hog
from skimage import data, exposure
from skimage.color import rgb2gray
from skimage.io import imread
from skimage.transform import resize

from keras.preprocessing import image

import pandas as pd

# Hide all warnings
import warnings
warnings.filterwarnings('ignore')

import nltk

# Read the semicolon-separated tweet export (path is relative to the notebook).
df = pd.read_csv(
    "../visualstories_edfest_2016_twitter_xmedia.csv",
    sep=';',
    encoding="utf-8",
)

# Stack the two columns of interest into one array: row 0 comes from "text",
# row 1 from "image-url".
# NOTE(review): rebinding `data` here hides the `data` module imported from
# skimage earlier in this cell -- confirm nothing below still needs that module.
data = np.array([df.get(column).values for column in ("text", "image-url")])
tweets, images = data[0], data[1]

# Bad results
#from nltk.tokenize import TreebankWordTokenizer
#tknzr = TreebankWordTokenizer()

# Good results, but it has no option to remove links
#from nltk.tokenize import TweetTokenizer
#tknzr = TweetTokenizer(strip_handles=True, preserve_case=False)

# Very good results, probably because URLs and handles are removed
from tokenizer import tokenizer
# Tweet tokenizer configured not to preserve case, @handles or URLs.
tknzr = tokenizer.TweetTokenizer(
    preserve_case=False,
    preserve_handles=False,
    preserve_url=False,
)

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the tweets, tokenised with the tweet tokenizer
# built above; min_df=3 is passed so that rarely seen terms are left out.
vectorizer = CountVectorizer(
    tokenizer=tknzr.tokenize,
    stop_words="english",
    min_df=3,
    binary=False,
)

# Fit on the whole corpus and keep the resulting document-term matrix.
texts_bow = vectorizer.fit_transform(tweets)

# Term -> column-index mapping learned during fitting.
vocabulary = vectorizer.vocabulary_
print("Vocabulary size: {}".format(len(vocabulary)))

from sklearn.metrics import pairwise_distances

def k_neighbours(q, X, metric="euclidean", k=10):
    """Find the ``k`` rows of ``X`` that are closest to the query ``q``.

    Parameters
    ----------
    q : array-like or sparse matrix, shape (n_queries, n_features)
        Query vector(s). The usual case is a single row.
    X : array-like or sparse matrix, shape (n_docs, n_features)
        Collection to search.
    metric : str, default "euclidean"
        Forwarded unchanged to ``pairwise_distances``.
    k : int, default 10
        Maximum number of neighbours to return; fewer are returned when
        ``X`` has fewer than ``k`` rows.

    Returns
    -------
    (indexes, distances) : tuple of numpy arrays
        Row indexes into ``X`` and the matching distances, ordered from the
        nearest to the farthest. For a single query both arrays are 1-D;
        for several queries they are 2-D with one row per query.
    """
    # Check pairwise_distances function docs: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html#sklearn.metrics.pairwise_distances
    dists = np.asarray(pairwise_distances(q, X, metric=metric))

    if dists.shape[0] == 1:
        # dists has shape 1 x NumDocs: select the query row explicitly.
        # np.squeeze would also drop the document axis when X has a single
        # row, leaving a 0-d array that cannot be indexed afterwards.
        row = dists[0]
        nearest = np.argsort(row)[:k]
        return nearest, row[nearest]

    # Several queries: rank the documents independently for every query row.
    nearest = np.argsort(dists, axis=1)[:, :k]
    return nearest, np.take_along_axis(dists, nearest, axis=1)

from sklearn.preprocessing import normalize
# L2-normalise every document vector in the bag-of-words matrix.
texts_bow = normalize(texts_bow, norm="l2")

query1 = "Street circus a popular attractions at Edfest attracting several artists such as unicycle juggling."
query = query1

# Map the query into the same BoW space and normalise it the same way.
query_bow = normalize(vectorizer.transform([query]), norm="l2")

# Ten nearest tweets to the query under cosine distance.
k_nearest_indexes, k_nearest_dists = k_neighbours(
    q=query_bow,
    X=texts_bow,
    metric="cosine",
    k=10,
)

# One (index, distance, tweet text, image URL) tuple per neighbour, nearest first.
[
    (idx, dist, tweets[idx], images[idx])
    for idx, dist in zip(k_nearest_indexes, k_nearest_dists)
]

Vocabulary size: 1008


[(471,
  0.43305329048615904,
  '#edfringe 2016 Utilising the street. Mime artist. pic.twitter.com/tG00OdQXGv',
  'https://pbs.twimg.com/media/CqVDTy8WgAAIVnQ.jpg'),
 (707,
  0.5,
  'My wanderings took me to Circus Lane today.  #edinphoto #edfringe pic.twitter.com/R0IqB0RD4q',
  'https://pbs.twimg.com/media/CpsHUAPWAAA72xw.jpg'),
 (473,
  0.525658350974743,
  '#edfringe 2016 A mime artist working the street. Talented guy. pic.twitter.com/s7hBCdq7kt',
  'https://pbs.twimg.com/media/CqU749nXEAIr2YU.jpg'),
 (1363,
  0.525658350974743,
  '@BryanCJewett awesome set for EDFest man. Enjoyed every minute.  pic.twitter.com/oQVVd7csji',
  'https://pbs.twimg.com/media/CzTVr3KWEAEyBk2.jpg'),
 (609,
  0.525658350974743,
  'Brass Monkey. Booze. On beds. #edfringe pic.twitter.com/wCkoJ73Fpi',
  'https://pbs.twimg.com/media/CpbfJrzWcAE8kxe.jpg'),
 (67,
  0.5477329831333546,
  'Greasiest greasy spoon ever. Literally dripping. Noms. #edfringe pic.twitter.com/c6afWwlSUd',
  'https://pbs.twimg.com/media/C