In [2]:
import pandas as pd
import re

In [3]:
# Load the dataset from a JSON Lines file (lines=True: one JSON record per line).
df_idf = pd.read_json("data/photo_youdo.json", lines=True)

In [6]:
# Preview the first 10 rows of the loaded frame.
df_idf.head(10)

Unnamed: 0,body
0,Wedding photographer\nNeed a photographer for ...
1,Treatment youtube video\nGood afternoon. It is...
2,Need a videographer\nTo remove the backstage p...
3,Looking for a photographer for family photogra...
4,Mount video 3.5 min\nTotal duration 3 min 30 s...
5,Subject photography\nNeed to sell photos for t...
6,Photography\nPhotographing models for my Insta...
7,Edit video\nYou need to remove the logo to the...
8,To make a video review\nYou need to make a vid...
9,Need to do a video\nTo film the video


In [8]:
# TODO: this cleaning is deliberately minimal and should be made more sophisticated.
def pre_process(text):
    """Normalise a raw posting into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the whole string;
      2. replace anything that looks like an HTML/XML tag with " <> ";
      3. collapse every run of digits and non-word characters into one space.

    Step 3 also removes the " <> " placeholders from step 2, so tags end up
    as plain word separators. Underscores count as word characters and are
    kept.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. It may start or end with a single space when the
        input started or ended with digits or punctuation.
    """
    lowered = text.lower()
    without_tags = re.sub("</?.*?>", " <> ", lowered)
    return re.sub("(\\d|\\W)+", " ", without_tags)

In [9]:
# Derive the cleaned 'text' column directly from the raw 'body' column.
df_idf['text'] = df_idf['body'].apply(pre_process)

In [12]:
# Inspect the cleaned text of the row labelled 0.
df_idf['text'][0]

'wedding photographer need a photographer for a wedding february from to or hours charges the bride check the south west of moscow '

In [13]:
def get_stopwords(stop_words_file):
    """Load stop words from a UTF-8 text file containing one word per line.

    Leading and trailing whitespace is stripped from every line. Blank and
    whitespace-only lines are skipped, so the empty string never ends up in
    the stop-word set.

    Args:
        stop_words_file: path of the stop-word file.

    Returns:
        A frozenset of the non-empty, stripped lines.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(stop_words_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        stripped = (line.strip() for line in f)
        return frozenset(word for word in stripped if word)
    
# Load the stop-word set that is handed to CountVectorizer further down.
stopwords = get_stopwords("data/stopwords/stopwords.txt")    

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus as a plain list of cleaned strings, one per posting.
docs = df_idf['text'].tolist()

# scikit-learn documents `stop_words` as 'english', a list, or None, and
# recent releases validate the type, so pass the stop-word set as a list.
# max_df=0.85 drops terms that occur in more than 85% of the documents;
# max_features caps the vocabulary at the 10000 most frequent terms.
cv = CountVectorizer(max_df=0.85, stop_words=list(stopwords), max_features=10000)

# Learn the vocabulary and build the document-term count matrix in one pass.
word_count_vector = cv.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [16]:
# Shape of the count matrix: (number of documents, vocabulary size).
word_count_vector.shape

(9000, 7895)

In [17]:
# Peek at ten vocabulary terms (iterating the mapping yields its keys).
list(cv.vocabulary_)[:10]

['wedding',
 'photographer',
 'need',
 'february',
 'hours',
 'charges',
 'bride',
 'check',
 'south',
 'west']

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn smoothed IDF weights from the document-term counts; fit() returns the
# transformer itself, so construction and fitting can be chained.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True).fit(word_count_vector)
tfidf_transformer



TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [49]:
# Display the TF-IDF matrix for the whole corpus; the result is not stored by this cell.
tfidf_transformer.transform(word_count_vector)

<9000x7895 sparse matrix of type '<class 'numpy.float64'>'
	with 88441 stored elements in Compressed Sparse Row format>

In [20]:
# TF-IDF vector of the first document only (a one-row matrix).
tfidf_vector_0 = tfidf_transformer.transform(cv.transform([docs[0]]))

In [50]:
# TF-IDF features for every document; the pysparnn index below is built from this matrix.
features_vec = tfidf_transformer.transform(word_count_vector)

In [22]:
def sort_COO(COO_matrix):
    """Return the matrix's (column, value) pairs, strongest value first.

    Args:
        COO_matrix: an object exposing parallel `col` and `data` sequences
            (for example a SciPy COO sparse matrix).

    Returns:
        A list of (col, value) tuples sorted by value in descending order;
        equal values are ordered by descending column index.
    """
    pairs = list(zip(COO_matrix.col, COO_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def topn(sorted_tuples, feature_names, topn):
    """Map the first `topn` (index, value) pairs to {feature name: value}.

    Args:
        sorted_tuples: sequence of (feature index, value) pairs, already in
            the desired order (for example the output of `sort_COO`).
        feature_names: indexable collection giving the name of each feature
            index.
        topn: maximum number of leading pairs to keep.

    Returns:
        A dict in the same order as the input pairs. If a feature name occurs
        more than once, the later value replaces the earlier one.
    """
    return {feature_names[col]: score for col, score in sorted_tuples[:topn]}

In [23]:
# Show the cleaned text of the first document, for comparison with its keywords.
docs[0]

'wedding photographer need a photographer for a wedding february from to or hours charges the bride check the south west of moscow '

In [24]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version
# provides it and fall back to the old method otherwise.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Rank the first document's non-zero TF-IDF entries and keep the 10 strongest.
sorted_tuples = sort_COO(tfidf_vector_0.tocoo())
keywords = topn(sorted_tuples, feature_names, 10)

In [25]:
# Print each keyword with its TF-IDF score, in ranking order.
for k, score in keywords.items():
    print(k, score)

west 0.4042789949989151
charges 0.393932125493509
wedding 0.3912576182518272
south 0.35737240700637873
check 0.31404973108812584
february 0.2975111886160402
bride 0.2975111886160402
moscow 0.2121673515979993
photographer 0.20474498442274677
hours 0.17884618665707813


In [66]:
# TF-IDF vector for an ad-hoc free-text query, built with the fitted vectorizer and transformer.
tfidf_vector_1 = tfidf_transformer.transform(cv.transform(['wanna photo pets dogs or cats']))

In [60]:
# Display the raw count vector of a short query; the result is not stored by this cell.
cv.transform(['wanna photo family'])

<1x7895 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [40]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the first document and the ad-hoc query vector.
# NOTE(review): `cossim` is never displayed or used in the visible cells — confirm it is still needed.
cossim = cosine_similarity(tfidf_vector_0, tfidf_vector_1)

In [54]:
# Inspect the sparse query vector.
# NOTE(review): the recorded output reports 17 stored elements, which looks stale
# for the six-word query assigned above (execution counts are out of order) —
# restart the kernel and re-run to confirm.
tfidf_vector_1

<1x7895 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [55]:
# Show the cleaned text of the document at position 3.
docs[3]

'looking for a photographer for family photography two adults and two children the ability to lure children in priority at the end of the week will be in moscow i would like to find a good family photographer for outdoor photo shoot '

In [67]:
import pysparnn.cluster_index as ci

# Build a pysparnn cluster index over the corpus TF-IDF matrix, using the
# cleaned document strings as the records returned with each hit.
cp = ci.MultiClusterIndex(features_vec, docs)


# Retrieve the 20 nearest documents to the query vector, with their distances.
# NOTE(review): the recorded run emits a warning from a `1.0 / (...)` magnitude
# computation inside pysparnn — presumably some vectors have zero norm (e.g.
# documents left with no vocabulary terms); verify and filter them if so.
cp.search(tfidf_vector_1, k=20, k_clusters=2, return_distance=True)

  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)
  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)


[[('0.5277288258977926', 'photo shoot of dogs on the street photoshoot dogs'),
  ('0.630075843587516',
   'dog photos need a photographer who can photograph dogs need photo dogs for pristroystvo better on his car '),
  ('0.6564190536454169',
   'the photo of the dog to make beautiful photos of dogs'),
  ('0.6801762772865048',
   'rent dogs for video shooting you need dogs breed small and big dog to shoot for a blogger in instagram'),
  ('0.6940001958555815',
   'need photographer for a photoshoot of dogs need a photographer for a photo shoot puppies just dogs standing on the table changes require photography for advertising '),
  ('0.7308651857159978',
   'you photoshop the picture there is a picture where you need to replace all of the face or faces of cats'),
  ('0.7383320005284966',
   'installation video collect clips funny or cute how to play pets no longer than seconds '),
  ('0.7383763769613988',
   'to shoot and mount the video to shoot event with dogs to mount the roller for a