In [4]:
import pandas as pd
import re

In [5]:
# Load the dataset; lines=True makes pandas read the file as JSON Lines
# (one JSON record per line) instead of a single JSON document.
df_idf = pd.read_json("data/photo_youdo.json", lines=True)

In [6]:
# Preview the first 10 rows to check what was loaded.
df_idf.head(10)

Unnamed: 0,body
0,Wedding photographer\nNeed a photographer for ...
1,Treatment youtube video\nGood afternoon. It is...
2,Need a videographer\nTo remove the backstage p...
3,Looking for a photographer for family photogra...
4,Mount video 3.5 min\nTotal duration 3 min 30 s...
5,Subject photography\nNeed to sell photos for t...
6,Photography\nPhotographing models for my Insta...
7,Edit video\nYou need to remove the logo to the...
8,To make a video review\nYou need to make a vid...
9,Need to do a video\nTo film the video


## Text normalization

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by pre_process: 'wordnet' backs
# WordNetLemmatizer and 'punkt' backs nltk.word_tokenize.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('wordnet')
nltk.download('punkt')

# One shared lemmatizer instance, reused by pre_process for every document.
word_net_lemmatizer = WordNetLemmatizer()

# should be more sophisticated
def pre_process(text):
    """Normalize a raw document into a string of lowercase, lemmatized tokens.

    Steps: lowercase -> drop markup tags -> collapse every run of digits and
    non-word characters into a single space -> tokenize with NLTK ->
    lemmatize each token with the module-level ``word_net_lemmatizer`` ->
    join the lemmas with single spaces.

    Args:
        text (str): Raw document text.

    Returns:
        str: Space-separated lemmas; an empty string if no token survives.
    """
    # lowercase
    text = text.lower()

    # remove tags; the match is non-greedy so every <...> pair is handled on
    # its own, and the space keeps the words around a tag from being glued
    # together (any surplus whitespace is collapsed by the next substitution)
    text = re.sub("</?.*?>", " ", text)

    # remove special characters and digits ("_" is a word character for \W,
    # so underscores are kept)
    text = re.sub("(\\d|\\W)+", " ", text)

    # NOTE(review): lemmatize() is called without a part-of-speech tag, and
    # the saved search output contains tokens such as "wa" — presumably "was"
    # treated as a plural noun. Confirm whether POS-aware lemmatization is
    # wanted.
    lemmas = (
        word_net_lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
    )
    return " ".join(lemmas)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Derive the normalized 'text' column directly from the raw 'body' column.
df_idf['text'] = df_idf['body'].apply(pre_process)

In [8]:
# Spot-check the normalized text of the row labelled 0.
df_idf['text'][0]

'wedding photographer need a photographer for a wedding february from to or hour charge the bride check the south west of moscow'

## Stop words

In [9]:
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set.
# NOTE(review): documents are lemmatized by pre_process but this list is not,
# so lemmatized forms of stop words (e.g. the "wa" visible in the search
# results) may slip through the filter — confirm whether that is intended.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))   

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

docs = df_idf['text'].tolist()

# CountVectorizer documents stop_words as 'english', a list, or None, and
# recent scikit-learn releases reject a set during parameter validation, so
# hand it a list (sorted only to make the argument deterministic).
# max_df=0.85 ignores terms found in more than 85% of the documents;
# max_features=10000 caps the vocabulary at the most frequent terms.
cv = CountVectorizer(max_df=0.85, stop_words=sorted(stop_words), max_features=10000)

# Learn the vocabulary and build the document-term count matrix in one call.
word_count_vector = cv.fit_transform(docs)

In [11]:
# (number of documents, vocabulary size) of the count matrix.
word_count_vector.shape

(9000, 7138)

In [12]:
# Peek at ten of the learned vocabulary terms.
list(cv.vocabulary_.keys())[:10]

['wedding',
 'photographer',
 'need',
 'february',
 'hour',
 'charge',
 'bride',
 'check',
 'south',
 'west']

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the corpus count matrix. smooth_idf and use_idf
# are spelled out explicitly; the repr printed by fit() also lists norm='l2'
# and sublinear_tf=False.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)



TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [14]:
# Display-only check of the TF-IDF matrix; the result is not stored here
# (features_vec is assigned from the same call in a later cell).
tfidf_transformer.transform(word_count_vector)

<9000x7138 sparse matrix of type '<class 'numpy.float64'>'
	with 91974 stored elements in Compressed Sparse Row format>

In [15]:
# TF-IDF vector of document 0 alone; used below for keyword extraction and
# for the cosine-similarity check against the query.
tfidf_vector_0 = tfidf_transformer.transform(cv.transform([docs[0]]))

In [16]:
# TF-IDF features for the whole corpus; the search index is built from these.
features_vec = tfidf_transformer.transform(word_count_vector)

In [17]:
def sort_COO(COO_matrix):
    """Return the matrix entries as (column, value) pairs, largest value first.

    Pairs with equal values are ordered by descending column index.

    Args:
        COO_matrix: Object exposing parallel ``col`` and ``data`` sequences
            (e.g. the result of ``.tocoo()`` on a sparse matrix).

    Returns:
        list: (column index, value) tuples in descending order.
    """
    pairs = list(zip(COO_matrix.col, COO_matrix.data))
    # Primary key is the value, secondary key the column index.
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def topn(sorted_tuples, feature_names, topn):
    """Map the first ``topn`` (column index, score) pairs to {feature: score}.

    Args:
        sorted_tuples: Sequence of (column index, score) pairs, already
            ordered best-first (e.g. the output of ``sort_COO``).
        feature_names: Indexable collection giving the term for each column
            index.
        topn: Maximum number of pairs to keep.

    Returns:
        dict: Feature name -> score, in the same order as the input pairs.
        Fewer than ``topn`` entries are returned when the input is shorter.
    """
    # A single pass replaces the former parallel lists plus index loop.
    return {feature_names[idx]: value for idx, value in sorted_tuples[:topn]}

In [18]:
# The document whose keywords are extracted in the following cells.
docs[0]

'wedding photographer need a photographer for a wedding february from to or hour charge the bride check the south west of moscow'

In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); use whichever the installed version
# provides so the cell runs on both old and new releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Rank the terms of document 0 by TF-IDF weight and keep the ten strongest.
sorted_tuples = sort_COO(tfidf_vector_0.tocoo())
keywords = topn(sorted_tuples, feature_names, 10)

In [20]:
# Print each extracted keyword next to its TF-IDF score.
for k in keywords:
    print(k, keywords[k])

west 0.408918139910497
wedding 0.39445350703255283
charge 0.382671737159973
south 0.36147329378014714
check 0.31765348550026157
february 0.3009251614760709
bride 0.2997670593120834
moscow 0.2146019947571679
photographer 0.20567555262653536
hour 0.15589967180316164


In [34]:
# Vectorize an ad-hoc query with the same steps as the corpus
# (pre_process -> CountVectorizer -> TF-IDF).
# NOTE(review): this cell's execution count (34) is higher than those of the
# cells that follow, so their saved outputs may come from an earlier
# definition of tfidf_vector_1 — restart and run all to confirm.
tfidf_vector_1 = tfidf_transformer.transform(cv.transform([pre_process('wanna photos of pets dogs or cats')]))

In [22]:
# Raw term counts for a short probe query. The output reports 2 stored
# elements for 3 tokens, so one token is presumably not in the fitted
# vocabulary.
cv.transform(['wanna photo family'])

<1x7138 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between document 0 and the query vector.
# NOTE(review): the value is assigned but never displayed in the visible cells.
cossim = cosine_similarity(tfidf_vector_0, tfidf_vector_1)

In [24]:
# Inspect the query vector's sparsity.
# NOTE(review): the saved output shows a single stored element; this cell ran
# as In [24], before the query was (re)defined in In [34], so the output
# presumably belongs to an earlier query — re-run to refresh.
tfidf_vector_1

<1x7138 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [25]:
# Show the normalized text of document 3 for manual inspection.
docs[3]

'looking for a photographer for family photography two adult and two child the ability to lure child in priority at the end of the week will be in moscow i would like to find a good family photographer for outdoor photo shoot'

In [35]:
import pysparnn.cluster_index as ci

# Build a PySparNN cluster index over the corpus TF-IDF rows; docs is passed
# alongside so that search hits come back as the document texts.
cp = ci.MultiClusterIndex(features_vec, docs)


# Query the index for the 20 nearest documents to the query vector. With
# return_distance=True each hit is a (distance, document) pair, as the output
# shows.
# NOTE(review): the "magnitude = 1.0 / ..." lines in the output look like a
# divide warning, presumably triggered by all-zero rows — confirm and consider
# dropping empty documents before indexing.
cp.search(tfidf_vector_1, k=20, k_clusters=2, return_distance=True)

  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)
  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)


[[('0.576839148229161', 'the photo of the dog to make beautiful photo of dog'),
  ('0.5823115366911087',
   'cat rental good day for the photo shoot in studio moscow required cat cat not too large in size to remove'),
  ('0.5864217057915952',
   'photography of interior with the cat there is a small exhibition of furniture room need to take photo with the cat or cat in these interior'),
  ('0.6015794410757642', 'photo shoot of dog on the street photoshoot dog'),
  ('0.6139851767848199',
   'photo shoot with a dog i have a dog i need a good staged photo of the dog and general perhaps the u will my'),
  ('0.6143858362647693',
   'to make the video the dog wa talking need to make a video where the dog is removed and another video with the cat so that they move their mouth'),
  ('0.6231851318489131',
   'dog photo need a photographer who can photograph dog need photo dog for pristroystvo better on his car'),
  ('0.6592526530422305',
   'rent dog for video shooting you need dog breed small 