## Text normalization

In [37]:
import nltk
import re
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the WordNet corpus backs
# WordNetLemmatizer, and the 'punkt' models back nltk.word_tokenize.
nltk.download('wordnet')
nltk.download('punkt')

# Single lemmatizer instance shared by every pre_process() call.
word_net_lemmatizer = WordNetLemmatizer()

# TODO: normalization is deliberately simple and should become more sophisticated.
def pre_process(text):
    """Normalize raw task text into a space-separated string of lemmas.

    Steps, in order: lowercase, replace anything that looks like a markup
    tag, collapse every run of digits and non-word characters into a single
    space, tokenize with NLTK, and lemmatize each token with WordNet.

    Args:
        text: Raw text. ``None`` and float NaN (the usual missing-value
            marker in a pandas column) are treated as empty text.

    Returns:
        The lemmatized tokens joined by single spaces, or ``""`` when the
        input is missing or nothing but digits/non-word characters remains.
    """
    # Missing values would otherwise crash on ``.lower()``. NaN is the only
    # float that is not equal to itself, so ``text != text`` detects it
    # without any extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = text.lower()

    # Replace tags with a placeholder made only of non-word characters so the
    # words on either side stay separated; the next substitution removes it.
    text = re.sub(r"</?.*?>", " <> ", text)

    # Collapse runs of digits / non-word characters into one space.
    text = re.sub(r"(\d|\W)+", " ", text)

    # Nothing left to tokenize.
    if not text.strip():
        return ""

    lemmas = (word_net_lemmatizer.lemmatize(token)
              for token in nltk.word_tokenize(text))
    return " ".join(lemmas)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Database creation/preprocessing

In [38]:
import pandas as pd

# Load the task dump (read with lines=True, i.e. one JSON object per line) and
# expose the raw task body under the column name 'text'. The rename is chained
# rather than done in place, so re-running the cell always starts from the file.
tasks_path = 'data/photo_youdo.json'
df = pd.read_json(tasks_path, lines=True).rename(columns={'body': 'text'})

# Normalized text; this is the column the TF-IDF index is fitted on.
df['normalized'] = df['text'].apply(pre_process)

In [39]:
# Preview the first 10 rows to compare raw and normalized text side by side.
df.head(10)

Unnamed: 0,text,normalized
0,Wedding photographer\nNeed a photographer for ...,wedding photographer need a photographer for a...
1,Treatment youtube video\nGood afternoon. It is...,treatment youtube video good afternoon it is n...
2,Need a videographer\nTo remove the backstage p...,need a videographer to remove the backstage ph...
3,Looking for a photographer for family photogra...,looking for a photographer for family photogra...
4,Mount video 3.5 min\nTotal duration 3 min 30 s...,mount video min total duration min sec there a...
5,Subject photography\nNeed to sell photos for t...,subject photography need to sell photo for the...
6,Photography\nPhotographing models for my Insta...,photography photographing model for my instagr...
7,Edit video\nYou need to remove the logo to the...,edit video you need to remove the logo to the ...
8,To make a video review\nYou need to make a vid...,to make a video review you need to make a vide...
9,Need to do a video\nTo film the video,need to do a video to film the video


# Create TF-IDF based index + fetch tasks

In [40]:
import pysparnn.cluster_index as ci
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


class TasksIndex():
    """Similarity index over the TF-IDF vectors of pre-processed task texts."""

    def __init__(self, df):
        """Fit the count/TF-IDF models on ``df['normalized']`` and index the result.

        Args:
            df: DataFrame with a 'normalized' column (pre-processed text that
                gets indexed) and a 'text' column (original text returned by
                ``get_similar_tasks``).
        """
        self.df = df

        nltk.download('stopwords')
        # Recent scikit-learn releases validate ``stop_words`` as 'english',
        # a list or None and reject a set, so pass a (de-duplicated) list.
        stop_words = sorted(set(stopwords.words('english')))

        # Bag-of-words model; kept so queries are vectorized with the same vocabulary.
        self.cv = CountVectorizer(max_df=0.85, stop_words=stop_words, max_features=10000)
        docs = self.df['normalized'].tolist()
        word_count_vector = self.cv.fit_transform(docs)

        # TF-IDF weighting fitted on the corpus counts; reused for queries.
        self.tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        features_vec = self.tfidf_transformer.fit_transform(word_count_vector)

        # Each record is the positional row number of its document, so search
        # results can be resolved back to rows with ``df.iloc``.
        self.index = ci.MultiClusterIndex(features_vec, list(range(len(docs))))

    def get_similar_tasks(self, message, k_tasks=10, k_clusters=2):
        """Return the original texts of the tasks most similar to ``message``.

        Args:
            message: Free-form query text; it goes through ``pre_process``
                before being vectorized.
            k_tasks: Maximum number of tasks to return.
            k_clusters: Forwarded to ``MultiClusterIndex.search`` as
                ``k_clusters``.

        Returns:
            List of 'text' values in the order returned by the index search.
            Empty when the query shares no term with the fitted vocabulary.
        """
        message = pre_process(message)
        ftrs_vec = self.tfidf_transformer.transform(self.cv.transform([message]))

        # An all-zero query vector has no direction, so its similarity to any
        # document is undefined; report "no matches" instead of arbitrary rows.
        if ftrs_vec.nnz == 0:
            return []

        matches = self.index.search(
            ftrs_vec, k=k_tasks, k_clusters=k_clusters, return_distance=False
        )[0]
        # Convert the returned records back to plain ints before positional lookup.
        positions = [int(match) for match in matches]

        return self.df.iloc[positions]['text'].tolist()

In [41]:
# Example: build the index over all tasks, then print the tasks closest to a
# free-form request, each one followed by a blank line.
index = TasksIndex(df)
similar_tasks = index.get_similar_tasks('Help me! I love pets and I want to photo some dogs or cats')
for t in similar_tasks:
    print(t, end='\n\n')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The photo of the dog
To make beautiful photos of dogs

Cat rental
Good day! For the photo shoot in-Studio ( Moscow) required cat/cat not too large in size. To remove...

Photography of interiors with the cat
There is a small exhibition of furniture, 9 rooms. Need to take photos with the cat (or cats) in these interiors...

Photo shoot of dogs on the street
Photoshoot dogs

Photo shoot with a dog
I have a dog, I need a good staged photo of the dog and General. Perhaps the us will my...

To make the video the dog was talking
Need to make a video where the dog is removed and another video with the cat. ... so that they Move their mouth...

Dog photos
Need a photographer who can photograph dogs. Need photo 7 dogs for pristroystvo. Better on his car...

Rent dogs for video shooting
You need 2 dogs 1 breed Small and big dog to shoot for a blogger in instagram

Need a chubby cat for a couple of photos
I(we) the fat cat;) photo needed for the infopovod

Photography
Want a photo session love st

  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)
  magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)


# Other features: top TF-IDF keywords of a task

In [16]:
def sort_COO(COO_matrix):
    """Return the matrix's ``(col, value)`` pairs ordered from highest to lowest value.

    Pairs with equal values are ordered by descending column index.

    Args:
        COO_matrix: Object exposing parallel ``col`` and ``data`` sequences
            (the notebook passes the result of ``.tocoo()``).

    Returns:
        A list of ``(col, value)`` tuples.
    """
    def _value_then_column(entry):
        column, value = entry
        return value, column

    entries = list(zip(COO_matrix.col, COO_matrix.data))
    entries.sort(key=_value_then_column, reverse=True)
    return entries

def topn(sorted_tuples, feature_names, topn):
    """Map the first ``topn`` entries to ``{feature name: value}``.

    Args:
        sorted_tuples: Sliceable sequence of ``(feature index, value)`` pairs,
            already in the desired order.
        feature_names: Indexable lookup from feature index to feature name.
        topn: Maximum number of leading entries to keep.

    Returns:
        A dict in input order; if a feature name repeats, the last value wins.
    """
    return {
        feature_names[feature_idx]: score
        for feature_idx, score in sorted_tuples[:topn]
    }

In [17]:
# `docs` is only a local variable inside TasksIndex.__init__, so rebuild it
# here to keep this cell runnable on a fresh kernel.
docs = df['normalized'].tolist()
docs[0]

'wedding photographer need a photographer for a wedding february from to or hour charge the bride check the south west of moscow'

In [18]:
# No visible cell defines `cv` or `tfidf_vector_0`, so derive both from the
# fitted index: the vectorizer it holds, and the TF-IDF vector of the first task.
cv = index.cv
tfidf_vector_0 = index.tfidf_transformer.transform(
    cv.transform([df['normalized'].iloc[0]])
)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Ten highest-weighted terms of the first task.
sorted_tuples = sort_COO(tfidf_vector_0.tocoo())
keywords = topn(sorted_tuples, feature_names, 10)

In [19]:
# Show each keyword next to its weight, in the order stored in `keywords`.
for term, weight in keywords.items():
    print(term, weight)

west 0.408918139910497
wedding 0.39445350703255283
charge 0.382671737159973
south 0.36147329378014714
check 0.31765348550026157
february 0.3009251614760709
bride 0.2997670593120834
moscow 0.2146019947571679
photographer 0.20567555262653536
hour 0.15589967180316164
