In [1]:
# The wildcard import stays first so that every explicit import below takes
# precedence over any name it happens to bring into the namespace.
from nltk.stem.snowball import *

import io
import operator
import re
import sys
from collections import Counter

import cyrtranslit
import numpy as np
from fuzzywuzzy import fuzz
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
def removePunctuation(text):
    """Lower-case ``text`` and keep only Latin letters, Russian letters and spaces.

    Every other character (digits, punctuation, '+', tabs, ...) is deleted
    outright, not replaced by a space, and leading/trailing whitespace is
    stripped from the result.

    The Russian letter "ё" is listed explicitly: it sits outside the
    contiguous ``а-я`` code-point range, so a bare ``а-я`` class silently
    deletes it from words.

    :param text: text string (``unicode`` on Python 2, ``str`` on Python 3).
    :return: the cleaned, lower-cased string.
    """
    # The input is lower-cased first, so only lower-case ranges are needed.
    # A plain u'' literal (the pattern has no backslashes) is valid on both
    # Python 2 and Python 3, unlike the ur'' prefix.
    return re.sub(u'[^a-zа-яё ]', u'', text.lower()).strip()

In [3]:
def translit(s):
    """Best-effort transliteration of ``s`` to Russian Cyrillic.

    Calls ``cyrtranslit.to_cyrillic(s, "ru")``. If the call or the decoding
    fails, the input is returned unchanged so that one odd token cannot abort
    the whole pipeline.

    :param s: the token to transliterate.
    :return: the transliterated text, or ``s`` itself on failure.
    """
    try:
        converted = cyrtranslit.to_cyrillic(s, "ru")
        # Only decode when a byte string came back. Calling .decode() on a
        # value that is already text raises, and the fallback below would then
        # silently hand back the *untransliterated* input.
        # NOTE(review): whether to_cyrillic returns bytes or text presumably
        # depends on the cyrtranslit/Python version in use - confirm.
        if isinstance(converted, bytes):
            converted = converted.decode('utf-8')
        return converted
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except:`` this
        # lets KeyboardInterrupt and SystemExit propagate.
        return s

In [None]:
def get_closest_words(query, sorted_f, threshold=78):
    """Map each token of ``query`` to the first sufficiently similar known word.

    For every token, ``sorted_f`` is scanned in order and the first entry
    whose word has ``fuzz.ratio(token, word)`` strictly greater than
    ``threshold`` is taken. The order of ``sorted_f`` therefore decides which
    candidate wins (this notebook passes words sorted by descending
    frequency). Tokens without any match are dropped, so the result may be
    shorter than ``query``.

    :param query: iterable of tokens.
    :param sorted_f: sequence of entries whose first element is a candidate
        word (this notebook passes ``(word, frequency)`` pairs).
    :param threshold: similarity score a candidate must exceed; the default
        of 78 is the cutoff that was previously hard-coded.
    :return: list of matched words, in token order.
    """
    matches = []
    for token in query:
        # next() stops at the first qualifying candidate, like the old `break`.
        best = next(
            (entry[0] for entry in sorted_f
             if fuzz.ratio(token, entry[0]) > threshold),
            None,
        )
        if best is not None:
            matches.append(best)
    return matches

In [4]:
import csv  # not used by this cell; kept in case other cells rely on it

# Read the labelled queries as text. io.open decodes UTF-8 while reading on
# both Python 2 and Python 3, so no per-field .decode() is needed afterwards.
with io.open('queries.csv', 'r', encoding='utf-8') as f:
    rows = f.readlines()[1:]  # the first line is skipped as a header

# Field 0 of each line is the query text, field 1 is parsed as an integer
# label. Blank lines are skipped instead of failing on the missing field.
# NOTE(review): a query that itself contains a comma would still be split
# here - confirm the file never contains one.
parsed = [line.split(',') for line in rows if line.strip()]
y = [int(fields[1]) for fields in parsed]

# '+' has to become a space BEFORE punctuation is stripped: removePunctuation
# deletes '+', so replacing it afterwards does nothing and words joined by '+'
# end up glued together.
queries = [removePunctuation(fields[0].replace('+', ' ')) for fields in parsed]
data = list(queries)

In [5]:
# Tokenise every cleaned query on single spaces, stem each token with the
# Russian Snowball stemmer and pass the stem through translit().
# new_data holds one list of normalised tokens per query, in query order.
stemmer = RussianStemmer(ignore_stopwords=True)
new_data = [
    [translit(stemmer.stem(token)) for token in text.split(' ')]
    for text in data
]

In [6]:
# Flatten the per-query token lists into one long list of tokens.
all_words = [item for sublist in new_data for item in sublist]
unique_words = list(set(all_words))

# Count every token in a single pass instead of calling all_words.count()
# once per unique word, which rescans the full list each time.
word_counts = Counter(all_words)

# Score = occurrences divided by the number of *distinct* words (the same
# normalisation as before; only the relative order is used further down).
vocab_size = float(len(unique_words))
frequency = {word: word_counts[word] / vocab_size for word in unique_words}

# (word, score) pairs, highest score first.
sorted_frequency = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)

In [8]:
# Replace the tokens of every query with their closest known words;
# f_data keeps one (possibly empty) list per entry of new_data, in order.
f_data = []
for query in new_data:
    f_data.append(get_closest_words(query, sorted_frequency))

In [9]:
# Build one document string per query, INCLUDING queries that ended up with
# no matched words (they become empty strings). Dropping them here would
# make `ready` - and the cluster labels derived from it - shorter than `y`
# and `data`, which later cells compare and index position by position.
ready = [' '.join(words) for words in f_data]

In [10]:
# Turn the normalised queries into a TF-IDF matrix. The vocabulary size is
# capped at the number of distinct words seen during preprocessing.
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    use_idf=True,
    min_df=2,
    max_df=0.03,
    max_features=len(unique_words),
)
X = vectorizer.fit_transform(ready)


# X.shape is (number of documents, number of retained terms).
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 3199, n_features: 426


In [11]:
# Cluster the TF-IDF rows with DBSCAN. fit() returns the estimator itself,
# so `db` ends up bound to the fitted model; assigning the result also keeps
# the cell from echoing the estimator's repr.
db = DBSCAN(algorithm='auto', metric='euclidean', min_samples=2, eps=0.75)
db = db.fit(X)

In [12]:
labels = db.labels_

# Boolean mask with True at the positions DBSCAN reported as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# The label -1 is excluded from the cluster count.
n_clusters_ = len(set(labels))
if -1 in labels:
    n_clusters_ -= 1

print('Estimated number of clusters: %d' % n_clusters_)
print('Real number of clusters: %d' % len(set(y)))

# Score the predicted labels against the reference labels in `y`.
for template, scorer in (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
):
    print(template % scorer(y, labels))


Estimated number of clusters: 315
Real number of clusters: 180
Homogeneity: 0.908
Completeness: 0.863
V-measure: 0.885


In [13]:
# Number of queries that received the label -1.
count = sum(1 for label in labels if label == -1)

# Single-argument print(...) calls behave the same on Python 2 and Python 3
# and match the print style of the other cells. The two spaces after the
# colon reproduce the output of the former `print 'noise: ', count`.
print('noise:  %d' % count)
print('')

# Show the cleaned text of every query assigned to one example cluster.
cluster_to_show = 5
for i, label in enumerate(labels):
    if label == cluster_to_show:
        print(data[i])

noise:  95

loreal telescopic тушь
лореаль тушь телескопик
тушь телескопик цена
тушь телескопик купить
купить тушь лореаль телескопик
тушь лореаль телескопик купить
купить тушь лореаль телескопик украина
тушь лореаль телескопик цена
тушь телескопик
лореаль телескопик тушь
тушь loreal telescopic
telescopic тушь
телескопик тушь
тушь телескопик от лореаль
тушь для ресниц лореаль телескопик
тушь лореаль телескопик
тушь telescopic
телескопик тушь цена
тушь
тушь    цена
тушь    отзывы
