In [1]:
import json
import os
import urllib
from pprint import pprint

import pandas as pd
import unidecode

### Чтение csv

In [2]:
# Build the paths with os.path.join so the cell also runs on non-Windows hosts
# (hard-coded "\\" separators only resolve on Windows).
KR_DATA_DIR = os.path.join("KR", "KR")

# Query table; `names=` supplies the column labels ID / query.
df_q = pd.read_csv(os.path.join(KR_DATA_DIR, "qid.csv"), names=["ID", "query"])
# URL table; `names=` supplies the column labels ID / URL.
df_u = pd.read_csv(os.path.join(KR_DATA_DIR, "urlid.csv"), names=["ID", "URL"], quotechar='"')

In [3]:
# Host prepended to every URL value stored in df_u.
WIKI_HOST = "https://ru.wikipedia.org"

# Turn the stored URL values into absolute URLs. The startswith guard makes the
# cell idempotent: re-running it no longer prepends the host a second time.
df_u = df_u.assign(
    URL=df_u["URL"].map(
        lambda path: path if path.startswith(WIKI_HOST) else WIKI_HOST + path
    )
)


### Чтение обхода из файла 

In [4]:
# Load the crawl result. os.path.join keeps the path valid on non-Windows
# hosts (the hard-coded "\\" separator only resolves on Windows).
with open(os.path.join("ru_wikipedia", "full_out.json")) as f:
    documents_json = json.load(f)

In [5]:
# Sanity check: look up the ID(s) stored for the URL found at label 1413,
# then show that URL itself.
probe_url = df_u["URL"][1413]
ttt = df_u.loc[df_u["URL"] == probe_url, "ID"]
print(ttt.values)
print(probe_url)


[1413]
https://ru.wikipedia.org/wiki/%D0%A0%D1%83%D1%81%D1%8C_(%D0%B3%D0%BE%D1%81%D1%83%D0%B4%D0%B0%D1%80%D1%81%D1%82%D0%B2%D0%BE)


### Утилиты 

In [6]:
class NormalizedDocument(object):
    """A document record together with its normalised token list.

    Attributes:
        id: identifier of the document.
        url: URL of the document.
        text: display text (snippet) of the document.
        text_normed: list of normalised tokens derived from the text.
        list_order: value supplied by the creator as the document's order.
    """

    def __init__(self, doc_id, url, text, text_normed, list_order):
        """Store the given values unchanged on the instance."""
        self.id, self.url = doc_id, url
        self.text, self.text_normed = text, text_normed
        self.list_order = list_order

    def __unicode__(self):
        """Return a one-line unicode description with id, url and snippet."""
        template = u"doc: id={}, url={}, snippet={}"
        shown_fields = (self.id, self.url, self.text)
        return template.format(*shown_fields)

    def __repr__(self):
        """Return the unicode description encoded as UTF-8."""
        description = self.__unicode__()
        return description.encode('utf-8')

    def document_normed_length(self):
        """Return the number of normalised tokens held by the document."""
        normed_tokens = self.text_normed
        return len(normed_tokens)

In [7]:
class NormalizedQuery(object):
    """A search query together with its normalised token list.

    Attributes:
        id: identifier of the query.
        query: query text as supplied by the creator.
        query_normed: list of normalised tokens derived from the query.
    """

    def __init__(self, query_id, query, query_normed):
        """Store the given values unchanged on the instance."""
        self.id, self.query = query_id, query
        self.query_normed = query_normed

    def __unicode__(self):
        """Return a one-line unicode description with id and text."""
        template = u"query: id={}, text={}"
        shown_fields = (self.id, self.query)
        return template.format(*shown_fields)

    def __repr__(self):
        """Return the unicode description encoded as UTF-8."""
        description = self.__unicode__()
        return description.encode('utf-8')

In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords
import string

lemmatizer, stemmer = WordNetLemmatizer(), RussianStemmer()

# Tokens dropped before stemming: the Russian stop-word list, every character
# of string.punctuation, and the word "википедия".
# NOTE: from here on the name `stopwords` refers to this list, no longer to the
# imported nltk corpus object.
stopwords = list(stopwords.words('russian'))
stopwords.extend(string.punctuation)
stopwords.append(u"википедия")

def normalize(text):
    """Tokenise ``text``, drop stop tokens and stem what is left.

    The text is split into sentences and then into word tokens with nltk;
    tokens found in the module-level ``stopwords`` list are skipped and every
    remaining token is passed through the module-level ``stemmer``.

    Args:
        text: the string to normalise.

    Returns:
        List of stemmed tokens in their original order.
    """
    stemmed_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            if token in stopwords:
                continue
            stemmed_tokens.append(stemmer.stem(token))
    return stemmed_tokens
    

### Нормализация документов и запросов

In [9]:
documents = []

# Resolve URL -> ID once up front instead of scanning the whole df_u frame for
# every crawled document. setdefault keeps the first row of a duplicated URL,
# which is the row a `.values[0]` lookup on the filtered frame selects.
url_to_id = {}
for known_url, known_id in zip(df_u["URL"], df_u["ID"]):
    url_to_id.setdefault(known_url, known_id)

for i, d in enumerate(documents_json):
    # The lower-cased title is the only text used for a document.
    snippet = d["title"].lower()
    # Raises KeyError when a crawled URL is absent from df_u.
    doc_id = url_to_id[d["url"]]
    if not doc_id % 500:
        # Progress marker for every ID divisible by 500.
        print(doc_id)

    doc = NormalizedDocument(doc_id, d["url"], unidecode.unidecode(snippet),
                             [unidecode.unidecode(x) for x in normalize(snippet)], i)
    documents.append(doc)
    

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500


In [10]:
queries = []

QUERIES_COUNT = len(df_q)

# Index the raw query strings by ID once instead of filtering df_q on every
# iteration. setdefault keeps the first row of a duplicated ID, which is the
# row a `.values[0]` lookup on the filtered frame selects.
query_text_by_id = {}
for known_id, raw_query in zip(df_q["ID"], df_q["query"]):
    query_text_by_id.setdefault(known_id, raw_query)

for i in range(QUERIES_COUNT):
    # IDs are looked up as 0..QUERIES_COUNT-1; a missing ID raises KeyError.
    query_contents = unicode(query_text_by_id[i], "utf-8")

    query = NormalizedQuery(i, unidecode.unidecode(query_contents),
                            [unidecode.unidecode(x) for x in normalize(query_contents)])
    queries.append(query)
    

### Инвертированный индекс

In [11]:
class InvIndex(object):
    """Inverted index over tokenised documents.

    ``inv_index_and_frq`` maps every word to a two-element list
    ``[{doc_order: occurrences_in_that_doc, ...}, document_frequency]`` where
    ``doc_order`` is the position of the document's token list inside
    ``words_lists`` and ``document_frequency`` is the number of distinct
    documents that contain the word.
    """

    def __init__(self, words_lists, total_docs):
        """Build the index.

        Args:
            words_lists: iterable of token lists, one list per document.
            total_docs: document count; stored unchanged on ``total_docs``.
        """
        self.inv_index_and_frq = dict()  # {word: [{doc_order: doc_frq, ...}, document_frequency], ...}
        self.total_docs = total_docs

        self._build_inv_index(words_lists)

    def get_words(self):
        """Return the keys of the index (all indexed words)."""
        return self.inv_index_and_frq.keys()

    def get_word_stats(self, w):
        """Return the ``[postings, document_frequency]`` entry of ``w`` or None."""
        return self.inv_index_and_frq.get(w)

    def get_word_documents(self, w):
        """Return the doc orders containing ``w`` (empty list if unknown)."""
        if w not in self.inv_index_and_frq:
            return []
        return self.inv_index_and_frq[w][0].keys()

    def get_word_frequency(self, w):
        """Return the number of documents containing ``w`` (0 if unknown)."""
        if w not in self.inv_index_and_frq:
            return 0
        return self.inv_index_and_frq[w][1]

    def get_word_frequency_in_doc(self, w, doc_order):
        """Return how often ``w`` occurs in document ``doc_order`` (0 if never)."""
        entry = self.inv_index_and_frq.get(w)
        if entry is None:
            return 0
        return entry[0].get(doc_order, 0)

    def _build_inv_index(self, words_lists):
        """Populate ``inv_index_and_frq`` from the per-document token lists."""
        index = self.inv_index_and_frq
        for w_list_ind, w_list in enumerate(words_lists):
            for w in w_list:
                entry = index.get(w)
                if entry is None:
                    entry = index[w] = [dict(), 0]

                postings = entry[0]
                # Membership is tested on the dict itself: going through
                # get_word_documents() would build a key list on Python 2 and
                # make every token a linear scan.
                if w_list_ind not in postings:
                    postings[w_list_ind] = 0
                    entry[1] += 1  # first time this document contains the word
                postings[w_list_ind] += 1
                

In [12]:
# Index the normalised token lists of all documents; the position of a
# document inside `documents` becomes its doc order in the index.
inv_index = InvIndex(
    words_lists=[d.text_normed for d in documents],
    total_docs=len(documents),
)

# Report the vocabulary size of the freshly built index.
print("inv_index :\n amount of keys: {}".format(len(inv_index.get_words())))

        

inv_index :
 amount of keys: 12491


### BM25

In [13]:
import math
import time

class RsvParams(object):
    """Plain container for the scoring parameters.

    Attributes:
        B: value passed as ``b``.
        K1: value passed as ``k1``.
        K2: value passed as ``k2``.
        avg_doc_len: value passed as ``avg_doc_len``.
    """

    def __init__(self, b, k1, k2, avg_doc_len):
        """Store the four values unchanged."""
        self.B, self.K1, self.K2 = b, k1, k2
        self.avg_doc_len = avg_doc_len
        
        
class Searcher(object):
    """Scores documents against queries with a BM25-style RSV.

    Attributes:
        rsv_params: object exposing ``B``, ``K1`` and ``avg_doc_len``.
        inv_index: inverted index exposing ``inv_index_and_frq`` and
            ``get_word_frequency_in_doc``.
        search_result: list of ``(query_id, [(doc_id, rsv), ...])`` filled by
            ``calculate_rsv_for_docs_and_queries``.
        idf_counter: callable ``(token, total_docs, inv_index) -> idf``.
        total_docs: document count supplied by the creator (stored only).
    """

    def __init__(self, rsv_params, inv_index, idf_counter, total_docs):
        """Store the collaborators and start with an empty result list."""
        self.rsv_params = rsv_params
        self.inv_index = inv_index
        self.search_result = []
        self.idf_counter = idf_counter
        self.total_docs = total_docs

    def _match_query(self, query, inv_index):
        """Return the index entries of the query tokens present in the index.

        Looks the query tokens up directly instead of walking the whole
        vocabulary for every query.
        """
        index_table = inv_index.inv_index_and_frq
        return {token: index_table[token]
                for token in query.query_normed if token in index_table}

    def _retrieve_docs_from_matched_index(self, matched_index, docs_collection):
        """Return the documents referenced by any entry of ``matched_index``.

        Doc orders stored in the postings are used as positions in
        ``docs_collection``; each document is returned once, in set order.
        """
        doc_orders = set()
        for w in matched_index:
            doc_orders.update(matched_index[w][0].keys())

        return [docs_collection[i] for i in doc_orders]

    def _find_docs_via_query(self, documents_param, query):
        """Return the documents that contain at least one query token."""
        matched_index = self._match_query(query, self.inv_index)
        return self._retrieve_docs_from_matched_index(matched_index, documents_param)

    def calculate_rsv_for_docs_and_queries(self, docs, queries_param, norm_prop_name, is_normed = False):
        """Score every query and store the outcome in ``search_result``.

        The existing ``search_result`` list is emptied in place first, then one
        ``(query.id, [(doc.id, rsv), ...])`` tuple is appended per query.

        Args:
            docs: document collection, indexable by doc order.
            queries_param: iterable of queries with ``id`` and ``query_normed``.
            norm_prop_name: name of the document attribute holding its tokens.
            is_normed: when true, divide each RSV by the sum of query IDFs.
        """
        del self.search_result[:]

        for q in queries_param:
            docs_rsv_list = self.calc_rsv_BM25(docs, q, norm_prop_name, is_normed)
            self.search_result.append((q.id, docs_rsv_list))

    def calc_rsv_BM25(self, docs, q, norm_prop_name, is_normed = False):
        """Return ``[(doc.id, rsv), ...]`` for the documents matching ``q``."""
        total_docs = len(docs)
        docs_rsv_list = []
        for d in self._find_docs_via_query(docs, q):
            rsv = self._rsv(q.query_normed, d.list_order, getattr(d, norm_prop_name),
                            total_docs, self.inv_index, is_normed)
            docs_rsv_list.append((d.id, rsv))

        return docs_rsv_list

    def _rsv(self, query_tokens, doc_order, doc_tokens, total_docs, inv_index, is_normed=False):
        """Return the sum over query tokens of ``idf * tf`` for one document.

        With ``is_normed`` the sum is divided by the sum of the tokens' IDF
        values; if that sum is zero, 0.0 is returned instead of dividing.
        """
        doc_len = len(doc_tokens)
        rsv_value = 0
        idf_sum = 0
        for token in query_tokens:
            # Each IDF is computed once and reused for the normalisation sum.
            idf_value = self.idf_counter(token, total_docs, inv_index)
            tf_value = self._tf(token, doc_order, doc_len, inv_index)

            rsv_value = rsv_value + idf_value * tf_value
            idf_sum = idf_sum + idf_value

        if is_normed:
            if idf_sum == 0:
                # Nothing to normalise by; avoid ZeroDivisionError.
                return 0.0
            rsv_value = float(rsv_value) / idf_sum

        return rsv_value

    def _tf(self, token, doc_order, doc_len, inv_index):
        """Return the saturated, length-normalised term-frequency factor.

        Computes ``ftd * (K1 + 1) / (ftd + K1 * (1 - B + B * doc_len / avg_doc_len))``
        where ``ftd`` is the token's count in the document.
        """
        ftd = inv_index.get_word_frequency_in_doc(token, doc_order)
        params = self.rsv_params
        length_norm = 1 - params.B + params.B * doc_len / params.avg_doc_len
        return float(ftd * (params.K1 + 1)) / (ftd + params.K1 * length_norm)
    

In [14]:
import numpy as np
import subprocess

# Scoring parameters passed to search_stats.
B, K1 = 0.75, 1.2
# Number of top-scored documents kept per query by select_relevant.
MAX_RELEVANT = 3
# Mean number of normalised tokens per document.
total_normed_tokens = sum([len(d.text_normed) for d in documents])
avg_doc_len = float(total_normed_tokens) / len(documents)

In [15]:
def idf_1(token, total_docs, inv_index):
    """Return ``log10(1 + (N - Nt + 0.5) / (Nt + 0.5))`` for ``token``.

    Args:
        token: the word to weigh.
        total_docs: collection size ``N``. The function now honours this
            argument instead of reading the global ``documents`` list.
        inv_index: index exposing ``get_word_frequency(token)``, used as ``Nt``.

    Returns:
        The IDF value as a float.
    """
    N = total_docs
    Nt = inv_index.get_word_frequency(token)
    return math.log10(1 + float(N - Nt + 0.5) / (Nt + 0.5))

def idf_2(token, total_docs, inv_index):
    """Return ``log10((N + 0.5) / (Nt + 0.5))`` for ``token``.

    Args:
        token: the word to weigh.
        total_docs: collection size ``N``. The function now honours this
            argument instead of reading the global ``documents`` list.
        inv_index: index exposing ``get_word_frequency(token)``, used as ``Nt``.

    Returns:
        The IDF value as a float.
    """
    N = total_docs
    Nt = inv_index.get_word_frequency(token)
    return math.log10(float(N + 0.5) / (Nt + 0.5))

In [16]:
def select_relevant(queries_docs_rvs, max_relevant=None):
    """Keep the best-scored documents of every query.

    Args:
        queries_docs_rvs: iterable of ``(query_id, [(doc_id, score), ...])``.
        max_relevant: how many documents to keep per query; when omitted the
            module-level ``MAX_RELEVANT`` is used.

    Returns:
        List of ``(query_id, [(doc_id, score), ...])`` with each inner list
        sorted by descending score (ties keep their incoming order) and cut to
        ``max_relevant`` items.
    """
    if max_relevant is None:
        max_relevant = MAX_RELEVANT

    relevant_docs = []
    for entry in queries_docs_rvs:
        ranked = sorted(entry[1], key=lambda item: item[1], reverse=True)
        relevant_docs.append((entry[0], ranked[:max_relevant]))

    return relevant_docs

        
    
def search_stats(b_p, k1_p, k2_p, inv_index_p, avg_doc_len, stats, idf_counter, norm_prop_name, is_normed=False):
    """Run every query in ``queries`` against ``documents`` and keep the best hits.

    Args:
        b_p, k1_p, k2_p, avg_doc_len: values forwarded to ``RsvParams``.
        inv_index_p: inverted index handed to the ``Searcher``.
        stats: accepted but not used by this function.
        idf_counter: IDF callable handed to the ``Searcher``.
        norm_prop_name: name of the document attribute holding its tokens.
        is_normed: forwarded to the searcher's scoring call.

    Returns:
        The result of ``select_relevant`` applied to the searcher's results.
    """
    searcher = Searcher(
        RsvParams(b_p, k1_p, k2_p, avg_doc_len),
        inv_index_p,
        idf_counter,
        len(documents),
    )
    searcher.calculate_rsv_for_docs_and_queries(documents, queries, norm_prop_name, is_normed)
    return select_relevant(searcher.search_result)


In [17]:
# Rank all queries using idf_1 and the documents' `text_normed` tokens.
relev_docs = search_stats(
    b_p=B,
    k1_p=K1,
    k2_p=0,
    inv_index_p=inv_index,
    avg_doc_len=avg_doc_len,
    stats=[],
    idf_counter=idf_1,
    norm_prop_name="text_normed",
)

### Проверка на train (+ запись в файлы)

In [18]:
# Number of leading entries of relev_docs that go into train_submission.csv;
# the remaining entries are written to test_submission.csv.
TRAIN_SIZE = 1000

In [19]:
def _write_submission(path, ranked_queries):
    """Write one ``query_id,doc_id,doc_id,doc_id`` line per query to ``path``.

    Queries whose result list does not hold exactly three documents are
    printed for inspection (using the module-level ``queries`` list), and the
    missing document slots are filled with 0.

    Args:
        path: output file name; the file is overwritten.
        ranked_queries: iterable of ``(query_id, [(doc_id, score), ...])``.
    """
    with open(path, "w") as subm_f:
        for rd in ranked_queries:
            if len(rd[1]) != 3:
                print("{} {} {}".format(rd[0], queries[rd[0]], rd[1]))

            doc_ids = [scored[0] for scored in rd[1][:3]]
            doc_ids += [0] * (3 - len(doc_ids))  # pad missing slots with 0
            subm_f.write("{},{},{},{}\n".format(rd[0], *doc_ids))


# The train and test files share one writer instead of two copy-pasted loops.
_write_submission("train_submission.csv", relev_docs[:TRAIN_SIZE])
_write_submission("test_submission.csv", relev_docs[TRAIN_SIZE:])
    

17 query: id=17, text=vko [(7383, 5.361703490639291)]
19 query: id=19, text=orkhideia vikipediia [(12412, 4.152411500006488), (16399, 4.152411500006488)]
26 query: id=26, text=bendi vikipediia [(9352, 3.719409937636851)]
40 query: id=40, text=torrent vikipediia [(9151, 5.361703490639291)]
49 query: id=49, text=indiuk vikipediia [(1896, 4.152411500006488), (1295, 4.152411500006488)]
54 query: id=54, text=iuar vikipediia [(11296, 4.152411500006488), (3714, 3.5164682174490305)]
59 query: id=59, text=gol'shtein [(7491, 5.361703490639291)]
62 query: id=62, text=kamaz vikipediia [(9711, 5.361703490639291)]
83 query: id=83, text=wikipedia []
124 query: id=124, text=boing 747 vikipediia [(5537, 4.152411500006488), (4435, 2.6919285943066744)]
130 query: id=130, text=klubnika vikipediia [(13290, 5.069153503471511), (12205, 5.069153503471511)]
158 query: id=158, text=aviakatastrofy vikipediia []
188 query: id=188, text=zaiats vikipediia [(7118, 4.152411500006488), (3782, 4.152411500006488)]
218 q

In [20]:
# Spot-check a single query by its position in the list.
print(queries[992])

query: id=992, text=bmd vikipediia


In [21]:
# Spot-check: show the document(s) with ID 2035 and the normalised tokens of
# the first match.
filtered = list(doc for doc in documents if doc.id in [2035])
print(filtered)
for tn in filtered[0].text_normed:
    print(tn)

[doc: id=2035, url=https://ru.wikipedia.org/wiki/Ikarus, snippet=ikarus]
ikarus


In [24]:
from ml_metrics import mapk

def getList(filename):
    """Read a comma-separated file and return every row without its first field.

    Only the line terminator is stripped, so a final line that lacks a
    trailing newline keeps its last character and CRLF endings leave no stray
    ``\\r`` on the last field.

    Args:
        filename: path of the file to read.

    Returns:
        List with one list of strings per line (fields 2..n of that line).
    """
    with open(filename) as rows_file:
        return [line.rstrip("\r\n").split(',')[1:] for line in rows_file]
    
# MAP@3 of the train submission against the reference answers. The reference
# path is built with os.path.join so it also resolves on non-Windows hosts.
mapk(getList(os.path.join('KR', 'KR', 'train_data.csv')), getList('train_submission.csv'), 3)

0.42761111111111111