In [2]:
import math
import operator
import urllib

import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer


In [3]:
# Shared text-processing helpers used by normalize_line().
stemmer = SnowballStemmer("russian")
tokenizer = RegexpTokenizer(r'\w+')
# Built once so the stopword list is not re-created for every token.
russian_stopwords = frozenset(stopwords.words('russian'))


def normalize_line(line):
    """Turn a raw title or query string into a list of stemmed tokens.

    Steps: decode UTF-8 bytes (unicode input is accepted as is), lowercase,
    replace underscores with spaces, tokenize on ``\\w+`` and stem every
    token that is not a Russian stopword.

    Special cases:
      * if every token is a stopword, the stopword filter is skipped and all
        tokens are stemmed, so the result is empty only for empty input;
      * if more than one stem remains and the stem u"википед" is among them,
        its first occurrence is removed.

    :param line: UTF-8 encoded byte string or unicode string.
    :return: list of stemmed tokens.
    """
    # Decode BEFORE lowercasing: lower() on a UTF-8 byte string only touches
    # ASCII letters, so capitalised Cyrillic words would stay capitalised and
    # slip past the (lowercase) stopword list.
    if isinstance(line, bytes):
        line = line.decode("utf-8")
    line = line.lower().replace('_', ' ')

    tokens = tokenizer.tokenize(line)
    words = [stemmer.stem(token) for token in tokens if token not in russian_stopwords]
    if not words:
        # Nothing but stopwords: keep them rather than return an empty query.
        return [stemmer.stem(token) for token in tokens]
    if len(words) > 1 and u"википед" in words:
        words.remove(u"википед")
    return words

In [4]:
# Collection state shared by the later cells:
#   documents: document id -> {'id': id, 'title': normalized title tokens}
#   N:         number of document rows read from the CSV
documents = {}
N = 0

# Each row is split at the first comma: the part before it is used as the
# document id, the part after it is URL-unquoted, stripped of '/wiki/' and
# normalized into title tokens.
with open('task3/urlid.csv', 'rb') as urlid_file:
    for row in urlid_file:
        # Strip only the line terminator; slicing off the last character
        # would eat real data when the final line has no trailing newline.
        row = row.rstrip('\r\n')
        if not row:
            continue

        doc_id, _sep, raw_title = row.partition(',')
        document = {}
        document['id'] = doc_id
        document['title'] = normalize_line(urllib.unquote(raw_title).replace('/wiki/', ''))

        documents[document['id']] = document
        N += 1


# Average title length in tokens. The division must happen once, after the
# whole sum has been accumulated; dividing inside the loop shrinks the
# running total on every iteration.
L_title = 0
for document in documents.values():
    L_title += len(document['title'])
if N:
    L_title /= float(N)

In [5]:
def create_inverted_index(docs=None, index_field='title'):
    """Build an inverted index: word -> {document id: occurrences of word}.

    :param docs: mapping of document id -> document dict; each document must
        hold a list of tokens under ``index_field``. Defaults to the
        module-level ``documents``.
    :param index_field: key of the token list to index (default 'title').
    :return: dict mapping every word to a dict of
        {document id: number of times the word occurs in that document}.
    """
    if docs is None:
        docs = documents

    invert_index = {}
    for document_index, document in docs.items():
        for word in document[index_field]:
            # One dict lookup per token instead of scanning .keys() lists
            # and re-counting the whole token list for each occurrence.
            postings = invert_index.setdefault(word, {})
            postings[document_index] = postings.get(document_index, 0) + 1
    return invert_index

In [6]:
# Build the inverted index once; prepare_answer() reads this module-level
# `index` when it calls search_in_index().
index = create_inverted_index()

In [7]:
def search_in_index(index, query_words, field_name, b, k1):
    """Score every document that shares at least one word with the query.

    :param index: inverted index mapping word -> {document id: count}.
    :param query_words: list of normalized query tokens.
    :param field_name: key of the token list inside each entry of the
        module-level ``documents`` that is passed to rsv().
    :param b: forwarded unchanged to rsv().
    :param k1: forwarded unchanged to rsv().
    :return: dict mapping document id -> value returned by rsv().
    """
    # Candidates: union of the posting lists of the query words the index knows.
    candidate_ids = set()
    for term in query_words:
        if term not in index:
            continue
        candidate_ids.update(index[term].keys())

    return dict(
        (doc_id, rsv(query_words, documents[doc_id][field_name], index, b, k1))
        for doc_id in candidate_ids
    )


def rsv(query_words, document_words, inverse_index, b, k1, n_docs=None, avg_len=None):
    """Compute a BM25-style retrieval status value of one document for a query.

    For every query word present in the index the score grows by

        ln(1 + (n_docs + 0.5) / (Nt + 0.5))
            * ftd * (k1 + 1) / (k1 * ((1 - b) + b * Ld / avg_len) + ftd)

    where Nt is the number of documents containing the word, ftd the number
    of times the word occurs in this document and Ld the document length.

    :param query_words: list of normalized query tokens (repeated tokens are
        counted once per repetition).
    :param document_words: list of normalized tokens of the document.
    :param inverse_index: mapping word -> {document id: count}.
    :param b: length-normalisation weight.
    :param k1: term-frequency saturation parameter.
    :param n_docs: collection size; defaults to the module-level ``N``.
    :param avg_len: average document length; defaults to the module-level
        ``L_title``.
    :return: the accumulated score (0 when no query word contributes).
    """
    if n_docs is None:
        n_docs = N
    if avg_len is None:
        avg_len = L_title

    # Loop-invariant length normalisation. A zero average length (empty
    # collection) is treated as "document has average length".
    doc_len = len(document_words)
    length_ratio = doc_len / float(avg_len) if avg_len else 1.0
    length_norm = k1 * ((1 - b) + b * length_ratio)

    score = 0
    for query_word in query_words:
        if query_word not in inverse_index:
            continue
        ftd = document_words.count(query_word)
        if not ftd:
            # Contributes nothing; skipping also avoids 0/0 when k1 == 0.
            continue
        Nt = len(inverse_index[query_word])
        # math.log1p(x) is already ln(1 + x); passing `1 + ratio` would
        # silently compute ln(2 + ratio).
        idf = math.log1p((n_docs + 0.5) / (Nt + 0.5))
        score += idf * ftd * (k1 + 1) / (length_norm + ftd)
    return score


In [8]:
def prepare_answer(b=0.75, k1=1.2, queries_path='task3/qid.csv', answer_path='answer11', top_n=3):
    """Rank documents for every query and write the best ones to a file.

    Each non-empty row of ``queries_path`` is split at the first comma into a
    query id and the query text. The text is normalized, scored against the
    module-level ``index`` over the 'title' field, and the ids of the
    ``top_n`` highest-scoring documents are written as
    ``<query id>,<doc id>,<doc id>,...``, one line per query.

    :param b: forwarded to search_in_index().
    :param k1: forwarded to search_in_index().
    :param queries_path: CSV file with the queries.
    :param answer_path: output file; overwritten if it exists.
    :param top_n: number of document ids written per query.
    """
    field_name = 'title'
    # Open the queries first so a missing input file does not truncate an
    # existing answer file.
    with open(queries_path) as queries_file:
        with open(answer_path, 'w') as result_file:
            for row in queries_file:
                # Strip only the line terminator: slicing off the last
                # character would cut a final query that has no newline.
                row = row.rstrip('\r\n')
                if not row:
                    continue

                query_id, _sep, query_text = row.partition(',')
                query_words = normalize_line(query_text)
                documents_with_rsv = search_in_index(index, query_words, field_name, b, k1)
                ranked = sorted(documents_with_rsv.items(), key=operator.itemgetter(1), reverse=True)
                top_ids = [doc_id for doc_id, _score in ranked[:top_n]]

                result_file.write('%s,%s\n' % (query_id, ','.join(top_ids)))
            

In [9]:
# Run the ranking with the default b and k1 and write the answer file.
prepare_answer()