# Информационный поиск

Заменяем текст запроса по заданию

In [None]:
# Free-text search queries; each one is vectorized and compared against the
# query texts loaded from just.qry further down.
QUERIES = ['electronic computer', 'surface heat']

Скачиваем данные

In [None]:
! wget -q http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz
! tar -xvf cran.tar.gz
! rm cran.tar.gz*
! grep -v "^\." cran.qry > just.qry
! head -3 just.qry
! pip install -q scikit-learn==0.22.2.post1

cran.all.1400
cran.qry
cranqrel
cranqrel.readme
what similarity laws must be obeyed when constructing aeroelastic models
of heated high speed aircraft .
what are the structural and aeroelastic problems associated with flight
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for scikit-learn (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.10.1 requires scikit-learn>=1.0.2, but you have scikit-learn 0.22.2.post1 which is incompatible.
sklearn-pandas 2.2.0 requires scikit-learn>=0.23.0, but you have scikit-learn 0.22.2.post1 which is incompatible.
yellowbrick 1.5 requires scikit-learn>=1.0.0, but you have scikit-learn 0.22.2.post1 which is incompatible.[0m[31m
[0m

Запускаем код

In [None]:
from  sklearn.feature_extraction.text import CountVectorizer
from  sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy.spatial.distance as ds 
import warnings

In [None]:
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and
# divide-by-zero warnings raised by the code below — consider narrowing it.
warnings.filterwarnings("ignore")

def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
  """Return the cosine distance ``1 - cos(a, b)`` between two 1-D vectors.

  Args:
    vector_a: first vector.
    vector_b: second vector, same length as ``vector_a``.

  Returns:
    The cosine distance computed by ``scipy.spatial.distance.cosine``.
    If either vector is all zeros the angle is undefined; 1.0 (no
    similarity) is returned in that case.
  """
  # Without this guard a zero vector yields 0/0 inside scipy, which comes
  # back as NaN or as a clamped 0.0 depending on the scipy version. A 0.0
  # would rank an empty document as a perfect match.
  if not np.any(vector_a) or not np.any(vector_b):
    return 1.0
  return ds.cosine(vector_a, vector_b)

# Sanity check; compared with a tolerance because the result is a float.
assert np.isclose(cosine_distance(np.array([1, 0, 1, 1, 1]), np.array([0, 0, 1, 0, 0])), 0.5)

def jaccard_sim(vector_a: np.array, vector_b: np.array) -> float:
  """Return the Jaccard similarity of two vectors viewed as sets.

  Each vector is converted to booleans (non-zero means "present"); the
  result is ``|A and B| / |A or B|``.

  Args:
    vector_a: first vector.
    vector_b: second vector, same length as ``vector_a``.

  Returns:
    The similarity in ``[0, 1]``. When neither vector has any non-zero
    entry the ratio would be 0/0; 0.0 is returned instead of NaN.
  """
  # The builtin ``bool`` replaces the ``np.bool`` alias, which was removed
  # in NumPy 1.24.
  present_a = np.asarray(vector_a, dtype=bool)
  present_b = np.asarray(vector_b, dtype=bool)
  union_size = np.bitwise_or(present_a, present_b).sum()
  if union_size == 0:
    return 0.0
  shared_size = np.bitwise_and(present_a, present_b).sum()
  return np.double(shared_size) / np.double(union_size)

# Sanity check; compared with a tolerance because the result is a float.
assert np.isclose(jaccard_sim(np.array([1, 0, 1, 0, 1]), np.array([0, 1, 1, 1, 1])), 0.4)

In [None]:
# Read the query file; the context manager closes the handle right away.
with open("just.qry", "r") as query_file:
  raw_query_data = [line.strip() for line in query_file]

# A query can span several lines; a line ending with "." closes the current
# query and opens a new, empty one.
query_data = [""]

for query_part in raw_query_data:
  query_data[-1] += query_part + " "
  if query_part.endswith("."):
    query_data.append("")

# The last closing "." leaves an empty placeholder at the end. Drop it so it
# is not indexed and ranked as a document.
if not query_data[-1].strip():
  query_data.pop()

# First task: rank the documents by Jaccard similarity over binary
# term-presence vectors.
encoder = CountVectorizer(binary=True)
encoded_data = encoder.fit_transform(query_data)
encoded_queries = encoder.transform(QUERIES)

# Reverse vocabulary lookup and the terms of the first document. These are
# not used by the ranking below.
id2term = {idx: term for term, idx in encoder.vocabulary_.items()}
non_zero_values_ids = encoded_data[0].nonzero()[1]

terms = [id2term[idx] for idx in non_zero_values_ids]

# Densify the document rows once; they do not depend on the query.
docs = [doc.todense().A1 for doc in encoded_data]

print("По мера Жаккара:")
for q_id, query in enumerate(encoded_queries):
  # Flatten the single sparse row into a plain 1-D array.
  query = query.todense().A1
  id2doc2similarity = [(doc_id, doc, jaccard_sim(query, doc)) for doc_id, doc in enumerate(docs)]
  # Highest similarity first.
  closest = sorted(id2doc2similarity, key=lambda x: x[2], reverse=True)

  print("Q: %s:" %(QUERIES[q_id]))
  print("    %s\t%s" %("ID", "Коэффициент"))
  for closest_id, _, sim in closest[:2]:
    print("    %d\t%.2f" %(closest_id, sim))

print()

# Second task: rank the documents by cosine distance between TF-IDF vectors.
tfidf_encoder = TfidfVectorizer()
tfidf_encoded_data = tfidf_encoder.fit_transform(query_data)
tfidf_encoded_queries = tfidf_encoder.transform(QUERIES)

# Densify the document rows once; they do not depend on the query.
docs = [doc.todense().A1 for doc in tfidf_encoded_data]

print("Косинустное расстояние:")
for q_id, query in enumerate(tfidf_encoded_queries):

  # Flatten the single sparse row into a plain 1-D array.
  query = query.todense().A1
  # Cosine distance to every document. Rows that are entirely zero (for
  # example an empty text) are skipped: the cosine is undefined for them, so
  # they must not be reported as a match.
  id2doc2similarity = [(doc_id, doc, cosine_distance(query, doc))
                       for doc_id, doc in enumerate(docs) if doc.any()]
  # Smallest distance first.
  closest = sorted(id2doc2similarity, key=lambda x: x[2], reverse=False)

  print("Q: %s\nFOUND:" % QUERIES[q_id])

  for closest_id, _, sim in closest[:3]:
    print("    %d\t%.2f\t%s" %(closest_id, sim, query_data[closest_id]))

По мера Жаккара:
Q: electronic computer:
    ID	Коэффициент
    15	0.12
    128	0.08
Q: surface heat:
    ID	Коэффициент
    45	0.14
    8	0.11

Косинустное расстояние:
Q: electronic computer
FOUND:
    226	0.00	
    15	0.53	can the transverse potential flow about a body of revolution be calculated efficiently by an electronic computer . 
    128	0.76	has anyone programmed a pump design method for a high-speed digital computer . 
Q: surface heat
FOUND:
    226	0.00	
    45	0.56	what is the combined effect of surface heat and mass transfer on hypersonic flow . 
    44	0.76	has anyone investigated the effect of surface mass transfer on hypersonic viscous interactions . 
