In [1]:
!pip install nltk

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import re
import numpy as np
from collections import Counter
import nltk

### Preprocess

In [3]:
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def preprocess(text):
    """Normalize raw text ahead of tokenization.

    The following steps are applied in order:
      1. lowercase the text;
      2. replace every ASCII digit with a space;
      3. delete every character that is neither a word character nor
         whitespace (punctuation, including the dots of abbreviations);
      4. collapse each run of whitespace into a single space.

    Leading/trailing whitespace is collapsed to one space but not stripped.
    """
    substitutions = (
        (r'[0-9]', ' '),    # digits -> space
        (r'[^\w\s]', ''),   # drop punctuation
        (r'[\s]+', ' '),    # squeeze whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def lemmatization(tokens):
    """Return a list with the WordNet lemma of every token, in input order.

    A new ``WordNetLemmatizer`` is created on each call and its ``lemmatize``
    method is invoked with the token as the only argument.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, tokens))

def full_preprocess(text):
    """Run the complete text pipeline on ``text``.

    The text is normalized with ``preprocess``, split with ``tokenize``,
    lemmatized with ``lemmatization`` and finally re-joined into a single
    space-separated string, which is returned.
    """
    lemmas = lemmatization(tokenize(preprocess(text)))
    return ' '.join(lemmas)

[nltk_data] Downloading package wordnet to /apt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /apt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### File parsing

In [6]:
def file_parser(path):
    """Read a numbered list of facts from the text file at ``path``.

    Each useful line has the form ``"<digits>. <text>"``. The leading number,
    the dot and any whitespace right after it are dropped; the remaining text
    is collected in file order.

    Lines that do not start with ``<digits>.`` (for example the empty string
    produced by a trailing newline, or a blank separator line) are skipped
    rather than raising ``AttributeError`` on the failed match.

    Args:
        path: path of the file to read.

    Returns:
        list[str]: the fact texts, one entry per matching line.
    """
    numbered_line = re.compile(r'\d+\.\s*(?P<text>.*)')
    texts = []
    with open(path, 'r') as f:
        for line in f.read().split('\n'):
            found = numbered_line.match(line)
            if found is None:
                # Not a "<number>. text" line: nothing to extract.
                continue
            texts.append(found.group('text'))
    return texts
# Raw fact strings (kept for display) and their preprocessed counterparts
# (used to fit the vectorizer).
texts = file_parser('data/facts.txt')
corpus = list(map(full_preprocess, texts))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
def make_DTM(texts):
    """Build a dense TF-IDF document-term matrix.

    Args:
        texts: iterable of (already preprocessed) document strings.

    Returns:
        tuple: ``(matrix, vectorizer)`` where ``matrix`` is a dense 2-D
        ``numpy.ndarray`` with one row per document and ``vectorizer`` is the
        fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer()
    res = vectorizer.fit_transform(texts)
    # ``toarray()`` yields a plain ndarray. ``todense()`` returns ``np.matrix``,
    # which recent scikit-learn estimators (e.g. PCA) refuse as input.
    return res.toarray(), vectorizer
# DTM: TF-IDF matrix of the corpus; vect: the fitted vectorizer, reused for queries.
DTM, vect = make_DTM(corpus)

In [8]:
from sklearn.decomposition import PCA
# Fit a 105-component PCA on the TF-IDF matrix and keep the projected documents.
pca = PCA(n_components=105)
latent = pca.fit_transform(DTM)
# Not the last expression of the cell, so this sum is computed but never displayed.
pca.explained_variance_ratio_.sum()
# Reconstruction of the documents in the original feature space;
# no later cell in this notebook reads `proj`.
proj = pca.inverse_transform(latent)


In [9]:
# Sanity check: (number of documents, number of PCA components).
latent.shape

(159, 105)

In [10]:
def query_process(query, vectorizer, pca):
    """Map a free-text query into the PCA-compressed TF-IDF space.

    The query is run through ``full_preprocess``, its in-vocabulary terms are
    counted, the counts are weighted by the vectorizer's IDF values and the
    resulting vector is projected with ``pca.transform``. Terms that are not
    in the vectorizer's vocabulary are ignored.

    Args:
        query: raw query string.
        vectorizer: fitted ``TfidfVectorizer``; its ``vocabulary_`` and
            ``idf_`` attributes are read.
        pca: fitted PCA object used for the projection.

    Returns:
        The one-row array returned by ``pca.transform`` for the query vector.
    """
    # term -> column index; a dict lookup replaces the repeated
    # ``get_feature_names().index(term)`` list scans (and that method no
    # longer exists in recent scikit-learn releases).
    vocabulary = vectorizer.vocabulary_
    vector = np.zeros(len(vocabulary))
    for term, count in Counter(full_preprocess(query).split()).items():
        column = vocabulary.get(term)
        if column is not None:
            vector[column] += count
    # NOTE(review): TfidfVectorizer L2-normalizes document rows by default,
    # while this query vector is left unnormalized to keep existing scores
    # unchanged -- confirm whether ``vectorizer.transform`` should be used.
    vector = vector * vectorizer.idf_
    return pca.transform(vector.reshape(1, -1))

In [11]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
def search(latent, compressed_q):
    """Rank every document by cosine similarity to the query.

    Args:
        latent: 2-D array with one row per document.
        compressed_q: one-row 2-D array holding the query in the same space.

    Returns:
        pandas.DataFrame ordered by decreasing similarity, with columns
        'Doc number' (row index into ``latent``, stored as a string),
        'Content' (the matching entry of the module-level ``texts`` list) and
        'Similarity' (the cosine similarity value).
    """
    values = cosine_similarity(latent, compressed_q).ravel()
    order = values.argsort()[::-1]
    # Build all columns in one go; filling the frame cell by cell through
    # ``.loc`` re-allocates on every new row and gets slow as the corpus grows.
    return pd.DataFrame({
        'Doc number': [str(int(doc)) for doc in order],
        'Content': [texts[doc] for doc in order],
        'Similarity': values[order],
    })

In [43]:
# Demo: single-word query; show the five best-matching facts.
query = 'pigs'
compressed_q = query_process(query, vect, pca)
ranking = search(latent, compressed_q)
ranking.head()

Unnamed: 0,Doc number,Content,Similarity
0,39,It is physically impossible for pigs to look u...,0.823374
1,7,In 1386 a pig in France was executed by public...,0.679172
2,148,It is impossible to sneeze with your eyes open.,0.151448
3,40,Guinness Book of Records holds the record for ...,0.078429
4,3,"The largest recorded snowflake was in Keogh, M...",0.072806


In [13]:
# Demo: a frequent word; show the five best-matching facts.
query = 'people'
compressed_q = query_process(query, vect, pca)
ranking = search(latent, compressed_q)
ranking.head()

Unnamed: 0,Doc number,Content,Similarity
0,112,Cows kill more people than sharks do.,0.535209
1,37,95% of people text things they could never say...,0.468248
2,101,Blue-eyed people tend to have the highest tole...,0.468049
3,104,Every year more than 2500 left-handed people a...,0.436959
4,45,About 150 people per year are killed by coconuts.,0.434098


In [14]:
# Demo: two-word query; show the five best-matching facts.
query = 'English words'
compressed_q = query_process(query, vect, pca)
ranking = search(latent, compressed_q)
ranking.head()

Unnamed: 0,Doc number,Content,Similarity
0,97,"Of all the words in the English language, the ...",0.839905
1,58,The word “gorilla” is derived from a Greek wor...,0.767241
2,29,Bob Marley’s last words to his son before he d...,0.358859
3,122,Dogs are capable of understanding up to 250 wo...,0.333466
4,143,What is called a “French kiss” in the English-...,0.313644


In [15]:
# Print the full text of document 58 from the latest ranking
# ('Doc number' holds strings, hence the comparison with '58').
print(ranking[ranking['Doc number'] == '58']['Content'].item())

The word “gorilla” is derived from a Greek word meaning, “A tribe of hairy women.”


  """Entry point for launching an IPython kernel.


In [16]:
! pip install yandex-translater
import locale
from yandex.Translater import Translater
locale.setlocale(locale.LC_ALL, '')

# TODO: put your own Yandex.Translate API key in this file.
# The ``with`` block closes the file handle, and ``strip()`` removes the
# trailing newline most editors append, which would otherwise be sent as
# part of the key.
with open("yandex.translate.key") as key_file:
    api_key = key_file.read().strip()
tr = Translater()

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [50]:
def translate(tr, x):
    """Translate ``x`` with the already configured translator ``tr``.

    ``x`` is set as the translator's text and the value produced by its
    ``translate()`` method is returned.
    """
    tr.set_text(x)
    translated_text = tr.translate()
    return translated_text

def not_english_query_processing(q, langs, vect, pca, latent, top_n=6):
    """Search with a non-English query and answer in the query's language.

    The query language is detected (using ``langs`` as hints), the query is
    translated to English, run through ``query_process`` and ``search``, and
    the 'Content' of the best ``top_n`` hits is translated back into the
    detected language. The detected language code and the English query are
    printed along the way.

    Args:
        q: query text in any language supported by the translator.
        langs: iterable of language codes passed as detection hints.
        vect: fitted vectorizer forwarded to ``query_process``.
        pca: fitted PCA forwarded to ``query_process``.
        latent: document matrix forwarded to ``search``.
        top_n: number of top-ranked rows to translate and return (default 6).

    Returns:
        pandas.DataFrame with the first ``top_n`` ranking rows, whose
        'Content' column holds the translated texts.
    """
    # Forward direction: detected language -> English.
    tr = Translater()
    tr.set_key(api_key)
    tr.set_to_lang('en')
    tr.set_hint(*langs)
    tr.set_text(q)
    lang = tr.detect_lang()
    print(lang)
    result_query = tr.translate()
    print(result_query)

    compressed_q = query_process(result_query, vect, pca)
    ranking = search(latent, compressed_q)

    # Backward direction: English -> detected language.
    tr_inv = Translater()
    tr_inv.set_key(api_key)
    tr_inv.set_to_lang(lang)
    tr_inv.set_from_lang('en')

    # Work on an explicit copy: assigning a column on the bare slice raises
    # pandas' SettingWithCopyWarning and may or may not touch ``ranking``.
    translated = ranking[:top_n].copy()
    translated['Content'] = translated['Content'].apply(lambda x: translate(tr_inv, x))
    return translated

In [51]:
# Demo: Russian query; the language list is passed as detection hints.
result = not_english_query_processing('свиньи', ['ru', 'es', 'tt', 'en'], vect, pca, latent)

ru
pigs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [52]:
# Display the translated top hits.
result

Unnamed: 0,Doc number,Content,Similarity
0,39,"Это физически невозможно для свиней, чтобы смо...",0.823374
1,7,В 1386 г. свинья во Франции был казнен путем п...,0.679172
2,148,Невозможно чихнуть с открытыми глазами.,0.151448
3,40,Книге рекордов Гиннесса принадлежит рекорд за ...,0.078429
4,3,Самая большая зарегистрированная снежинка была...,0.072806
5,143,"То, что называется “французским поцелуем” в ан...",0.071106
