In [1]:
import pandas as pd
import numpy as np
import io
from tqdm import tqdm
from itertools import islice
import langid
import re
from concurrent import futures
# import swifter
import asyncio
import nltk
# import vaex
from nltk import  pos_tag
from nltk import WordNetLemmatizer
# import modin.pandas as pdmd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import multiprocessing as mp


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Base directory used by every file read/write in this notebook (input CSV,
# word-vector file, pickled output).
# NOTE(review): absolute path that looks like a mounted Google Drive in Colab —
# adjust when running elsewhere.
path = "/content/drive/MyDrive/Projects/ml_tinkoff_2021/notebooks/lesson_9_info_search/hw/"

In [3]:
def is_en(text):
    """Return ``text`` unchanged if it is English, otherwise ``pd.NA``.

    Parameters
    ----------
    text : object
        Candidate document. Anything that is not a non-blank ``str`` (for
        example the NaN pandas produces for a missing CSV field) is treated as
        "not English" instead of being handed to ``langid``.

    Returns
    -------
    str or pd.NA
        The original string when ``langid`` classifies it as ``'en'``;
        ``pd.NA`` otherwise, so the row can later be removed with ``dropna``.
    """
    # langid only accepts text, and an empty body has nothing to classify (and
    # would break later steps that index the last character of the string).
    if not isinstance(text, str) or not text.strip():
        return pd.NA
    return text if langid.classify(text)[0] == 'en' else pd.NA

In [4]:
# Parse only the first 25,000 rows; the previous form read the whole CSV into
# memory and then threw away everything past row 25,000.
data = pd.read_csv(path + "github_issues_slice.csv", nrows=25_000)

In [5]:
# Run the language filter over every issue body in a thread pool; bodies that
# is_en() rejects are replaced by pd.NA.
# The new pd.Series gets a fresh 0..n-1 index and pandas aligns on index labels,
# so this assignment lines up only while `data` still has its default index.
with futures.ThreadPoolExecutor() as executor:
    data.body = pd.Series(executor.map(is_en, data.body))

In [6]:
# Number of issue bodies that are currently missing (i.e. were rejected by the language filter).
int(data.body.isnull().sum())

1642

In [7]:
# Drop every row that has a missing value in any column; this removes the rows
# whose body is missing as well as rows with any other empty field.
data = data.dropna()

In [8]:
# Display the frame after removing rows with missing values.
data

Unnamed: 0,issue_url,issue_title,body
0,"""https://github.com/atais/angular-eonasdan-dat...",manually entered dates issues,i use format 'yyyy-mm-dd' option and bind ng-m...
1,"""https://github.com/conveyal/analysis-ui/issue...",highlight segment on map when editing speed,when editing the speed of a segment i.e. when ...
2,"""https://github.com/Tat5ato/Phantasmic-Mind/is...",concept art for the otherworld,"in general, the otherworld is craggy and organ..."
3,"""https://github.com/qmlweb/qmlweb/issues/420""",mousearea and touch event in mobile browser,"hello, in the master branch, mousearea doesn't..."
4,"""https://github.com/pybel/pybel/issues/174""","function to drop graph store and edge store, b...",this function should go in the cache manager. ...
...,...,...,...
24995,"""https://github.com/systemd/systemd/issues/5516""",journald: definitely lost: 100 bytes in 2 blocks,==181== 50 bytes in 1 blocks are definitely lo...
24996,"""https://github.com/O3Labs/O3Android/issues/21""",tap to view asset detail,- account screen - portfolio screen shows neo ...
24997,"""https://github.com/cdnjs/cdnjs/issues/10208""",request change plottable.js to plottable,"hey cdnjs ppl! starting with plottable 2.6, ca..."
24998,"""https://github.com/ndhays/redux-cablecar/issu...",use within a webpack environment?,do you have a suggested way to define / provid...


In [9]:
# Restore a contiguous 0..n-1 index after dropna(). Later cells assign freshly
# built pd.Series objects (which carry a 0..n-1 index) to `data`, and pandas
# aligns such assignments by index label.
data = data.reset_index(drop=True)

In [10]:
# Download the NLTK data packages used by the preprocessing cells below.
# The return value of the last call is what the cell displays.
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
def sentence_segmentation(text: str) -> list:
    """Split ``text`` into lower-cased sentences.

    A sentence boundary is one of ``. ! ? ;`` followed by a whitespace
    character. A single trailing terminator at the very end of ``text`` is
    removed first so that the last sentence does not keep it.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased sentences; an empty list for empty input.
    """
    # Guard: indexing text[-1] on an empty string would raise IndexError.
    if not text:
        return []
    if re.match(r'[\.!\?;]', text[-1]):
        text = text[:-1]
    return re.split(r'[\.!\?;]\s', text.lower())

In [12]:
def tokenization(sentences: list) -> list:
    """Split every sentence into tokens.

    Tokens are separated by a whitespace character, together with any run of
    commas, colons, parentheses, hyphens or whitespace directly in front of it.

    Parameters
    ----------
    sentences : list of str
        Sentences to tokenize.

    Returns
    -------
    list of list of str
        One token list per input sentence, in the same order.
    """
    separator = re.compile(r'[,:(\s\-)]*\s')
    tokenized = []
    for sentence in sentences:
        tokenized.append(separator.split(sentence))
    return tokenized

In [13]:
from nltk.corpus import wordnet as wn

In [14]:
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Only the first character of ``treebank_tag`` is significant:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag (including an empty one) falls back to noun.
    """
    prefix_to_pos = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    return prefix_to_pos.get(treebank_tag[:1], wn.NOUN)

In [15]:
def lemmatization(sentences: list) -> list:
    """Lemmatize every token of every sentence.

    Each sentence is POS-tagged with ``pos_tag``; the tag of each word is
    converted with ``get_wordnet_pos`` and passed to ``WordNetLemmatizer`` so
    the word is lemmatized as the right part of speech.

    Parameters
    ----------
    sentences : list of list of str
        Tokenized sentences.

    Returns
    -------
    list of list of str
        Lemmatized tokens, with the same nesting and order as the input.
    """
    # Tag the words of every sentence up front.
    tagged_sentences = [pos_tag(tokens) for tokens in sentences]

    lemmatizer = WordNetLemmatizer()
    return [
        [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
        for tagged in tagged_sentences
    ]

In [16]:
# NLTK's English stop words, plus the empty string and a single space so that
# blank tokens are filtered out by the same membership test.
stop_words = set(stopwords.words('english')).union({'', ' '})

In [17]:
def del_stopwords(sentences: list) -> list:
    """Clean tokens, drop stop words and short tokens, and return the unique rest.

    Every token has the characters ``+ = tab CR LF , ; : * ' "`` removed. The
    cleaned token is kept only if it is longer than two characters and is not
    in the module-level ``stop_words`` set. Tokens from all sentences are
    merged into one collection without duplicates.

    Parameters
    ----------
    sentences : list of list of str
        Tokenized (and usually lemmatized) sentences.

    Returns
    -------
    list of str
        Unique surviving tokens, sorted so the result is reproducible from
        run to run.
    """
    kept = set()
    for sent in sentences:
        for word in sent:
            # Clean once and reuse the result for both checks.
            token = re.sub(r"[\+=\t\r\n,;:\*'\"]+", "", word)
            # The length check must look at the cleaned token: a raw token such
            # as "x==" is three characters long but cleans down to just "x".
            if len(token) > 2 and token not in stop_words:
                kept.add(token)
    return sorted(kept)

In [18]:
def preprocessing_text(text: str) -> list:
    """Run the full text pipeline on one document.

    Steps, in order: sentence segmentation, tokenization, lemmatization,
    stop-word removal.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        Whatever ``del_stopwords`` returns for the lemmatized sentences.
    """
    sentences = sentence_segmentation(text)
    tokens = tokenization(sentences)
    lemmas = lemmatization(tokens)
    return del_stopwords(lemmas)

In [20]:
# WordNet is loaded lazily on first use and that lazy loader is not thread-safe,
# so force the load in the main thread before worker threads start lemmatizing.
wn.ensure_loaded()
with futures.ThreadPoolExecutor() as executor:
    # executor.map yields results in input order; attaching data's own index makes
    # the assignment positional even if the frame's index is not 0..n-1.
    data['words_body'] = pd.Series(
        list(executor.map(preprocessing_text, data.body)), index=data.index
    )

In [21]:
# WordNet is loaded lazily on first use and that lazy loader is not thread-safe,
# so make sure it is loaded in the main thread before worker threads lemmatize.
wn.ensure_loaded()
with futures.ThreadPoolExecutor() as executor:
    # executor.map yields results in input order; attaching data's own index makes
    # the assignment positional even if the frame's index is not 0..n-1.
    data['words_title'] = pd.Series(
        list(executor.map(preprocessing_text, data.issue_title)), index=data.index
    )

In [22]:
def load_vectors(fname, limit):
  """Read word vectors from a text ``.vec`` file into a dict.

  The first line of the file is a header with two integers (number of vectors
  and their dimension). Every following line is a token followed by its
  space-separated float components.

  Parameters
  ----------
  fname : str
      Path of the vector file.
  limit : int or None
      Maximum number of vectors to read; ``None`` reads the whole file.

  Returns
  -------
  dict
      Maps each token to a 1-D ``numpy`` array with its components.
  """
  vectors = {}
  # The context manager closes the handle even if parsing fails part-way.
  with io.open(fname, 'r', encoding = 'utf-8', newline = '\n', errors = 'ignore') as fin:
    n, _ = map(int, fin.readline().split())
    # The progress bar cannot run past the number of vectors the header announces.
    total = n if limit is None else min(limit, n)
    for line in tqdm(islice(fin, limit), total = total):
      tokens = line.rstrip().split(' ')
      vectors[tokens[0]] = np.array(list(map(float, tokens[1:])))
  return vectors

# Load at most the first 1,000,000 vectors of the word-vector file into a
# token -> array dict.
vecs = load_vectors(path+'crawl-300d-2M.vec', 1_000_000)

100%|██████████| 1000000/1000000 [01:31<00:00, 10925.78it/s]


In [23]:
def dummy_fun(doc):
    """Identity function: return ``doc`` unchanged.

    Passed to ``TfidfVectorizer`` as both tokenizer and preprocessor, so the
    vectorizer uses the documents it is given as they are.
    """
    return doc

In [24]:
# The documents are already lists of tokens, so the vectorizers get identity
# functions for preprocessing and tokenizing.
# NOTE(review): token_pattern=None is presumably there to silence the warning
# about an unused token_pattern when a custom tokenizer is given — confirm.
tfidf_body = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
tfidf_body.fit(data.words_body)

# Separate vocabulary and IDF weights for issue titles.
tfidf_title = TfidfVectorizer(analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
tfidf_title.fit(data.words_title)

TfidfVectorizer(preprocessor=<function dummy_fun at 0x7f0ae89edb00>,
                token_pattern=None,
                tokenizer=<function dummy_fun at 0x7f0ae89edb00>)

In [25]:
def vectorization_body(words: list):
    """Return the dense TF-IDF vector of one body token list.

    ``words`` is transformed as a single document with the fitted
    ``tfidf_body`` vectorizer; the one-row result is converted to a dense
    array and squeezed.
    """
    single_doc = tfidf_body.transform([words])
    dense = single_doc.toarray()
    return dense.squeeze()

def vectorization_title(words: list):
    """Return the dense TF-IDF vector of one title token list.

    ``words`` is transformed as a single document with the fitted
    ``tfidf_title`` vectorizer; the one-row result is converted to a dense
    array and squeezed.
    """
    single_doc = tfidf_title.transform([words])
    dense = single_doc.toarray()
    return dense.squeeze()

In [None]:
# Compute one TF-IDF vector per issue body.
# NOTE(review): vectorization_body() calls .toarray(), so every row stores a
# dense array with one entry per vocabulary term; memory grows as
# n_documents x vocabulary_size.
# The pd.Series carries a fresh 0..n-1 index, so the assignment relies on
# `data` having the same index.
with futures.ThreadPoolExecutor() as executor:
    data['vectors_body'] = pd.Series(executor.map(vectorization_body, data.words_body))

In [None]:
# Compute one TF-IDF vector per issue title.
# NOTE(review): vectorization_title() calls .toarray(), so every row stores a
# dense array with one entry per vocabulary term; memory grows as
# n_documents x vocabulary_size.
# The pd.Series carries a fresh 0..n-1 index, so the assignment relies on
# `data` having the same index.
with futures.ThreadPoolExecutor() as executor:
    data['vectors_title'] = pd.Series(executor.map(vectorization_title, data.words_title))

In [None]:
# Width of the embedding matrices built below.
# NOTE(review): assumed to equal the dimension of the loaded vectors — confirm.
dim = 300
# Despite its name, `zero` is the element-wise mean of all loaded vectors; it is
# used as the fallback embedding for tokens that are missing from `vecs`.
zero = sum(vecs.values()) / len(vecs)

In [None]:
# Embedding table for body tokens: the row at a token's TF-IDF column index
# holds that token's word vector.
vocab_body = np.zeros((len(tfidf_body.vocabulary_), dim))
for key, row in tqdm(tfidf_body.vocabulary_.items()):
    # Tokens missing from `vecs` fall back to `zero`.
    vocab_body[row] = vecs.get(key, zero)

In [None]:
# Embedding table for title tokens: the row at a token's TF-IDF column index
# holds that token's word vector.
vocab_title = np.zeros((len(tfidf_title.vocabulary_), dim))
for key, row in tqdm(tfidf_title.vocabulary_.items()):
    # Tokens missing from `vecs` fall back to `zero`.
    vocab_title[row] = vecs.get(key, zero)

In [None]:
# Document embedding = TF-IDF weights (n_docs x vocab) times the embedding table
# (vocab x dim). The TF-IDF matrix is taken straight from the vectorizer in its
# sparse form, so no dense n_docs x vocab array is ever built.
# Reading from the token columns (not from vectors_*) also makes this cell safe
# to re-run: it no longer consumes the columns it overwrites.
# Results match the dense product up to floating-point summation order.
data['vectors_body'] = tfidf_body.transform(data.words_body).dot(vocab_body).tolist()
data['vectors_title'] = tfidf_title.transform(data.words_title).dot(vocab_title).tolist()

In [None]:
# Persist the whole frame, including the columns added above, as a pickle file
# named data_temp under `path`.
data.to_pickle(path+'data_temp')