In [1]:
import nltk
import gensim
import spacy
import random
import requests
import pandas              as pd
import numpy               as np
import gensim.downloader   as api
from collections           import Counter, deque
from itertools             import combinations, chain, permutations
from time                  import perf_counter
from functools             import wraps, singledispatch
from gensim.models         import KeyedVectors
from gensim.utils          import simple_preprocess
from dataclasses           import dataclass
from bs4                   import BeautifulSoup

In [3]:
pl_or_en = "pl"

In [4]:
if pl_or_en == "pl":
    nlp = spacy.load("pl_core_news_sm", exclude=["ner"])
    print("spaCy loaded")
    word2vec = KeyedVectors.load("word2vec_100_3_polish.bin")
    print("word2vec loaded")
    
else:
    word2vec = api.load("glove-wiki-gigaword-100")
    print("word2vec loaded")
    nlp = spacy.load("en_core_web_sm", exclude=["ner"])
    print("spaCy loaded")

spaCy loaded
word2vec loaded


In [9]:
@dataclass
class SentenceSeries:
    ds:pd.Series
    
    def __post_init__(self):
        ds_name = self.ds.name
        self.pre_sentences = self.ds.apply(lambda row: " ".join([x for x in row.split() if str(x[0]) !="#"]))
        self.pre_sentences = pd.Series(nlp.pipe(self.ds.to_list())) #spacy pipeline 
        self.sentences = (self.pre_sentences.apply(lambda row: [*row.sents]) #populate series with lists of sentences
                                           .apply(lambda row: [[w.lemma_ for w in list(filter(lambda y: y.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"}, x)) 
                                                                if SentenceSeries.lemma_criteria(w.lemma_)] for x in row]))
        self.sentences.columns = [ds_name]
        self.all_sentences = self.sentences.explode()
#         self.proper_nouns = self.pre_sentences.apply(lambda row: [x for x in row if x.dep_ =="nsubj"])
        
    
    @staticmethod
    def lemma_criteria(lemma):
        return (len(str(lemma)) > 2
                and len(str(lemma)) < 20
                and str(lemma).isalnum()
                and str(lemma) not in Corpus.stopwords)

In [10]:
class Word:
    
    def __init__(self, word):
        self.word = word
        
    def __call__(self):
        try:
            synonyms = [x[0] for x in word2vec.similar_by_word(self.word, topn=20)]
            print("word2vec synonyms:", synonyms)
            print("-"*50)
        except KeyError:
            synonyms = [self.word]    
        
#         synonyms = {*synonyms, *self.scrape_synonyms(), self.word}
        synonyms = {*self.scrape_synonyms(), self.word}
        synonyms = {x for x in synonyms if len(x) > 2 and x.isalnum() and x not in Corpus.stopwords and x!="kupa"}
        
        for x in sorted(synonyms):
            print(x)
        print("-"*50)
        return {self.word : synonyms}
    
    def synonyms_scraper(self, lang=pl_or_en):
        '''
        API calls for more synonyms/contextually related phrases - synonyms.reverso.net
        '''
        USERAGENTS = ("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        )

        ua = USERAGENTS[random.randrange(len(USERAGENTS))]
        headers = {'user-agent': ua}
        url = f"https://synonyms.reverso.net/synonim/{lang}/{self.word}"
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        synonyms = [x.text for x in soup.select("a.synonym")][:16]
       
        return synonyms
    
    def scrape_synonyms(self, second_lang="en"):
        synonyms = self.synonyms_scraper()
        if len(synonyms) == 0:
            synonyms = self.synonyms_scraper(lang=second_lang)
        print("Scraped synonyms:", synonyms)
        print("-"*50)
        return synonyms
    

In [59]:
class Corpus:
    
    stopwords = {x.split("\n")[0] for x in open(f"stopwords_{pl_or_en}.txt")}
    
    def __init__(self, data,*, textcol="Treść wypowiedzi"):
        self.data = data
        self.textcol = textcol
        self.data = self.data.fillna("0")\
                             .drop_duplicates(subset=textcol)\
                             .reset_index(drop=True)
                
        if "Rodzaj wzmianki" in self.data.columns:
            self.data = self.data[~self.data["Rodzaj wzmianki"].isin(["Artykuł"])].reset_index(drop=True)
        
    @classmethod
    def from_csv(cls, filename, sample=1000):
        try:
            df = pd.read_csv(f"{filename}.csv", sep=";", nrows=sample)
        except pd.io.parsers.ParserError:
            df = pd.read_csv(f"{filename}.csv", nrows=sample)
        return Corpus(df)
    
    @classmethod
    def from_xlsx(cls, filename, sample=1000):
        df = pd.read_excel(f"~/desktop/{filename}.xlsx", nrows=sample, engine='openpyxl')
        return Corpus(df)
        
    def __repr__(self):
        return f"A dataset of {len(self.data)} records"
    
    def timer(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            s = perf_counter()
            func = fn(*args, **kwargs)
            e = perf_counter()
            print(f"{fn.__name__} ran in {e-s: .2f}s")
            return func
        return wrapper
    
    @timer
    def create_lemmata(self):
        sentence_object = SentenceSeries(self.data[self.textcol])
        self.sents = sentence_object.sentences
        self.all_sents = sentence_object.all_sentences.fillna("None")
        self.raw_sentences = pd.Series(sentence_object.pre_sentences.apply(lambda row: [*row.sents]).values)

    @property
    def top_words(self):
        return Counter(chain.from_iterable(self.lemmata.values)).most_common(100)
    
    def top_ngrams(self, nfunc):
        '''
         c.top_ngrams(nltk.bigrams)
         c.top_ngrams(nltk.trigrams)
        '''
        all_grams = chain.from_iterable(self.lemmata.apply(lambda row : [*nfunc(row)]).values)
        return Counter(all_grams).most_common(100)
    
    def filter_similar(self, word):
        filtered = self.lemmata[self.lemmata.apply(lambda x : word in x)]
        return {x[0] for x in Counter(chain.from_iterable(filtered.values)).most_common(10)}
    
    @timer
    def associative_search(self, *words, bigram=False, to_excel=False):
        all_sents_df = pd.DataFrame(self.all_sents)
        all_sents_df.columns = [self.textcol]
        associated_words = {*chain.from_iterable([list(Word(word)().values())[0] for word in words])}
        containing_df = all_sents_df[all_sents_df[self.textcol].apply(lambda x:  any([str(y).lower() in associated_words for y in x]))]
        positions = [*set(containing_df.index)]
        recombined_df = pd.DataFrame(containing_df.groupby(containing_df.index)[self.textcol].apply(lambda x:','.join(x.astype(str))))
        recombined_df["tag"] = recombined_df[self.textcol]
        del recombined_df[self.textcol]
        recombined_df = pd.concat([self.data.take(positions), recombined_df], axis=1)
        recombined_df["tag"] = recombined_df["tag"].apply(lambda row: row.lower())\
                                                   .apply(lambda row: [x for x in associated_words if x in row])
        
        if to_excel:
            recombined_df.to_excel(f"{'_'.join(words)}_associated_search_res.xlsx")
    
        df = pd.concat([recombined_df, self.sents.apply(lambda x: [*chain.from_iterable(x)])], axis=1)
        df["pff"] = df[0]
        

        return df
     

In [60]:
c = Corpus.from_csv("alle2", sample=1000)

In [61]:
c.create_lemmata()

create_lemmata ran in  16.19s


In [89]:
p = c.associative_search("kwota", "prowizja", bigram=False, to_excel=False)

word2vec synonyms: ['suma', 'prowizja', 'sum', 'sumka', 'zaliczka', 'honorarium', 'wpłata', 'należności', 'wynagrodzenie', 'opłata', 'odsetka', 'należność', 'wypłata', 'kredyt', 'dywidenda', 'wydatek', 'pożyczka', 'premia', 'gotówka', 'dopłata']
--------------------------------------------------
Scraped synonyms: ['wielkość', 'wartość', 'suma', 'liczba', 'ilość', 'wysokość', 'stawka', 'przydział', 'cyfra', 'parytet', 'objętość', 'zawartość', 'dawka', 'numer', 'stopień', 'dotacja']
--------------------------------------------------
cyfra
dotacja
kwota
numer
objętość
parytet
przydział
stawka
stopień
wartość
wielkość
wysokość
zawartość
--------------------------------------------------
word2vec synonyms: ['kwota', 'zaliczka', 'należności', 'honorarium', 'opłata', 'należność', 'wynagrodzenie', 'pożyczka', 'premia', 'dywidenda', 'subsydium', 'zniżka', 'suma', 'kredyt', 'pensja', 'łapówka', 'gotówka', 'wpisowe', 'ryczałt', 'czynsz']
--------------------------------------------------
Scraped 

In [93]:
pp

Unnamed: 0,tag,pff
4,[wysokość],"[pieniężną, wysokość, karta, podarunkowy, wyko..."
13,[wysokość],"[okej, stojak, ławke, stojak, składany, wysoko..."
16,[zapłata],"[handel, roślina, opłacone, zamówienie, odebra..."
17,[wartość],"[loteria, promocyjny, karta, podarunkowy, wyko..."
27,[kwota],"[allegro, olx, zainteresowany, zakup, sklep, k..."
52,"[prowizja, kwota]","[micun, wyglada, praca, kurierka, pracować, wo..."
58,[numer],"[założony, regulator, wersja, 163km, Kupa, num..."
62,[kwota],"[spóźnić, kupować, 2060s, śledzić, allegro, ko..."
120,[numer],"[szy, podanie, numer, komórkowy, odbierać, wys..."
121,[wartość],"[tomasz3, niunia, hala, halo, lud, kasować, ko..."
