In [1]:
import random
from collections           import Counter, deque
from dataclasses           import dataclass
from functools             import wraps, singledispatch
from itertools             import combinations, chain, permutations
from time                  import perf_counter
from urllib.parse          import quote

import gensim
import gensim.downloader   as api
import nltk
import numpy               as np
import pandas              as pd
import requests
import spacy
from bs4                   import BeautifulSoup
from gensim.models         import KeyedVectors
from gensim.utils          import simple_preprocess

In [2]:
# Language switch for the whole notebook: "pl" selects the Polish models,
# any other value makes the loading cell below take the English branch.
pl_or_en = "pl"

In [3]:
# Load the spaCy pipeline (`nlp`) and the word vectors (`word2vec`) for the
# language chosen in `pl_or_en`. The NER component is excluded in both cases.
if pl_or_en != "pl":
    # Non-Polish setup: vectors fetched through gensim's downloader first,
    # then the small English spaCy pipeline.
    word2vec = api.load("glove-wiki-gigaword-100")
    print("word2vec loaded")
    nlp = spacy.load("en_core_web_sm", exclude=["ner"])
    print("spaCy loaded")
else:
    # Polish setup: small Polish spaCy pipeline first, then vectors read
    # from a local file in the working directory.
    nlp = spacy.load("pl_core_news_sm", exclude=["ner"])
    print("spaCy loaded")
    word2vec = KeyedVectors.load("word2vec_100_3_polish.bin")
    print("word2vec loaded")



spaCy loaded
word2vec loaded


In [4]:
@dataclass
class SentenceSeries:
    """Parse a Series of raw texts with spaCy and derive lemmatised sentences.

    Attributes set by ``__post_init__``:
        pre_sentences: Series of parsed documents, one per input text, with a
            fresh 0..n-1 index. Tokens starting with ``#`` are removed from
            the text before it is parsed.
        sentences: Series named after ``ds`` where each row is a list of
            sentences and each sentence is a list of lemmas that passed
            ``lemma_criteria``.
        all_sentences: ``sentences`` exploded to one sentence per row
            (a text without sentences yields a missing value).
    """
    ds: pd.Series

    # Only tokens tagged with one of these parts of speech contribute lemmas.
    CONTENT_POS = frozenset({"NOUN", "PROPN", "VERB", "ADJ"})

    def __post_init__(self):
        # Strip whitespace-separated tokens that begin with "#" and feed the
        # *cleaned* text to spaCy. Previously the cleaned text was computed
        # and then discarded, so "#"-prefixed tokens still reached the parser.
        cleaned = self.ds.apply(
            lambda text: " ".join(tok for tok in text.split() if not tok.startswith("#"))
        )
        self.pre_sentences = pd.Series(list(nlp.pipe(cleaned.to_list())), dtype=object)
        self.sentences = self.pre_sentences.apply(
            lambda doc: [self._content_lemmas(sent) for sent in doc.sents]
        )
        # A Series has no `columns`; carrying the source name over is done via `name`.
        self.sentences.name = self.ds.name
        self.all_sentences = self.sentences.explode()

    @classmethod
    def _content_lemmas(cls, sentence):
        """Return the lemmas of the content-word tokens in one sentence that pass ``lemma_criteria``."""
        return [tok.lemma_ for tok in sentence
                if tok.pos_ in cls.CONTENT_POS and cls.lemma_criteria(tok.lemma_)]

    @staticmethod
    def lemma_criteria(lemma):
        """Return True for an alphanumeric lemma of 3-19 characters that is not in ``Corpus.stopwords``."""
        text = str(lemma)
        return (2 < len(text) < 20
                and text.isalnum()
                and text not in Corpus.stopwords)

In [5]:
class Word:
    """A query word plus helpers that collect related words for it.

    Calling an instance prints diagnostic output and returns
    ``{word: set_of_related_words}``.
    """

    # Words removed from every result set regardless of where they came from.
    EXCLUDED_SYNONYMS = frozenset({"kupa"})

    # Seconds to wait for synonyms.reverso.net before giving up.
    REQUEST_TIMEOUT = 10

    # One of these is picked at random for each request.
    USERAGENTS = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    )

    def __init__(self, word):
        self.word = word

    def __call__(self):
        """Collect related words for ``self.word``.

        The word2vec neighbours are printed for inspection only; the returned
        set is built from the scraped synonyms plus the word itself, keeping
        entries that are longer than two characters, alphanumeric, not in
        ``Corpus.stopwords`` and not in ``EXCLUDED_SYNONYMS``.

        Returns:
            dict mapping ``self.word`` to the set of accepted related words.
        """
        try:
            neighbours = [hit[0] for hit in word2vec.similar_by_word(self.word, topn=20)]
        except KeyError:
            # The word is not in the embedding vocabulary: nothing to show.
            pass
        else:
            print("word2vec synonyms:", neighbours)
            print("-"*50)

        candidates = {*self.scrape_synonyms(), self.word}
        synonyms = {
            candidate for candidate in candidates
            if len(candidate) > 2
            and candidate.isalnum()
            and candidate not in Corpus.stopwords
            and candidate not in self.EXCLUDED_SYNONYMS
        }

        for synonym in sorted(synonyms):
            print(synonym)
        print("-"*50)
        return {self.word: synonyms}

    def synonyms_scraper(self, lang=pl_or_en):
        '''
        API calls for more synonyms/contextually related phrases - synonyms.reverso.net

        Fetches the page for ``self.word`` in language ``lang`` and returns the
        text of at most the first 16 ``a.synonym`` elements (an empty list
        when the page contains none). The HTTP status code is not inspected.

        Raises:
            requests.RequestException: on connection problems or when the
                server does not answer within ``REQUEST_TIMEOUT`` seconds.
        '''
        headers = {'user-agent': random.choice(self.USERAGENTS)}
        # Percent-encode the word so characters such as "/", "?" or "#" cannot
        # change the structure of the URL.
        url = f"https://synonyms.reverso.net/synonim/{lang}/{quote(self.word, safe='')}"
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")
        return [anchor.text for anchor in soup.select("a.synonym")][:16]

    def scrape_synonyms(self, second_lang="en"):
        """Scrape synonyms in the default language, retrying with ``second_lang`` when none are found."""
        synonyms = self.synonyms_scraper()
        if not synonyms:
            synonyms = self.synonyms_scraper(lang=second_lang)
        print("Scraped synonyms:", synonyms)
        print("-"*50)
        return synonyms
    

In [29]:
class Corpus:
    """A table of text records with lemmatisation and associative search.

    Typical use: build with ``from_csv`` / ``from_xlsx``, call
    ``create_lemmata()`` once, then use ``top_words``, ``top_ngrams``,
    ``filter_similar`` or ``associative_search``.
    """

    # Column that holds the text when the caller does not name one.
    DEFAULT_TEXTCOL = "Treść wypowiedzi"

    # Stop words for the configured language, one per line in the file.
    # The file is read with an explicit encoding and closed right away.
    with open(f"stopwords_{pl_or_en}.txt", encoding="utf-8-sig") as _stopword_file:
        stopwords = {line.strip() for line in _stopword_file if line.strip()}
    del _stopword_file

    def __init__(self, data, *, textcol=DEFAULT_TEXTCOL):
        """Store a cleaned copy of ``data``.

        Missing values are replaced with the string "0", rows with a duplicate
        ``textcol`` value are dropped, and, when a "Rodzaj wzmianki" column
        exists, rows whose value there is "Artykuł" are removed.
        """
        self.textcol = textcol
        self.data = (data.fillna("0")
                         .drop_duplicates(subset=textcol)
                         .reset_index(drop=True))

        if "Rodzaj wzmianki" in self.data.columns:
            is_article = self.data["Rodzaj wzmianki"].isin(["Artykuł"])
            self.data = self.data[~is_article].reset_index(drop=True)

    @classmethod
    def from_csv(cls, filename, sample=1000, *, textcol=DEFAULT_TEXTCOL):
        """Build a corpus from the first ``sample`` rows of ``<filename>.csv``.

        A semicolon separator is tried first; if parsing fails, the file is
        re-read with pandas' default separator.
        """
        path = f"{filename}.csv"
        try:
            df = pd.read_csv(path, sep=";", nrows=sample)
        except pd.errors.ParserError:  # public location of the parser exception
            df = pd.read_csv(path, nrows=sample)
        return cls(df, textcol=textcol)

    @classmethod
    def from_xlsx(cls, filename, sample=1000, *, directory="~/desktop", textcol=DEFAULT_TEXTCOL):
        """Build a corpus from the first ``sample`` rows of ``<directory>/<filename>.xlsx``."""
        df = pd.read_excel(f"{directory}/{filename}.xlsx", nrows=sample, engine='openpyxl')
        return cls(df, textcol=textcol)

    def __repr__(self):
        return f"A dataset of {len(self.data)} records"

    def timer(fn):
        """Decorator used inside this class body: print how long ``fn`` took, then return its result."""
        @wraps(fn)
        def wrapper(*args, **kwargs):
            started = perf_counter()
            result = fn(*args, **kwargs)
            finished = perf_counter()
            print(f"{fn.__name__} ran in {finished-started: .2f}s")
            return result
        return wrapper

    @timer
    def create_lemmata(self):
        """Parse the text column and cache the derived series on the instance.

        Sets:
            sents: per record, a list of sentences (each a list of lemmas).
            all_sents: one sentence per row; missing values become "None".
            raw_sentences: per record, the list of parsed sentence spans.
            lemmata: per record, one flat list of all its lemmas.
        """
        sentence_object = SentenceSeries(self.data[self.textcol])
        self.sents = sentence_object.sentences
        self.all_sents = sentence_object.all_sentences.fillna("None")
        self.raw_sentences = pd.Series(sentence_object.pre_sentences.apply(lambda row: [*row.sents]).values)
        # The word/n-gram statistics below read `self.lemmata`, which was
        # never assigned before; derive it by flattening each record's sentences.
        self.lemmata = self.sents.apply(lambda record: [*chain.from_iterable(record)])

    @property
    def top_words(self):
        """The 100 most common lemmas as ``(lemma, count)`` pairs. Requires ``create_lemmata()``."""
        return Counter(chain.from_iterable(self.lemmata.values)).most_common(100)

    def top_ngrams(self, nfunc):
        '''
         c.top_ngrams(nltk.bigrams)
         c.top_ngrams(nltk.trigrams)

        Apply ``nfunc`` to each record's lemma list and return the 100 most
        common resulting items. Requires ``create_lemmata()``.
        '''
        all_grams = chain.from_iterable(self.lemmata.apply(lambda row: [*nfunc(row)]).values)
        return Counter(all_grams).most_common(100)

    def filter_similar(self, word):
        """Return the 10 most common lemmas among records whose lemma list contains ``word``."""
        filtered = self.lemmata[self.lemmata.apply(lambda lemmas: word in lemmas)]
        return {lemma for lemma, _ in Counter(chain.from_iterable(filtered.values)).most_common(10)}

    @timer
    def associative_search(self, *words, bigram=False, to_excel=False):
        '''
        Return the records that contain any of ``words`` or their related words.

        Related words come from calling ``Word(word)()`` for each query word.
        A record matches when the lower-cased form of one of its lemmas equals
        a related word. The result is ``self.data`` restricted to the matching
        records, in ascending row order, with an extra "tag" column holding the
        sorted list of related words found in that record.

        ``bigram`` is accepted but currently not used. With ``to_excel=True``
        the result is also written to
        ``<words joined by "_">_associated_search_res.xlsx``.
        Requires ``create_lemmata()``.
        '''
        associated_words = {*chain.from_iterable(
            list(Word(word)().values())[0] for word in words
        )}

        # Row label -> related words seen in any sentence of that record.
        tags_by_row = {}
        for row_label, sentence in self.all_sents.items():
            if isinstance(sentence, str):
                # Placeholder for a record without sentences.
                continue
            found = {str(lemma).lower() for lemma in sentence} & associated_words
            if found:
                tags_by_row.setdefault(row_label, set()).update(found)

        # Sorting makes both the row order and each tag list deterministic.
        positions = sorted(tags_by_row)
        recombined_df = self.data.take(positions).copy()
        recombined_df["tag"] = pd.Series(
            [sorted(tags_by_row[position]) for position in positions],
            index=recombined_df.index,
            dtype=object,
        )

        if to_excel:
            recombined_df.to_excel(f"{'_'.join(words)}_associated_search_res.xlsx")

        return recombined_df
     

In [31]:
# Build the corpus from leroy.csv, reading at most the first 1000 rows.
c = Corpus.from_csv("leroy", sample=1000)

In [32]:
# Parse the text column and cache the lemmatised sentences on the corpus
# (prints its own run time).
c.create_lemmata()

create_lemmata ran in  10.97s


In [40]:
# Search for records related to "szafka"; the synonym lookup performs a web
# request and prints the words it collected.
result = c.associative_search("szafka", bigram=False, to_excel=False)

word2vec synonyms: ['szafa', 'komoda', 'kredens', 'szuflada', 'skrzynka', 'schowek', 'szafeczka', 'półka', 'stojak', 'lodówka', 'komódka', 'toaletka', 'półeczka', 'kasetka', 'przegródka', 'skrzyneczka', 'regał', 'bieliźniarka', 'pudełko', 'wieszak']
--------------------------------------------------
Scraped synonyms: ['schowek', 'kredens', 'gabinet', 'szafy', 'szafie', 'garderoba', 'szafę', 'szuflada', 'komodzie', 'klozecie', 'firma', 'biuro', 'magazyn', 'skrytka', 'praktyka', 'klinika']
--------------------------------------------------
biuro
gabinet
garderoba
klinika
klozecie
komodzie
kredens
magazyn
praktyka
schowek
skrytka
szafie
szafka
szafy
szafę
szuflada
--------------------------------------------------
associative_search ran in  0.61s


In [41]:
# Show each matched text next to the search words found in it.
result[["Treść wypowiedzi", 'tag']]

Unnamed: 0,Treść wypowiedzi,tag
33,Post 17. Dzisiaj napiszę o powstawaniu mebli kuchennych :) 🧑‍🍳👩‍🍳. Zrobiliśmy konstrukcje- boki ...,[szuflada]
40,"Zdecydowanie odradzam, meble źle wymierzoneniedopasowane, szary między szafkami dolna i górną a ...",[szafka]
155,Dzień dobry.\n\nDzisiaj mam dla Was magiczną sypialnię w której króluje tapeta od UbierzSwojeŚci...,[szafka]
182,"Witam,\n\nDzisiaj mam dla Was przepiękna wizualizację sypialni przy użyciu tapety od UbierzSwoje...","[szafka, garderoba]"
226,"Witajcie w Naszej bajce 😁💥\n\nDługo zwlekałam, żeby założyć ten profil,ale w końcu się udało! Od...",[praktyka]
292,Dobry Wieczór Obecnym!\n\nW dzisiejszym poście chce Wam przedstawić a raczej pochwalić się jak n...,[szafka]
300,🌎 Od 17 października będziecie mogli zobaczyć w CMWŁ wystawę „Ziemia (P)oddana”. Stanowi ona pod...,[magazyn]
405,Dawno takiego urwania głowy nie miałam i to nie związanego z moją pracą 😊 po nowym roku czekają ...,[szafka]
458,"Najlepiej meble na wymiar. Meble takie będą oczywiście droższe od mebli gotowych, ale kuchni nie...",[szafka]
526,Promocja Noworoczna 🥂\n⠀\nMieszkania w budynkach 6 i 7 są teraz tańsze nawet o 17 000 zł ⚠️\n⠀\n...,[biuro]
