## Prerequisites

*Imports*

In [7]:
import math

In [11]:
import spacy

In [12]:
import nltk

In [13]:
from nltk.corpus import stopwords

## Model Expert AO Class

In [14]:
# Fetch the NLTK 'stopwords' corpus; ModelExpertAO.nltk_stopwords_ro reads its 'romanian' word list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bogdan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Install the spaCy package "ro_core_news_sm" that the later spacy.load("ro_core_news_sm") call needs.
# NOTE(review): `!python` may resolve to a different interpreter than the kernel's - confirm.
!python -m spacy download ro_core_news_sm

Collecting ro-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.4.0/ro_core_news_sm-3.4.0-py3-none-any.whl (12.9 MB)
[+] Download and installation successful
You can now load the package via spacy.load('ro_core_news_sm')


You should consider upgrading via the 'C:\Misc\GIT\DLNLPProject\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [15]:
# Load the "ro_core_news_sm" pipeline once; ModelExpertAO.get_lemma reads this global `nlp`.
nlp = spacy.load("ro_core_news_sm")

In [16]:
# ======================================================================================================================
# Class - ModelExpertAO
# ======================================================================================================================
class ModelExpertAO:
    """Stateless helpers for TF-IDF scoring and simple text preprocessing.

    Every method is a ``@staticmethod``. ``get_lemma`` reads the module-level
    spaCy pipeline ``nlp`` and ``nltk_stopwords_ro`` reads the NLTK
    ``stopwords`` corpus, so both must be available before those two methods
    are called. The TF-IDF helpers only need ``math``.
    """

    @staticmethod
    def get_TF_v2(bag_of_words, words_freq_dict):
        """Return the term frequency of every word in ``words_freq_dict``.

        Args:
            bag_of_words: The document as a sequence of words; only its
                length is used (as the denominator).
            words_freq_dict: Mapping ``word -> raw count`` for that document.

        Returns:
            dict mapping each word to ``count / len(bag_of_words)``. For an
            empty document every word gets ``0.0`` instead of raising
            ``ZeroDivisionError``.
        """
        words_count = len(bag_of_words)
        if words_count == 0:
            # An empty document contains no term at all, so every TF is 0.
            return {w: 0.0 for w in words_freq_dict}
        return {w: frq / float(words_count) for w, frq in words_freq_dict.items()}

    @staticmethod
    def get_IDF_v2(words_freq_dict_list):
        """Return the inverse document frequency over a list of count dicts.

        Args:
            words_freq_dict_list: One ``word -> raw count`` mapping per
                document. The mappings do not need to share the same keys.

        Returns:
            dict mapping each word seen in any mapping to
            ``log(number_of_documents / documents_containing_word)``.
            A word whose count is 0 in every document gets ``0.0`` (its TF is
            0 everywhere, so its TF-IDF is 0 either way). An empty list
            yields ``{}``.
        """
        docs_count = len(words_freq_dict_list)
        # Document frequency, keyed in first-seen order so that dicts sharing
        # one vocabulary keep the key order of the first dict.
        doc_freq = {}
        for freq_dict in words_freq_dict_list:
            for w, frq in freq_dict.items():
                doc_freq[w] = doc_freq.get(w, 0) + (1 if frq > 0 else 0)
        return {
            w: math.log(docs_count / float(df)) if df else 0.0
            for w, df in doc_freq.items()
        }

    @staticmethod
    def get_TF_IDF_v2(tf, idf):
        """Multiply each term frequency by the matching IDF weight.

        Args:
            tf: Mapping ``word -> term frequency`` for one document.
            idf: Mapping ``word -> inverse document frequency``.

        Returns:
            dict mapping every word of ``tf`` to ``tf[word] * idf[word]``.

        Raises:
            KeyError: If a word of ``tf`` is missing from ``idf``.
        """
        return {w: frq * idf[w] for w, frq in tf.items()}

    @staticmethod
    def get_TF_IDF_of_sanitized_words(docs_list: list, verbose: bool = True):
        """Compute TF-IDF for every document of an already tokenized corpus.

        Args:
            docs_list: List of documents, each a list of sanitized words.
            verbose: When True (the default, matching the historical
                behavior) print the input, the vocabulary, the per-document
                counts, and the TF, IDF and TF-IDF tables.

        Returns:
            list with one ``word -> tf-idf`` dict per document, each keyed by
            the full vocabulary of the corpus. An empty ``docs_list`` yields
            ``[]``.
        """
        if verbose:
            print(f"\n{docs_list}\n")
        all_words = set().union(*docs_list)
        if verbose:
            print(f"\n{all_words}\n")

        # Raw counts per document over the shared vocabulary.
        words_count_in_docs = []
        for i, doc in enumerate(docs_list):
            counts = dict.fromkeys(all_words, 0)
            for w in doc:
                counts[w] += 1
            words_count_in_docs.append(counts)
            if verbose:
                print(f"\n== Document-{i} ==\n{counts}\n")

        # TF
        all_TF = [
            ModelExpertAO.get_TF_v2(doc, counts)
            for doc, counts in zip(docs_list, words_count_in_docs)
        ]
        if verbose:
            print(f"\n== TF ==\n{all_TF}\n")

        # IDF
        IDF = ModelExpertAO.get_IDF_v2(words_count_in_docs)
        if verbose:
            print(f"\n== IDF ==\n{IDF}\n")

        # TF-IDF
        all_TF_IDF = [ModelExpertAO.get_TF_IDF_v2(tf, IDF) for tf in all_TF]
        if verbose:
            print(f"\n== TF-IDF ==\n{all_TF_IDF}\n")
        return all_TF_IDF

    @staticmethod
    def get_lemma(text):
        """Return ``text`` with every token replaced by its lemma.

        The text is run through the module-level ``nlp`` pipeline and the
        token lemmas are joined with single spaces.

        Args:
            text: The text to lemmatize.

        Returns:
            The space-joined lemmas, or ``text`` unchanged if processing
            fails (for example on unsupported input).
        """
        try:
            return ' '.join(str(token.lemma_) for token in nlp(text))
        except Exception:
            # Deliberate best-effort fallback, but KeyboardInterrupt and
            # SystemExit are no longer swallowed as a bare ``except`` did.
            return text

    @staticmethod
    def nltk_stopwords_ro(phrases):
        """Drop the words found in NLTK's 'romanian' stopword list.

        Args:
            phrases: Iterable of words (strings) to filter.

        Returns:
            list of the words from ``phrases`` that are not in the stopword
            list, in their original order.
        """
        # A set gives O(1) membership tests; the local name also avoids
        # shadowing the ``stopwords`` name imported at the top of the notebook.
        romanian_stopwords = set(nltk.corpus.stopwords.words('romanian'))
        return [w for w in phrases if w not in romanian_stopwords]

In [20]:
# Demo: TF-IDF over a tiny, already tokenized corpus.
# `docs` is a list of documents; each inner list is one document's bag of sanitized words.
# Later cells reuse `docs` for the lemma and stopword demos.
docs = [
    ['ce', 'fain', 'e', 'afara'],
    ['ce', 'e', 'aceasta']
]
ModelExpertAO.get_TF_IDF_of_sanitized_words(docs)


[['ce', 'fain', 'e', 'afara'], ['ce', 'e', 'aceasta']]


{'e', 'ce', 'aceasta', 'afara', 'fain'}


== Document-0 ==
{'e': 1, 'ce': 1, 'aceasta': 0, 'afara': 1, 'fain': 1}


== Document-1 ==
{'e': 1, 'ce': 1, 'aceasta': 1, 'afara': 0, 'fain': 0}


== TF ==
[{'e': 0.25, 'ce': 0.25, 'aceasta': 0.0, 'afara': 0.25, 'fain': 0.25}, {'e': 0.3333333333333333, 'ce': 0.3333333333333333, 'aceasta': 0.3333333333333333, 'afara': 0.0, 'fain': 0.0}]


== IDF ==
{'e': 0.0, 'ce': 0.0, 'aceasta': 0.6931471805599453, 'afara': 0.6931471805599453, 'fain': 0.6931471805599453}


== TF-IDF ==
[{'e': 0.0, 'ce': 0.0, 'aceasta': 0.0, 'afara': 0.17328679513998632, 'fain': 0.17328679513998632}, {'e': 0.0, 'ce': 0.0, 'aceasta': 0.23104906018664842, 'afara': 0.0, 'fain': 0.0}]



[{'e': 0.0,
  'ce': 0.0,
  'aceasta': 0.0,
  'afara': 0.17328679513998632,
  'fain': 0.17328679513998632},
 {'e': 0.0,
  'ce': 0.0,
  'aceasta': 0.23104906018664842,
  'afara': 0.0,
  'fain': 0.0}]

In [21]:
# Print the lemma of every word of every document, one lemma per line.
for doc_words in docs:
    for word in doc_words:
        print(ModelExpertAO.get_lemma(word))

ce
fain
fi
afara
ce
fi
acesta


In [22]:
# Print each document with the Romanian stopwords filtered out.
for doc_words in docs:
    print(ModelExpertAO.nltk_stopwords_ro(doc_words))

['fain', 'afara']
[]
