In [None]:
import pandas as pd
import numpy as np
import json
import os


import sklearn
import sklearn.svm
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt

import datasets

import joblib
import utilsLib as utils
from dotenv import load_dotenv

In [None]:
load_dotenv()
HF_TOKEN = os.environ['HF_TOKEN']
DATASET_LINK = os.environ['DATASET_LINK']



In [None]:
# Load encodings
ENCODING_PATH = 'utils'
ENCODINGS_NAME = {
    "subcategory":"subcategoriesAliases.json",
    "category":"categoriesAliases.json",
    "label":"labelsAliases.json",
    "type":"typesAliases.json", 
}

# Ensemble Submodules paths
MODELS_PATH = "pretrained_models"
SVC_CATEGORIES_NAME = 'categorySVCWeights.pkl'
SVC_DESCRIPTION_NAME = 'DescriptionSVM.pkl'
SVM_PARAGRAPH_NAME = 'PenaltySVM_OOV.pkl'



In [None]:
encodings = dict()
for key in ENCODINGS_NAME:
    with open(f'{ENCODING_PATH}/{ENCODINGS_NAME[key]}', 'r') as file:
        encodings[key] = json.load(file)

In [5]:
datasetTotal = datasets.load_dataset(DATASET_LINK, token=HF_TOKEN)
datasetRaw = datasetTotal['validation'].to_pandas()

In [6]:
def baseInfoDataset(dataset, encodings, debug=False):
    subcategoriesColumn = np.zeros((dataset.shape[0], len(encodings['subcategory'].keys())), dtype=int)
    categoriesColumn = np.zeros((dataset.shape[0], len(encodings['category'].keys())), dtype=int)
    typesColumn = np.zeros((dataset.shape[0], len(encodings['type'].keys())), dtype=int)
    for i in range(dataset.shape[0]):
        try:
            subcategoriesColumn[i][encodings['subcategory'][dataset['subcategory'].iloc[i]]] = 1
        except KeyError:
            pass
        try:
            categoriesColumn[i][encodings['category'][dataset['category'].iloc[i]]] = 1
        except KeyError:
            pass
        try:
            typesColumn[i][encodings['type'][dataset['type'].iloc[i]]] = 1
        except KeyError:
            pass
    if debug:
        print(subcategoriesColumn.shape)
        print(categoriesColumn.shape)
        print(typesColumn.shape)
    
    return np.concatenate(
        (subcategoriesColumn,categoriesColumn,typesColumn),
        axis=1
    )

In [7]:
baseData = baseInfoDataset(datasetRaw, encodings)
print(baseData.shape)

(300, 133)


In [8]:
paragraphCache = {'data': None}

In [9]:
print(paragraphCache['data'])

None


In [12]:
def paragraphDataset(dataset, trainset, cache=paragraphCache):
    UNK_TOKEN = '<UNK>'
    target = dataset.copy()
    columsToEncode = ['type', 'subcategory']
    encoder = OneHotEncoder(handle_unknown='ignore')
    processor = utils.TextProcessor('utils/mostInfluentWords.json', 
                                            removed=['ˈ', 'also', 'since', 'many'],
                                            unkonwn=UNK_TOKEN)
   
    encoder.fit_transform(trainset[columsToEncode])
    encodingTS = encoder.transform(target[['type', 'subcategory']]).toarray().astype(float)
   
    if cache['data'] is None:
        paragraphs = utils.getText(target, lang='en', max_workers=10)
        cache['data'] = paragraphs
    else:
        paragraphs = cache['data']
    target['paragraph'] = paragraphs
   
    tfidfColumn = processor.process(target, column='paragraph')
    
    
    
    vocab = sorted(set(word for doc in target['paragraph'] for word in processor.tokenize(doc)))
    word_index = {word: idx for idx, word in enumerate(vocab)}
    word_index[UNK_TOKEN] = len(word_index)
    
    encodingTFIDF = np.array([processor.vectorize(doc_tfidf, word_index) for doc_tfidf in tfidfColumn])
    
    return np.hstack([encodingTFIDF, encodingTS])

In [13]:
paragraphData = paragraphDataset(datasetRaw, datasetTotal['train'].to_pandas())
print(paragraphData.shape)

100%|██████████| 439/439 [00:00<00:00, 94951.50it/s]
100%|██████████| 505/505 [00:00<00:00, 247531.09it/s]
100%|██████████| 428/428 [00:00<00:00, 197054.02it/s]
100%|██████████| 464/464 [00:00<00:00, 158662.73it/s]
100%|██████████| 520/520 [00:00<00:00, 226530.75it/s]
100%|██████████| 516/516 [00:00<00:00, 507446.86it/s]
100%|██████████| 511/511 [00:00<00:00, 600361.16it/s]
100%|██████████| 392/392 [00:00<?, ?it/s]
100%|██████████| 440/440 [00:00<00:00, 439927.00it/s]
100%|██████████| 413/413 [00:00<?, ?it/s]
100%|██████████| 586/586 [00:00<00:00, 160298.84it/s]
100%|██████████| 414/414 [00:00<?, ?it/s]
100%|██████████| 489/489 [00:00<?, ?it/s]
100%|██████████| 548/548 [00:00<?, ?it/s]
100%|██████████| 435/435 [00:00<?, ?it/s]
100%|██████████| 449/449 [00:00<00:00, 444370.57it/s]
100%|██████████| 453/453 [00:00<?, ?it/s]
100%|██████████| 364/364 [00:00<00:00, 490751.09it/s]
100%|██████████| 532/532 [00:00<?, ?it/s]
100%|[32m██████████[0m| 300/300 [00:00<00:00, 865.77it/s] 


(300, 5310)


In [14]:
def descriptionDataset(dataset, trainset):
    UNK_TOKEN = '<UNK>'
    target = dataset.copy()
    columsToEncode = ['type', 'subcategory']
    encoder = OneHotEncoder(handle_unknown='ignore')
    processor = utils.TextProcessor(None, removed=['ˈ', 'also', 'since', 'many'],
                                    unkonwn=UNK_TOKEN)
   
    encoder.fit_transform(trainset[columsToEncode])
    encodingTS = encoder.transform(target[['type', 'subcategory']]).toarray().astype(float)
   
    tfidfColumn = processor.process(target, column='description')
    
    vocab = sorted(set(word for doc in target['description'] for word in processor.tokenize(doc)))
    word_index = {word: idx for idx, word in enumerate(vocab)}
    word_index[UNK_TOKEN] = len(word_index)
    
    encodingTFIDF = np.array([processor.vectorize(doc_tfidf, word_index) for doc_tfidf in tfidfColumn])
    
    return np.hstack([encodingTFIDF, encodingTS])
    

In [15]:
descriptionData = descriptionDataset(datasetRaw, datasetTotal['train'].to_pandas())
print(descriptionData.shape)

100%|██████████| 88/88 [00:00<?, ?it/s]
100%|██████████| 46/46 [00:00<?, ?it/s]
100%|██████████| 61/61 [00:00<?, ?it/s]
100%|██████████| 49/49 [00:00<?, ?it/s]
100%|██████████| 104/104 [00:00<00:00, 104281.05it/s]
100%|██████████| 44/44 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 62/62 [00:00<?, ?it/s]
100%|██████████| 72/72 [00:00<?, ?it/s]
100%|██████████| 42/42 [00:00<00:00, 117128.17it/s]
100%|██████████| 78/78 [00:00<?, ?it/s]
100%|██████████| 61/61 [00:00<?, ?it/s]
100%|██████████| 55/55 [00:00<?, ?it/s]
100%|██████████| 67/67 [00:00<?, ?it/s]
100%|██████████| 87/87 [00:00<?, ?it/s]
100%|██████████| 61/61 [00:00<?, ?it/s]
100%|██████████| 39/39 [00:00<?, ?it/s]
100%|██████████| 48/48 [00:00<00:00, 36927.11it/s]
100%|██████████| 86/86 [00:00<?, ?it/s]
100%|[32m██████████[0m| 300/300 [00:00<00:00, 5399.28it/s]

(300, 1037)





In [19]:
labels = datasetRaw['label'].apply(lambda x: encodings['label'][x])
print(labels)

0      2
1      0
2      2
3      0
4      2
      ..
295    1
296    2
297    0
298    2
299    0
Name: label, Length: 300, dtype: int64


In [16]:
svcBaseInfo = joblib.load(f"{MODELS_PATH}/{SVC_CATEGORIES_NAME}")
svcDescription = joblib.load(f"{MODELS_PATH}/{SVC_DESCRIPTION_NAME}")
svcParagraph = joblib.load(f"{MODELS_PATH}/{SVM_PARAGRAPH_NAME}")

In [17]:
predictionsBaseInfo = svcBaseInfo.predict_proba(baseData)
predictionsParagraph = svcParagraph.predict_proba(paragraphData)
predictionsDescription = svcDescription.predict_proba(descriptionData)

AttributeError: This 'SVC' has no attribute 'predict_proba'

In [None]:
probabilitiesFinal = np.stack((predictionsBaseInfo, predictionsParagraph, predictionsDescription), axis=2)