In [1]:
import pandas as pd
import numpy as np
import math
import json
import os

from huggingface_hub import hf_hub_download

import sklearn
import sklearn.svm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder

import torch
from torch.utils.data import DataLoader, TensorDataset

import matplotlib.pyplot as plt

import datasets

import joblib
import utilsLib as utils
from dotenv import load_dotenv

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_dat

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()
HF_TOKEN = os.environ['HF_TOKEN']          # passed as `token=` to the Hugging Face calls below
DATASET_LINK = os.environ['DATASET_LINK']  # passed to datasets.load_dataset

# Seed every RNG-bearing library this notebook uses, not only NumPy.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


In [3]:
# Alias tables: directory plus one JSON file name per encoded field
ENCODING_PATH = 'utils'
ENCODINGS_NAME = dict(
    subcategory="subcategoriesAliases.json",
    category="categoriesAliases.json",
    label="labelsAliases.json",
    type="typesAliases.json",
)

# Ensemble sub-models: directory plus one pickle file name per model
MODELS_PATH = "pretrained_models"
SVC_CATEGORIES_NAME, SVC_DESCRIPTION_NAME, SVM_PARAGRAPH_NAME = (
    'categorySVCWeights.pkl',
    'DescriptionSVM.pkl',
    'PenaltySVM_OOV.pkl',
)



In [4]:
# Read every alias table listed in ENCODINGS_NAME into `encodings[field]`.
# JSON is opened explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
encodings = dict()
for field, fileName in ENCODINGS_NAME.items():
    with open(f'{ENCODING_PATH}/{fileName}', 'r', encoding='utf-8') as file:
        encodings[field] = json.load(file)

In [5]:
# Remote dataset (its 'train' split is used below) and the local CSV of rows to predict
datasetTotal = datasets.load_dataset(path=DATASET_LINK, token=HF_TOKEN)
datasetRaw = pd.read_csv(filepath_or_buffer='test_unlabeled.csv')

In [6]:
def baseInfoDataset(dataset, encodings, debug=False):
    """One-hot encode the subcategory, category and type columns of ``dataset``.

    For each of the three fields a zero matrix with one row per dataset row and
    one column per key of ``encodings[field]`` is created; the cell at the
    index ``encodings[field][value]`` is set to 1. Lookups that raise
    ``KeyError`` leave the row all-zero for that field.

    Args:
        dataset: pandas DataFrame holding the three columns.
        encodings: mapping ``field -> {value: column index}``.
        debug: when true, print the shape of each per-field matrix.

    Returns:
        The three matrices concatenated along axis 1, in the order
        subcategory, category, type.
    """
    rowCount = dataset.shape[0]
    perField = []
    for field in ('subcategory', 'category', 'type'):
        aliases = encodings[field]
        oneHot = np.zeros((rowCount, len(aliases.keys())), dtype=int)
        for row in range(rowCount):
            try:
                oneHot[row][aliases[dataset[field].iloc[row]]] = 1
            except KeyError:
                # Unknown value (or missing column): keep the zero row.
                continue
        perField.append(oneHot)

    if debug:
        for oneHot in perField:
            print(oneHot.shape)

    return np.concatenate(tuple(perField), axis=1)

In [7]:
# Base one-hot features: first for the rows to predict, then for the train split
baseDataTarget, baseDataTrain = (
    baseInfoDataset(frame, encodings)
    for frame in (datasetRaw, datasetTotal['train'].to_pandas())
)
print(baseDataTarget.shape, baseDataTrain.shape, sep='\n')

(300, 133)
(6251, 133)


In [None]:
# Per-split paragraph cache; None means "nothing cached for this split yet"
paragraphCache = dict.fromkeys(('target', 'train', 'val'))

In [None]:
# for i in ['target', 'train']:
#     with open(f'{i}Cache.json', 'w+', encoding='utf-8') as f:
#         json.dump(paragraphCache[i], f)
#         f.close()

In [None]:
# Populate paragraphCache from utils/<split>Cache.json.
# A cache file that is missing or not valid JSON leaves the split at None,
# which paragraphDataset treats as "fetch the paragraphs again".
for split in ['target', 'train', 'val']:
    try:
        # The `with` block closes the file; no explicit close() is needed.
        with open(f'utils/{split}Cache.json', 'r', encoding='utf-8') as cacheFile:
            paragraphCache[split] = json.load(cacheFile)
    except (FileNotFoundError, json.JSONDecodeError):
        paragraphCache[split] = None

In [13]:
# Drop the cached target paragraphs so paragraphDataset fetches them again
paragraphCache.update(target=None)

In [14]:
def paragraphDataset(dataset, trainset, cache=paragraphCache):
    """Build the feature matrices fed to the paragraph model.

    Each returned matrix is the horizontal stack of
      * the text encoding produced by ``utils.ParagraphProcessor.process``, and
      * a one-hot encoding of the ``type`` and ``subcategory`` columns, with
        the encoder fitted on ``trainset`` (unknown target values are ignored).

    Paragraph texts are taken from ``cache['train']`` / ``cache['target']``
    when those entries are not ``None``; otherwise they are fetched with
    ``utils.getText`` and written back into ``cache``.

    Args:
        dataset: DataFrame of rows to predict (copied, not modified).
        trainset: DataFrame of training rows (copied, not modified).
        cache: dict with 'train' and 'target' entries; updated in place.

    Returns:
        ``(targetData, trainData)``.
    """
    UNK_TOKEN = '<UNK>'
    oneHotColumns = ['type', 'subcategory']

    def loadParagraphs(split, frame):
        # Fetch only on a cache miss and remember the result in `cache`.
        if cache[split] is None:
            cache[split] = utils.getText(frame, lang='en', max_workers=10)
        return cache[split]

    targetFrame = dataset.copy()
    trainFrame = trainset.copy()

    oneHot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    trainOneHot = oneHot.fit_transform(trainFrame[oneHotColumns]).astype(float)
    targetOneHot = oneHot.transform(targetFrame[oneHotColumns]).astype(float)

    # NOTE(review): the keyword is spelled `unkonwn` at this call site;
    # presumably that matches the utils API — confirm before renaming.
    processor = utils.ParagraphProcessor(
        'utils/mostInfluentWords.json',
        removed=['ˈ', 'also', 'since', 'many'],
        unkonwn=UNK_TOKEN,
    )

    # Train split first, then target, each attached as a 'paragraph' column.
    trainFrame['paragraph'] = loadParagraphs('train', trainFrame)
    targetFrame['paragraph'] = loadParagraphs('target', targetFrame)

    targetText, trainText = processor.process(targetFrame, trainFrame)

    targetData = np.hstack([targetText, targetOneHot])
    trainData = np.hstack([trainText, trainOneHot])
    return targetData, trainData

In [15]:
# Paragraph features for the rows to predict and for the train split
paragraphDataTarget, paragraphDataTrain = paragraphDataset(
    dataset=datasetRaw,
    trainset=datasetTotal['train'].to_pandas(),
)
print(paragraphDataTarget.shape, paragraphDataTrain.shape, sep='\n')

100%|██████████| 300/300 [00:47<00:00,  6.29it/s]
100%|[32m██████████[0m| 6251/6251 [00:06<00:00, 1035.84it/s]
100%|[32m██████████[0m| 300/300 [00:00<00:00, 700.33it/s] 


(300, 38056)
(6251, 38056)


In [16]:
def descriptionDataset(dataset, trainset, debug=False):
    """Build the feature matrices fed to the description model.

    Each returned matrix joins, column-wise, the text encoding produced by
    ``utils.DescriptionProcessor.process`` with a one-hot encoding of the
    ``type`` and ``subcategory`` columns (encoder fitted on ``trainset``,
    unknown target values ignored).

    Args:
        dataset: DataFrame of rows to predict (copied, not modified).
        trainset: DataFrame of training rows (copied, not modified).
        debug: when true, print the shapes of the pieces before joining them.

    Returns:
        ``(targetData, trainData)``.
    """
    UNK_TOKEN = '<UNK>'
    oneHotColumns = ['type', 'subcategory']

    def report(*pieces):
        # Shape dump used only in debug mode.
        if debug:
            for piece in pieces:
                print(piece.shape)

    targetFrame = dataset.copy()
    trainFrame = trainset.copy()

    oneHot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    trainOneHot = oneHot.fit_transform(trainFrame[oneHotColumns]).astype(float)
    targetOneHot = oneHot.transform(targetFrame[oneHotColumns]).astype(float)

    # NOTE(review): the keyword is spelled `unkonwn` at this call site;
    # presumably that matches the utils API — confirm before renaming.
    processor = utils.DescriptionProcessor(
        None,
        removed=['ˈ', 'also', 'since', 'many'],
        unkonwn=UNK_TOKEN,
    )
    targetText, trainText = processor.process(targetFrame, trainFrame)

    report(targetText, targetOneHot)
    targetData = np.hstack([targetText, targetOneHot])

    report(trainText, trainOneHot)
    trainData = np.concatenate((trainText, trainOneHot), axis=1)

    return targetData, trainData
    

In [17]:
# Description features for the rows to predict and for the train split
descriptionDataTarget, descriptionDataTrain = descriptionDataset(
    dataset=datasetRaw,
    trainset=datasetTotal['train'].to_pandas(),
)
print(descriptionDataTarget.shape, descriptionDataTrain.shape, sep='\n')

100%|[32m██████████[0m| 6251/6251 [00:01<00:00, 4753.11it/s]
100%|[32m██████████[0m| 300/300 [00:00<00:00, 2080.77it/s]


(300, 6642)
(6251, 6642)


In [18]:
# Translate each train-split label through the 'label' alias table
labelsTrain = (
    datasetTotal['train']
    .to_pandas()['label']
    .apply(lambda labelName: encodings['label'][labelName])
)
print(labelsTrain.shape)

(6251,)


In [None]:
# Make sure each pretrained sub-model that the next cell loads is available
# locally, downloading from the Hugging Face model repo only what is missing.
# Each of the three files is listed exactly once (category SVC, description
# SVM, paragraph SVM). hf_hub_download is given local_dir=MODELS_PATH, so a
# missing models directory needs no separate branch.
for modelFile in (SVC_CATEGORIES_NAME, SVC_DESCRIPTION_NAME, SVM_PARAGRAPH_NAME):
    if not os.path.exists(os.path.join(MODELS_PATH, modelFile)):
        hf_hub_download(repo_id=os.environ['HF_REPO'],
                        filename=modelFile,
                        local_dir=MODELS_PATH,
                        token=HF_TOKEN,
                        repo_type='model')

In [19]:
# Unpickle the three ensemble sub-models from MODELS_PATH.
# NOTE(review): joblib.load executes pickle data — only load trusted files.
svcBaseInfo, svcDescription, svcParagraph = (
    joblib.load(f"{MODELS_PATH}/{fileName}")
    for fileName in (SVC_CATEGORIES_NAME, SVC_DESCRIPTION_NAME, SVM_PARAGRAPH_NAME)
)

In [20]:
# Inverse of the 'label' alias table: encoded value -> label name
encoding2label = dict(zip(encodings['label'].values(), encodings['label'].keys()))
print(encoding2label)

{0: 'cultural exclusive', 1: 'cultural agnostic', 2: 'cultural representative'}


In [21]:
# Class probabilities from each sub-model on its own target feature matrix
predictionsBaseInfo, predictionsDescription, predictionsParagraph = (
    model.predict_proba(features)
    for model, features in (
        (svcBaseInfo, baseDataTarget),
        (svcDescription, descriptionDataTarget),
        (svcParagraph, paragraphDataTarget),
    )
)

In [None]:
# TrainPredictionsB = svcBaseInfo.predict_proba(baseDataTrain)
# TrainPredictionsD = svcDescription.predict_proba(descriptionDataTrain)
# TrainPredictionsP = svcParagraph.predict_proba(paragraphDataTrain)

In [None]:
# np.concatenate((TrainPredictionsB, TrainPredictionsP, TrainPredictionsD), axis=1).dump('prep/sideData/trainX.npy')
# labelsTrain.to_numpy().dump('prep/sideData/trainY.npy')

In [22]:
# Column order is base info, paragraph, description
probabilitiesFinal = np.concatenate(
    (predictionsBaseInfo, predictionsParagraph, predictionsDescription),
    axis=1,
)

# float32 tensor, the dtype of the Decider's linear layer
decisorInput = torch.tensor(probabilitiesFinal, dtype=torch.float32)
print(decisorInput.shape)

torch.Size([300, 9])


In [23]:
class Decider(torch.nn.Module):
    """Final decision layer of the ensemble: a single ``Linear(9, 3)``.

    Args:
        weightsPath: optional path to a file readable by ``torch.load`` that
            holds the state dict of the linear layer. When ``None`` (the
            default) the layer keeps its freshly initialised weights.
    """

    def __init__(self, weightsPath: "str | None" = None):
        super().__init__()
        self.linear = torch.nn.Linear(9, 3, dtype=torch.float32)
        if weightsPath is not None:
            # map_location lets weights saved on another device load on CPU.
            self.linear.load_state_dict(torch.load(weightsPath, map_location='cpu'))

    def forward(self, x):
        """Return the output of the linear layer for ``x`` (no activation applied)."""
        return self.linear(x)
    

In [24]:
# Decision layer initialised from the saved weights file
decisor = Decider(weightsPath='pretrained_models/decisorWeights.pth')

In [25]:
# Inference only: eval mode, and no autograd graph for the forward pass.
decisor.eval()
with torch.no_grad():
    # Index of the highest-probability class for every row
    predictions = decisor(decisorInput).softmax(1).argmax(1)

In [31]:
# Predicted class indices as a NumPy array
literalResults = predictions.numpy()

# Copy of the input rows with a 'label' column holding the decoded label names
results = datasetRaw.assign(
    label=pd.Series(literalResults).apply(lambda code: encoding2label[code])
)

In [32]:
# Write the labelled rows to disk without the index column
results.to_csv(path_or_buf='results.csv', index=False)