In [1]:
import pandas as pd

import math
import string
import numpy as np
from collections import Counter

from sklearn.utils import column_or_1d
from sklearn.preprocessing import LabelEncoder

# TF-IDF

In [2]:
def idf_val(s, tfidf):
    """Return the IDF weight a fitted TF-IDF vectorizer holds for ``s``.

    Parameters
    ----------
    s : hashable
        Term to look up.
    tfidf : fitted vectorizer
        Object exposing ``vocabulary_`` (term -> feature index mapping) and
        ``idf_`` (per-feature IDF weights), e.g. a fitted ``TfidfVectorizer``.

    Returns
    -------
    The entry of ``tfidf.idf_`` for ``s``, or the entry for the
    ``'unknown_tld'`` placeholder when ``s`` is not in the vocabulary.

    Raises
    ------
    KeyError
        If ``s`` is not in the vocabulary and neither is ``'unknown_tld'``.
    """
    # vocabulary_ already maps each term to its feature index, so one dict
    # lookup replaces rebuilding and scanning the feature-name list per call.
    vocabulary = tfidf.vocabulary_

    index = vocabulary.get(s)
    if index is None:
        # Term was not part of the fitted vocabulary: use the placeholder.
        index = vocabulary['unknown_tld']

    return tfidf.idf_[index]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the training dataset
df = pd.read_csv('../Dataset/Train/Temp/pre_train.csv')

# Select the column to convert from text to numbers and drop its NaN values.
# dropna() returns a new Series, so the column selected from df is never
# mutated in place.
train_texts = df['TLD'].dropna()

# Adding 'unknown_tld' so the fitted vocabulary has a placeholder entry for
# words that were not seen during fitting
train_texts = list(train_texts) + ['unknown_tld']

# Initialise the TF-IDF vectorizer
# NOTE(review): token_pattern=".*" can also match the empty string, so ''
# presumably becomes a vocabulary term of its own - confirm this is intended.
vectorizer = TfidfVectorizer(analyzer="word", token_pattern=".*")

# Fit the TF-IDF vectorizer
vectorizer.fit(train_texts)

TfidfVectorizer(token_pattern='.*')

In [4]:
from pickle import dump

# Saving the vectorizer. The context manager closes the file handle as soon
# as pickling finishes (or fails) instead of leaving it open.
with open('../Models/TF-IDF/vectorizer.pikle', 'wb') as vectorizer_file:
    dump(vectorizer, vectorizer_file)

In [5]:
# Read both pre-processed splits and replace every missing value with ''.
train = pd.read_csv('../Dataset/Train/Temp/pre_train.csv').fillna('')
test = pd.read_csv('../Dataset/Test/Temp/pre_test.csv').fillna('')

In [6]:
# Look up the IDF weight of every training row's TLD in the fitted vectorizer.
train_tld_idf = train['TLD'].apply(idf_val, tfidf=vectorizer)
train['TLD_TD_IDF'] = train_tld_idf

In [7]:
from pickle import load

# Reload the persisted vectorizer; the context manager closes the file handle.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
with open('../Models/TF-IDF/vectorizer.pikle', 'rb') as vectorizer_file:
    vectorizer = load(vectorizer_file)

In [8]:
# Look up the IDF weight of every test row's TLD in the reloaded vectorizer.
test_tld_idf = test['TLD'].apply(idf_val, tfidf=vectorizer)
test['TLD_TD_IDF'] = test_tld_idf

# Encode Train Dataset

In [9]:
class MyLabelEncoder(LabelEncoder):
    """LabelEncoder whose ``fit`` takes ``classes_`` from ``pd.Series.unique``."""

    def fit(self, y):
        """Store the distinct values of ``y`` as ``classes_`` and return ``self``.

        ``y`` is first passed through ``column_or_1d`` (warnings enabled);
        ``classes_`` is then the result of ``pd.Series(...).unique()``, which
        lists values in order of first appearance.
        """
        labels = column_or_1d(y, warn=True)
        distinct_labels = pd.Series(labels).unique()
        self.classes_ = distinct_labels
        return self

def gen_encoder(lst):
    """Build a label encoder from ``lst`` with an extra 'Unknown' class.

    The distinct items of ``lst`` are ordered from most to least frequent
    (items with equal counts keep their first-seen order), 'Unknown' is placed
    after them, and a ``MyLabelEncoder`` fitted on that sequence is returned.
    The input list itself is not modified.
    """
    by_frequency = [label for label, _count in Counter(lst).most_common()]

    # 'Unknown' always goes last, after every observed label.
    by_frequency.append('Unknown')

    return MyLabelEncoder().fit(by_frequency)

def encode(item, encoder):
    """Return the code ``encoder`` assigns to ``item``.

    An ``item`` that is not among ``encoder.classes_`` is encoded as the
    'Unknown' label instead. The value returned is the first element of
    ``encoder.transform`` applied to a one-element list.
    """
    known_labels = list(encoder.classes_)
    label = item if item in known_labels else 'Unknown'
    return encoder.transform([label])[0]

In [10]:
## Generating Encoders
# TLD encoder: built from the distinct TLD values of the training frame.
unq_tld = list(train['TLD'].unique())
enc_tld = gen_encoder(unq_tld)

# Character encoder: digits, lowercase ASCII letters, '-', '.' and ''.
allowed_chars = [*string.digits, *string.ascii_lowercase, '-', '.', '']
enc_chr = gen_encoder(allowed_chars)

## Saving Encoders
# Only the classes_ array of each encoder is written to disk.
for npy_path, fitted_encoder in (
    ('../Models/Encoders/enc_tld.npy', enc_tld),
    ('../Models/Encoders/enc_chr.npy', enc_chr),
):
    np.save(npy_path, fitted_encoder.classes_)

In [11]:
# The TLD column uses its own encoder.
train['TLD'] = train['TLD'].apply(encode, encoder=enc_tld)

# The remaining columns all go through the character encoder.
for chr_col in (
    'SSD_val_3', 'SUB_val_3', 'SLD_val_3',
    'SSD_chr_seq_c', 'SUB_chr_seq_c', 'SLD_chr_seq_c',
):
    train[chr_col] = train[chr_col].apply(encode, encoder=enc_chr)

In [12]:
# Write the encoded training frame to CSV without the index column.
enc_train_path = '../Dataset/Train/Temp/enc_train.csv'
train.to_csv(enc_train_path, index=False)

# Encode Test Dataset

In [13]:
# Loading Encoders
# Each encoder is a plain LabelEncoder whose classes_ come from the saved
# array.
# NOTE(review): allow_pickle=True lets np.load unpickle objects - only load
# files from a trusted source.
tld_classes = np.load('../Models/Encoders/enc_tld.npy', allow_pickle=True)
enc_tld = LabelEncoder()
enc_tld.classes_ = tld_classes

chr_classes = np.load('../Models/Encoders/enc_chr.npy', allow_pickle=True)
enc_chr = LabelEncoder()
enc_chr.classes_ = chr_classes

In [14]:
# The TLD column uses its own encoder.
test['TLD'] = test['TLD'].apply(encode, encoder=enc_tld)

# The remaining columns all go through the character encoder.
for chr_col in (
    'SSD_val_3', 'SUB_val_3', 'SLD_val_3',
    'SSD_chr_seq_c', 'SUB_chr_seq_c', 'SLD_chr_seq_c',
):
    test[chr_col] = test[chr_col].apply(encode, encoder=enc_chr)

In [15]:
# Write the encoded test frame to CSV without the index column.
enc_test_path = '../Dataset/Test/Temp/enc_test.csv'
test.to_csv(enc_test_path, index=False)