# Applied Analysis Assignment 2: PreProcessing

#### Environment: Python 3

## Import Libraries

In [1]:
# General
from pathlib import Path
import re
import pandas as pd
import numpy as np
import pickle
from time import perf_counter    # cell runtime
from natsort import natsorted    # intuitive sorting and renaming of variables
from itertools import chain      # used in concatenation function
# import winsound                  # play a sound (windows only)

# Tokenizing
import nltk.data                                                                    # punkt sentence tokenizer
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer, word_tokenize    # word tokenizers
from nltk.corpus import wordnet                                                     # parts of speech tagging
from nltk.stem import WordNetLemmatizer, PorterStemmer                              # lemming and stemming

# Stopword Removal
import string                        # list of punctuation
from nltk.corpus import stopwords    # use nltk short list of stopwords (unused)
from nltk.probability import *       # token frequency distributions (FreqDist())

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer 

In [2]:
# Start timer
t_start = perf_counter()

## User Defined Functions
See Tokenisation

In [3]:
def list_concat(a_list):
    '''
    Flatten one level of nesting.

    Takes an iterable of iterables and returns a single new list holding
    every inner element, in order of appearance.
    '''
    flattened = []
    for inner in a_list:
        flattened.extend(inner)
    return flattened

## Load the Text Files into Newline separated Lists

In [4]:
# Training and Test Docs Location
# (every input file read by this notebook is looked up relative to this folder)
target_folder = (
    Path.home()
    / 'DropBox'
    / 'Monash_Uni'
    / 'FIT5149 Applied Data Analysis'
    / 'Assessment 2'
)

In [5]:
# Load the training documents
target_file = target_folder / 'training_docs.txt'

# One list entry per line of the file; line endings are kept
with open(target_file, mode='r', encoding='utf-8') as data_file:
    tr_docs = list(data_file)

In [6]:
# Load the training class labels
target_file = target_folder / 'training_labels_final.txt'

# One list entry per line of the file; line endings are kept
with open(target_file, encoding='utf-8') as data_file:
    tr_labs = list(data_file)

In [7]:
# Load the test documents
target_file = target_folder / 'testing_docs_shuffle.txt'

# One list entry per line of the file; line endings are kept
with open(target_file, encoding='utf-8') as data_file:
    test_docs = list(data_file)

## Create Document Dictionaries

In [8]:
# Train: Create a dictionary of document id : content text
# Lines are handled by their prefix: 'ID ...' starts a new document, anything else
# is parsed as a 'TEXT ...' line; blank lines and 'EOD' markers carry no data.
tr_docs_dict = {}
for line in tr_docs:
    line = line.strip()
    if line == '' or line.lower() == 'eod':               # exclude empty and end-of-document lines
        continue
    if line.startswith('ID'):                             # identify lines starting with 'ID'
        id_k = re.findall(r'(?<=ID ).*$', line)[0]        # get the document id
        text = []                                         # empty list to hold document contents
        # Register the list once, when the document starts; later appends mutate the
        # same list. (Previously the assignment ran on every line, which raised a
        # NameError if the file began with a blank or EOD line.)
        tr_docs_dict[id_k] = text
    else:
        t = re.findall(r'(?<=TEXT ).*$', line)[0]         # get the document contents past the 'TEXT' signifier
        text.append(t.lower())                            # normalise to lowercase

In [9]:
# Train: Create a dictionary of document id : class label
# Each line is split on whitespace; the first term is used as the document id and
# the second as its class label (any further terms are ignored).
tr_labs_dict = {}
for line in tr_labs:
    # str.split() yields the same whitespace-separated terms as the old
    # '(?:\S+)+' pattern, without the invalid-escape warning of a non-raw
    # regex string or the nested quantifier
    id_k = line.strip().split()            # list the terms
    tr_labs_dict[id_k[0]] = id_k[1]        # create dictionary of document id : class label

In [10]:
# Test: Create a dictionary of document id : content text
# Lines are handled by their prefix: 'ID ...' starts a new document, anything else
# is parsed as a 'TEXT ...' line; blank lines and 'EOD' markers carry no data.
test_docs_dict = {}
for line in test_docs:
    line = line.strip()
    if line == '' or line.lower() == 'eod':               # exclude empty and end-of-document lines
        continue
    if line.startswith('ID'):                             # identify lines starting with 'ID'
        id_k = re.findall(r'(?<=ID ).*$', line)[0]        # get the document id
        text = []                                         # empty list to hold document contents
        # Register the list once, when the document starts; later appends mutate the
        # same list. (Previously the assignment ran on every line, which raised a
        # NameError if the file began with a blank or EOD line.)
        test_docs_dict[id_k] = text
    else:
        t = re.findall(r'(?<=TEXT ).*$', line)[0]         # get the document contents past the 'TEXT' signifier
        text.append(t.lower())                            # normalise to lowercase

## Create Dataframes

#### Testing Set

In [11]:
# One row per test document; the document id is moved out of the index
# into a regular 'Doc' column
test_df = (
    pd.DataFrame.from_dict(test_docs_dict, orient='index', columns=['Contents'])
    .reset_index()
    .rename(columns={'index': 'Doc'})
)

In [12]:
test_df.tail()

Unnamed: 0,Doc,Contents
26605,te_doc_26606,the queensland resources council (qrc) says th...
26606,te_doc_26607,at least 60 people have died and 4 million hav...
26607,te_doc_26608,a campaign to put an end to tailgating by truc...
26608,te_doc_26609,there's been more grim reading on the state of...
26609,te_doc_26610,an explosion at the indonesian embassy in pari...


### Training Set

In [13]:
# Training contents dataframe (indexed by document id, single 'Contents' column)
tr_docs_df = pd.DataFrame.from_dict(data=tr_docs_dict, orient='index', columns=['Contents'])

# Training labels dataframe (indexed by document id, single 'Class' column)
tr_labs_df = pd.DataFrame.from_dict(data=tr_labs_dict, orient='index', columns=['Class'])

In [14]:
# Combined dataframe: labels and contents side by side, matched on document id,
# with the document id then moved from the index into an 'index' column
tr_df = pd.concat([tr_labs_df, tr_docs_df], axis=1)
tr_df = tr_df.reset_index()

In [15]:
# Add category column: the class label with its first character dropped,
# converted to an integer (e.g. 'C23' -> 23)
tr_df['Category'] = tr_df['Class'].map(lambda label: int(label[1:]))

In [16]:
# Rename and Rearrange the columns
tr_df = tr_df.rename(columns={'index': 'Doc'})

# Move 'Category' so that it is the second column
cols = list(tr_df.columns)
cols.remove('Category')
cols.insert(1, 'Category')
tr_df = tr_df.reindex(columns=cols)

In [17]:
tr_df.tail()

Unnamed: 0,Doc,Category,Class,Contents
106440,tr_doc_106441,23,C23,japan's new ambassador to china has urged stro...
106441,tr_doc_106442,23,C23,a man in northern china has driven a car carry...
106442,tr_doc_106443,23,C23,chinese police have rescued 89 children and ar...
106443,tr_doc_106444,23,C23,turkish energy minister suggests 'far eastern ...
106444,tr_doc_106445,23,C23,huey fern tay looks back on some of the news h...


In [18]:
# Row count of the training frame before any filtering
# (used later as the denominator for the percentage of removed empty docs)
num_tr_docs = len(tr_df)

## Tokenize the Training Texts

In [21]:
# Punkt sentence tokeniser: load the pickled English model through nltk's data loader
# (presumably requires the 'punkt' resource to be installed locally — TODO confirm)
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Alternative kept for reference: a tokenizer instance built with default arguments
# sent_tokenizer = PunktSentenceTokenizer()

In [22]:
# Function to convert POS tags to wordnet tags
# The code in this cell is adapted from the following website
# http://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

def get_wordnet_pos(treebank_tag):
    '''
    Translate a treebank POS tag into the corresponding wordnet POS constant.

    The decision is made on the first character of the tag:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.
    '''
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

In [23]:
# WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

In [24]:
# Aggregate tokenisation function
def bespoke_tokenizer(text):
    '''
    Split a document into a flat list of lemmatised word tokens.

    Steps: strip the text, split it into sentences, word-tokenize each
    sentence, POS-tag the words, convert each treebank tag to a wordnet tag,
    and lemmatise each word using that tag.

    text    : str, the raw document contents
    returns : list of str, one lemma per token, in document order

    Note: the POS tag is now passed to the lemmatizer. Previously it was
    computed but never used, so every token was lemmatised as a noun (the
    lemmatizer's default), which left verbs such as 'has' / 'was' mangled into
    'ha' / 'wa'. Token lists produced before this fix differ from new ones.
    '''
    # identify sentences
    sents = sent_tokenizer.tokenize(text.strip())
    # tokenize and POS-tag each sentence, then flatten into one list of (word, tag) pairs
    tb_pos = list_concat(nltk.pos_tag(word_tokenize(sent)) for sent in sents)
    # derive lemmas, telling the lemmatizer which part of speech each word is
    return [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tb_pos]

In [25]:
# add a 'Tokens' column holding empty strings to both dataframes
for frame in (tr_df, test_df):
    frame['Tokens'] = ''

In [26]:
# Function to run the tokenizer and print out run checks
def run_tokenizer(df):
    '''
    Tokenize df['Contents'] row by row and store the result in df['Tokens'].

    The dataframe is modified in place; nothing is returned. A progress line is
    printed for every row whose index label is a multiple of 1000, and the
    total run time is printed once all rows are done.
    '''
    started = perf_counter()

    def minutes_elapsed():
        # minutes since this function was called
        return (perf_counter() - started) / 60

    for label, contents in zip(df.index, df['Contents']):
        df.at[label, 'Tokens'] = bespoke_tokenizer(contents)
        if label % 1000 == 0:
            print('Still running! It has been %.1f mins' % minutes_elapsed())

    print('Total Elapsed time: %.1f mins' % minutes_elapsed())

In [27]:
# tokenize the dataframe - only run this to recreate the pickle file if needed -  warning: long run time
# run_tokenizer(tr_df)      # ~204 min
# run_tokenizer(test_df)    # ~50 min

In [28]:
# This is the better way to run the tokenizer, but it doesn't print periodic runtime updates
# %%time
# tr_df['Tokens'] = tr_df['Contents'].apply(bespoke_tokenizer)
# test_df['Tokens'] = test_df['Contents'].apply(bespoke_tokenizer)

In [30]:
# save cleaned and tokenized dataframes to pickle (since running the tokenization takes so long)
# NOTE(review): these commented-out writes use a relative path (current working directory),
# while the reads below look in target_folder — confirm the files end up in the same place.
# tr_df.to_pickle('tokenised_tr_df.pkl')
# test_df.to_pickle('tokenised_test_df.pkl')

# read the pickled dataframes (this replaces the tr_df / test_df built earlier in the notebook)
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
tr_df = pd.read_pickle(target_folder.joinpath('tokenised_tr_df.pkl'))
test_df = pd.read_pickle(target_folder.joinpath('tokenised_test_df.pkl'))

In [31]:
test_df.head()

Unnamed: 0,Doc,Contents,Tokens
0,te_doc_1,mahela jayawardene's sri lanka will leapfrog f...,"[mahela, jayawardene, 's, sri, lanka, will, le..."
1,te_doc_2,floodwaters from queensland's central west hav...,"[floodwaters, from, queensland, 's, central, w..."
2,te_doc_3,a circus trainer is fighting for his life afte...,"[a, circus, trainer, is, fighting, for, his, l..."
3,te_doc_4,"forget wimbledon for now, fed cup captain davi...","[forget, wimbledon, for, now, ,, fed, cup, cap..."
4,te_doc_5,two women have been charged after an investiga...,"[two, woman, have, been, charged, after, an, i..."


In [32]:
tr_df.head()

Unnamed: 0,Doc,Category,Class,Contents,Tokens
0,tr_doc_1,1,C1,two german tourists have been found safe and w...,"[two, german, tourist, have, been, found, safe..."
1,tr_doc_2,1,C1,act police have seized a rare drug during a ra...,"[act, police, have, seized, a, rare, drug, dur..."
2,tr_doc_3,1,C1,a 50-year-old brisbane man has been charged wi...,"[a, 50-year-old, brisbane, man, ha, been, char..."
3,tr_doc_4,1,C1,in-depth discussions are continuing to resolve...,"[in-depth, discussion, are, continuing, to, re..."
4,tr_doc_5,1,C1,homicide detectives are still questioning a ma...,"[homicide, detective, are, still, questioning,..."


## Remove Stopwords, Hapaxes, and Empty Documents

#### Create Set of Freestanding Punctuation, Contractions, and Common Words

In [33]:
# Punctuation tokens to remove: every single punctuation character,
# plus three multi-character tokens
punct = set(string.punctuation) | {"''", "``", "..."}

In [35]:
# Get set of very common stopwords
# csw = set(stopwords.words("english"))    # nltk's list of stopwords (short)
# Long stopword list, one entry per line of the file. Read explicitly as UTF-8,
# like the other input files, so the result does not depend on the platform's
# default encoding.
with open(target_folder.joinpath('stopwords_en.txt'), encoding='utf-8') as f:
    csw = set(f.read().splitlines())

In [36]:
# Create set of contraction words
# The tokenizer treats runs of whitespace as the gaps between tokens
# (\s means any whitespace character (\t\n\r\f\v))
tokenizer = RegexpTokenizer(r"\s+", gaps=True)

target_file = target_folder.joinpath('wiki_list_contractions.txt')
with open(target_file, encoding='utf-8') as data_file:
    wlc = data_file.read()

# Every distinct whitespace-separated entry of the file
contractions = set(tokenizer.tokenize(wlc))

In [None]:
# Get set of tokens in the test dataset that are not in the training dataset
all_tr_tokens = set(chain.from_iterable(tr_df['Tokens'].tolist()))
all_test_tokens = set(chain.from_iterable(test_df['Tokens'].tolist()))
diff = all_test_tokens - all_tr_tokens

In [None]:
# Additional stopwords
other = {'ha', 'wa'}    # lemmatizer converts 'has' to 'ha' and 'was' to 'wa'

In [None]:
# Complete set of stopwords: common words, contractions, punctuation,
# test-only tokens, and the additional stopwords
stopwords_1 = csw | contractions | punct | diff | other

#### Remove Stopwords

In [None]:
# Function to keep an item in a list if it is not found in a reference
def remove_stopwords(text, stopwords):
    '''
    Return a new list with the tokens of `text` that do not occur in
    `stopwords`, keeping their original order (duplicates included).

    text      : iterable of tokens
    stopwords : container supporting `in` (a set gives fast lookups)
    '''
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
%%time
# Remove stopwords from every document's token list, in both datasets
tr_df['Tokens'] = tr_df['Tokens'].apply(lambda toks: remove_stopwords(toks, stopwords_1))
test_df['Tokens'] = test_df['Tokens'].apply(lambda toks: remove_stopwords(toks, stopwords_1))

#### Stemming
* stem to target all the words missed by the lemmatizer 
* even if the stemmed word is wrong (ex. bushfires and bushfire $\rightarrow$ bushfir) it shouldn't matter as long as it's consistent

In [None]:
# Define stemmer
ps = PorterStemmer()

In [None]:
%%time
# Stem: replace every token by its stem, in both datasets
tr_df['Tokens'] = tr_df['Tokens'].apply(lambda toks: list(map(ps.stem, toks)))
test_df['Tokens'] = test_df['Tokens'].apply(lambda toks: list(map(ps.stem, toks)))

#### Remove Hapaxes
Reduces vectorization time significantly

In [None]:
# Create list of all tokens in the corpus
all_words = list_concat(tr_df['Tokens'])

In [None]:
# Get frequency distribution of tokens
freq_dist = FreqDist(all_words)

In [None]:
# Get hapaxes
haps = set(freq_dist.hapaxes())

In [None]:
# Remove hapaxes (reuses the stopword filter, with the hapax set as the reference)
tr_df['Tokens'] = tr_df['Tokens'].apply(lambda toks: remove_stopwords(toks, haps))
test_df['Tokens'] = test_df['Tokens'].apply(lambda toks: remove_stopwords(toks, haps))

In [None]:
len(haps)

#### Remove Empty Docs from the Training Set

In [None]:
# Remove docs with k or fewer tokens (k = 0 drops only the empty documents)
k = 0
# The threshold now actually uses k (it was hard-coded to 0 before).
# .copy() gives tr_df its own data, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
tr_df = tr_df[tr_df['Tokens'].map(len) > k].copy()

In [None]:
# Percentage of docs removed
'{}%'.format(round(100*(num_tr_docs-tr_df.shape[0])/num_tr_docs, 3))

In [None]:
# Set of removed indices
# range(num_tr_docs) covers 0 .. num_tr_docs-1 inclusive; the previous
# range(0, num_tr_docs-1) left out the last document, so its removal was never reported.
# (assumes the original index was the default 0 .. num_tr_docs-1 — TODO confirm)
rm_inds = set(range(num_tr_docs)) - set(tr_df.index)

## Concatenate Tokens 
In preparation to feed to the tf-idf vectorizer

In [None]:
# Concatenate the tokens for each document
tr_df['Final_Tokens'] = tr_df['Tokens'].str.join(' ')
test_df['Final_Tokens'] = test_df['Tokens'].str.join(' ')

In [None]:
tr_df.head(3)

## Vectorize the Tokenized Texts

In [None]:
# Define the vectorizer model
vectorizer = TfidfVectorizer()    # min_df=200, ngram_range=(1, 2))

In [None]:
# Define the corpus to be vectorized
tar_col = 'Final_Tokens'    # use cleaned corpus
# tar_col = 'Contents'       # use raw corpus

In [None]:
# Define testing and training sets
X_train = tr_df[tar_col]       # train text to vectorize
y_train = tr_df['Category']    # train target labels
X_test = test_df[tar_col]      # test text to vectorize

In [None]:
%%time
# Create feature vectors
# fit_transform learns the vocabulary / idfs and builds the training document-term
# matrix in one pass, instead of analysing the training corpus twice (fit, then transform)
X_train_vects = vectorizer.fit_transform(X_train)
vect = vectorizer                          # fitted vectorizer (fit() returned this same object)
X_test_vects = vect.transform(X_test)      # transform the test documents into a document-term matrix

In [None]:
print(X_test_vects.shape, type(X_test_vects))

## Identify Most Correlated Features by Class

In [None]:
from sklearn.feature_selection import chi2

In [None]:
# Create dictionary of class : category
# (the unique (Class, Category) pairs, sorted by Category, are passed to dict() as a
#  two-column array, so each row becomes one key : value entry, e.g. 'C23' : 23)
label_to_categ = dict(tr_df[['Class', 'Category']].drop_duplicates().sort_values('Category').values)
# Create dataframe to hold the correlation results (starts empty)
corr_features = pd.DataFrame()

In [None]:
# Correlation score for each feature by class
# The feature names are the same for every class, so fetch them once, outside the loop.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present
# and fall back to the old method on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for class_label, category in label_to_categ.items():
    # one-vs-rest chi2: this class against all other classes
    chi2score = chi2(X_train_vects, y_train == category)[0]     # get correlation scores
    wscores = zip(feature_names, chi2score)                     # list scores with feature name, instead of feature number
    wchi2 = sorted(wscores, key=lambda x: x[1], reverse=True)   # sort the scores in descending order
    corr_features[class_label] = [name for name, _ in wchi2]    # add the sorted feature names to the dataframe

In [None]:
# Top N correlated features by class
n = 200
corr_head = corr_features.head(n)

In [None]:
# View most correlated features by class
display(corr_head.loc[:, 'C1':'C11'].head(5))
display(corr_head.loc[:, 'C12':].head(5))

## Keep Top-Correlated Features Only

In [None]:
# Set of all the top n correlated features
# Column name -> that column's ranked features
top_corr_dict = dict(corr_head.items())
# Union of the features over all columns
top_n = set()
for ranked_features in top_corr_dict.values():
    top_n.update(ranked_features)

In [None]:
# Function to keep only items found in a reference
def keepwords(text, list_of_words):
    '''
    Return a new list with only the tokens of `text` that occur in
    `list_of_words`, keeping their original order (duplicates included).

    text          : iterable of tokens
    list_of_words : container supporting `in` (a set gives fast lookups)
    '''
    return list(filter(lambda token: token in list_of_words, text))

In [None]:
# Remove features not in the top correlation set
tr_df['Tokens'] = tr_df['Tokens'].apply(keepwords, args=(top_n,))
test_df['Tokens'] = test_df['Tokens'].apply(keepwords, args=(top_n,))

In [None]:
tr_df['Final_Tokens'] = tr_df['Tokens'].str.join(' ')
test_df['Final_Tokens'] = test_df['Tokens'].str.join(' ')

In [None]:
tr_df.head(3)

In [None]:
# Remove docs with k or fewer tokens (k = 0 drops only the empty documents)
k = 0
# The threshold now actually uses k (it was hard-coded to 0 before).
# .copy() gives tr_df its own data, so modifying it afterwards does not
# raise pandas' SettingWithCopyWarning.
tr_df = tr_df[tr_df['Tokens'].map(len) > k].copy()

In [None]:
# Percentage of docs removed
'{}%'.format(round(100*(num_tr_docs-tr_df.shape[0])/num_tr_docs, 3))

In [None]:
# Set of removed indices
# range(num_tr_docs) covers 0 .. num_tr_docs-1 inclusive; the previous
# range(0, num_tr_docs-1) left out the last document, so its removal was never reported.
# (assumes the original index was the default 0 .. num_tr_docs-1 — TODO confirm)
rm_inds = set(range(num_tr_docs)) - set(tr_df.index)

In [None]:
# Renumber the remaining training rows 0..n-1; the old index is discarded
# (presumably so row positions match the rows of the re-vectorized matrix — TODO confirm)
tr_df.reset_index(inplace=True, drop=True)
# Series of the training category values, keyed by the new row index
# (despite the name, it holds categories rather than indices), for mapping after re-vectorization
tr_index = pd.Series(tr_df['Category'])

## Re-Vectorize

In [None]:
# Define testing and training sets
X_train = tr_df[tar_col]       # train text to vectorize
y_train = tr_df['Category']    # train target labels
X_test = test_df[tar_col]      # test text to vectorize

In [None]:
%%time
# Create feature vectors
# fit_transform learns the vocabulary / idfs and builds the training document-term
# matrix in one pass, instead of analysing the training corpus twice (fit, then transform)
X_train_vects = vectorizer.fit_transform(X_train)
vect = vectorizer                          # fitted vectorizer (fit() returned this same object)
X_test_vects = vect.transform(X_test)      # transform the test documents into a document-term matrix

In [None]:
print(X_train_vects.shape, type(X_train_vects))
print(X_test_vects.shape, type(X_test_vects))

## PCA Dimension Reduction

#### Scale
(Don't scale tf-idf vectors. They are already scaled.)  
Scale the feature vectors to mean=0 and unit variance  

In [None]:
# convert the scipy sparse matrices to sparse dataframes
# note: this produces NaNs instead of zeroes, which the scaler below doesn't like
# train_ftrs = pd.SparseDataFrame(X_train_vects)
# test_ftrs = pd.SparseDataFrame(X_test_vects)

# convert the scipy sparse matrices to dense dataframes
# (every zero entry is materialised, so this needs rows x columns values of memory)
train_temp, test_temp = (
    pd.DataFrame(sparse_matrix.todense())
    for sparse_matrix in (X_train_vects, X_test_vects)
)

In [None]:
print(train_temp.shape)
print(test_temp.shape)

In [None]:
# from sklearn.preprocessing import StandardScaler

In [None]:
# Define the scaler (mean of 0, variance of 1)
# scaler = StandardScaler()

In [None]:
# Fit scaler on training set
# scaler.fit(train_temp)

In [None]:
# # Scale both the training set and test sets
# train_scaled = scaler.transform(train_temp)
# test_scaled = scaler.transform(test_temp)

#### Apply PCA 

In [None]:
# from sklearn.decomposition import PCA

In [None]:
# Set the retained variance (PCA(.95): 95% of the variance is retained)
# pca = PCA(.95)

In [None]:
# %%time
# Fit PCA on training set
# pca.fit(train_ftrs)

In [None]:
# View the number of principal component vectors (new number of features)
# print('number of pca features: ', pca.n_components_)
# print('feature reduction of {:.2%}.'.format(pca.n_components_ / train_temp.shape[1]))

In [None]:
# %%time
# # Apply PCA to both the training and test sets
# train_ftrs = pca.transform(train_ftrs)
# test_ftrs = pca.transform(test_ftrs)

In [None]:
# pca.explained_variance_ratio_

# Modelling

In [None]:
%%time
# Note: by default, cross_val_score calls either KFold or StratifiedKFold, both of which have shuffle=False,
# so the folds are not random. Since my data is arranged by category, I was trying to predict a lot of classes that
# the training set had not seen before, and getting worse results. Using ShuffleSplit remedies this by providing
# a truly random set.

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# Candidate classifiers; un-comment an entry to include it in the comparison
models = [
#     RandomForestClassifier(n_estimators=100, random_state=0),
#     LinearSVC(),
#     MultinomialNB(),
    LogisticRegression(random_state=0, solver = 'lbfgs')    # default solver is incredibly slow which is why it was changed to 'lbfgs'
]

# n_cv random 70/30 train/validation splits, reproducible through random_state
n_cv = 5
CV = ShuffleSplit(n_splits=n_cv, test_size=0.3, random_state=0)

# One (model name, fold number, accuracy) record per model and fold.
# (The former empty cv_df pre-allocation was dead code: it was overwritten below.)
entries = []
for m in models:
    model_name = m.__class__.__name__
#     accuracies = cross_val_score(m, train_ftrs, y_train, scoring='accuracy', cv=CV)
    accuracies = cross_val_score(m, X_train_vects, y_train, scoring='accuracy', cv=CV)
    entries.extend((model_name, fold_idx, value1) for fold_idx, value1 in enumerate(accuracies))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Training accuracy by model
print('top', n, 'features, total features =', X_train_vects.shape)
cv_df.groupby('model_name').accuracy.mean()

In [None]:
# play sound
# winsound.PlaySound('CRCHBELL.WAV', winsound.SND_ASYNC)

In [None]:
t_end = perf_counter()
elapsed_time = t_end - t_start
print('Total Elapsed time: %.1f mins' % ((elapsed_time)/60))

## Save Feature Vectors to File

In [None]:
# Put the category (train) / document name (test) next to the dense feature columns.
# concat matches rows by index label, not by position.
train_parts = [tr_df['Category'], train_temp]
test_parts = [test_df['Doc'], test_temp]
train_ftrs = pd.concat(train_parts, axis=1)
test_ftrs = pd.concat(test_parts, axis=1)

In [None]:
train_ftrs.to_csv('cleaned_training_vectors.csv')
test_ftrs.to_csv('cleaned_testing_vectors.csv')

In [None]:
t_end = perf_counter()
elapsed_time = t_end - t_start
print('Total Elapsed time: %.1f mins' % ((elapsed_time)/60))

In [None]:
# play sound
# winsound.PlaySound('CRCHBELL.WAV', winsound.SND_ASYNC)

## Convert Feature Vector Matrices to Dataframes
(this turned out to be a really inefficient way of doing this. pd.DataFrame(..todense()) is much faster.  
also, .tocoo() or .row() removed empty rows, which I didn't want.)  
in prep for saving to files

In [None]:
# Convert scipy sparse matrix from compressed row format to coordinate format
# train_spm = X_train_vects.tocoo()
# test_spm = X_test_vects.tocoo()

In [None]:
# Create feature dataframes
# train_ftrs = pd.DataFrame({'Doc':train_spm.row, 'Vocab':train_spm.col, 'TFIDF':train_spm.data, 'Category':0})
# test_ftrs = pd.DataFrame({'Doc':test_spm.row, 'Vocab':test_spm.col, 'TFIDF':test_spm.data, 'DocName':''})

In [None]:
# %%time
# Associate the training features with the corresponding category value
# for grp, rows in train_ftrs.groupby('Doc'):
#     train_ftrs.loc[rows.index, 'Category'] = tr_df.loc[grp,'Category']

In [None]:
# %%time
# Associate the testing features with the corresponding document name 
# for grp, rows in test_ftrs.groupby('Doc'):    
#     test_ftrs.loc[rows.index, 'DocName'] = test_df.loc[grp,'Doc']
    
# # play sound
# winsound.PlaySound('CRCHBELL.WAV', winsound.SND_ASYNC)

In [None]:
# add plus one to the Doc values to reflect the document names starting at 1 ('tr/te_doc_1')
# train_ftrs['Doc'] = train_ftrs['Doc'].add(1)
# test_ftrs['Doc'] = test_ftrs['Doc'].add(1)

In [None]:
# Train: Get random rows and check that the category is correct for doc in both dataframes
# tr_rand = train_ftrs.sample(n=3)
# l = list(tr_rand['Doc'].apply(lambda x: "{}{}".format('tr_doc_', str(x))))
# print('tr_df')
# display(tr_df.query('Doc == @l'))
# print('train_ftrs')
# display(tr_rand.sort_index())

In [None]:
# Test: Get random rows and check that doc and document name match
# test_rand = test_ftrs.sample(n=3)
# l = list(test_rand['Doc'].apply(lambda x: "{}{}".format('te_doc_', str(x))))
# print('test_df')
# display(test_df.query('Doc == @l'))
# print('test_ftrs')
# display(test_rand.sort_index())