### Importing Libraries

In [None]:
import pandas as pd


from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DataCollatorWithPadding
import numpy as np


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Step 1: Data Preparation

### Reading the CSV File

In [None]:

# Load the raw user table and expose the user identifier under the name
# `label`, the column the rest of the notebook treats as the prediction target.
df = pd.read_csv("dataset2.csv").rename(columns={'UserID': 'label'})

### Checking if the reading is complete

In [None]:
df.head()

Unnamed: 0,label,Gender,Age,Height,Weight,ActivityLevel,GoalID,DietPreferenceID,RegionID,AllergenID
0,64759,Male,63,169,64,LightlyActive,3,2,5,7
1,22456,Other,35,155,60,Sedentary,9,5,10,1
2,63401,Other,19,156,62,Sedentary,5,4,6,4
3,72903,Male,24,151,75,LightlyActive,6,4,6,3
4,76338,Female,33,166,72,ModeratelyActive,10,1,2,2


### Combining Columns to Text for NLP models

In [None]:
def combine_columns_to_text(df, exclude_columns=None, column_weights=None, include_label_in_text=True):
    """Flatten every row of ``df`` into one space-separated string.

    Args:
        df: Source DataFrame. It is not modified; all work happens on the new
            frame returned by ``DataFrame.drop``.
        exclude_columns: Column names to drop before building the text. Names
            that are missing from ``df`` are ignored. Defaults to none.
        column_weights: Mapping ``column -> weight``. Every non-null value of a
            listed column is rewritten as ``"<column>:<value>"`` repeated
            ``weight`` times (space separated), so a weight of 0 turns the
            value into an empty string. Null values of a listed column also
            become ``''``. Columns that are not listed contribute their bare
            value.
        include_label_in_text: When True (the default, which reproduces the
            historical output byte for byte) the ``label`` value takes part in
            ``text`` like any other column. Pass False to keep the target out
            of the features and avoid label leakage.

    Returns:
        A DataFrame with columns ``['text', 'label']`` when ``df`` has a
        ``label`` column that was not excluded, otherwise only ``['text']``.
        An empty ``df`` always yields an empty ``['text', 'label']`` frame.
    """
    if exclude_columns is None:
        exclude_columns = []
    if column_weights is None:
        column_weights = {}
    if df.empty:
        return pd.DataFrame(columns=['text', 'label'])

    # Keep the untouched labels: the weighting below may rewrite the column.
    has_label = 'label' in df.columns and 'label' not in exclude_columns
    label_col = df['label'] if has_label else None

    # Drop the columns that need to be excluded (returns a new frame).
    df_combined = df.drop(columns=exclude_columns, errors='ignore')

    # Apply weighting by repeating "<column>:<value>" `weight` times.
    for column, weight in column_weights.items():
        if column in df_combined.columns:
            # Bind the loop variables as defaults so the lambda never depends
            # on the state of the enclosing loop.
            df_combined[column] = df_combined[column].apply(
                lambda x, column=column, weight=weight:
                    (' '.join([f"{column}:{x}"] * weight)) if pd.notnull(x) else ''
            )

    # Decide which columns feed the text.
    text_source = df_combined
    if has_label and not include_label_in_text:
        text_source = df_combined.drop(columns=['label'])

    if text_source.shape[1] == 0:
        # Nothing left to join: every row gets an empty text.
        df_combined['text'] = ''
    else:
        df_combined['text'] = text_source.apply(
            lambda row: ' '.join(row.dropna().astype(str)), axis=1
        )

    if not has_label:
        return df_combined[['text']]

    # Restore the original labels (they may have been rewritten by weighting).
    df_combined['label'] = label_col
    return df_combined[['text', 'label']]


### Simpler version of combining for later evaluation of text similarity


In [None]:
def combine_columns_to_text_simple(df, exclude_columns=None):
    """Render each row of ``df`` as ``"col:value col:value ..."``.

    Args:
        df: Source DataFrame; it is left unmodified.
        exclude_columns: Column names to leave out. Unknown names are ignored.

    Returns:
        A DataFrame with ``['text', 'label']`` when ``df`` carries a ``label``
        column that was not excluded, otherwise just ``['text']``. Null cells
        are skipped when the text is built. An empty ``df`` yields an empty
        ``['text', 'label']`` frame.
    """
    excluded = [] if exclude_columns is None else exclude_columns
    if df.empty:
        return pd.DataFrame(columns=['text', 'label'])

    keep_label = 'label' in df.columns and 'label' not in excluded
    labels = df['label'] if keep_label else None

    frame = df.drop(columns=excluded, errors='ignore')

    def _row_to_text(row):
        # One "name:value" fragment per non-null cell, in column order.
        fragments = []
        for col, val in row.dropna().items():
            fragments.append(f"{col}:{val}")
        return ' '.join(fragments)

    frame['text'] = frame.apply(_row_to_text, axis=1)

    if not keep_label:
        return frame[['text']]

    frame['label'] = labels
    return frame[['text', 'label']]


### Adjusting the importance/weights of the parameters in the text
- by repeating a column's `name:value` pair, which gives that column more textual significance

In [None]:
# Per-column repetition counts used to emphasise fields in the generated text.
# A weight of N repeats "<column>:<value>" N times; a weight of 0 therefore
# replaces the column's value with an empty string.
column_weights = dict(
    Username=0,
    Email=0,
    Gender=0,
    Age=0,
    Height=0,
    Weight=0,
    ActivityLevel=15,
    GoalID=10,
    DietPreferenceID=15,
    RegionID=15,
    AllergenID=30,  # Higher weight for AllergenID
)

# Weighted text (fed to the models below) and the plain "column:value" text
# (used later to look up the profile that belongs to a predicted label).
generalDataframe = combine_columns_to_text(
    df,
    exclude_columns=['DietID'],
    column_weights=column_weights,
)
generalDataframe2 = combine_columns_to_text_simple(
    df,
    exclude_columns=['DietID', 'Email', 'Username'],
)


### Checking if the format is correct

In [None]:
generalDataframe.head()

Unnamed: 0,text,label
0,64759 ActivityLevel:LightlyActive Activity...,64759
1,22456 ActivityLevel:Sedentary ActivityLeve...,22456
2,63401 ActivityLevel:Sedentary ActivityLeve...,63401
3,72903 ActivityLevel:LightlyActive Activity...,72903
4,76338 ActivityLevel:ModeratelyActive Activ...,76338


In [None]:
generalDataframe2.head()

Unnamed: 0,text,label
0,label:64759 Gender:Male Age:63 Height:169 Weig...,64759
1,label:22456 Gender:Other Age:35 Height:155 Wei...,22456
2,label:63401 Gender:Other Age:19 Height:156 Wei...,63401
3,label:72903 Gender:Male Age:24 Height:151 Weig...,72903
4,label:76338 Gender:Female Age:33 Height:166 We...,76338


# Step 2: Training DistilBert Model

### Encoding Labels

In [None]:

# Encode labels
# Map every distinct value of `label` to a consecutive integer class index.
# `.assign` builds a new frame instead of writing into a column-sliced frame,
# which avoids pandas' SettingWithCopy pattern.
# NOTE: re-running this cell would re-fit the encoder on already-encoded
# values, after which `inverse_transform` no longer returns the original IDs.
label_encoder = LabelEncoder()
generalDataframe = generalDataframe.assign(
    label=label_encoder.fit_transform(generalDataframe['label'])
)

### Train Test Split of Data for Distilbert

In [None]:
# Split dataset
# NOTE(review): test_size=0.001 keeps only 0.1% of the rows for validation
# (a handful of examples), so any validation figure will be very noisy.
train_df, val_df = train_test_split(generalDataframe, test_size=0.001, random_state=42)


In [None]:
# Convert to Hugging Face Dataset
# The pandas splits are wrapped so that they can be tokenized with `.map`
# and handed to the Trainer further down.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)


### Importing tokenizer

In [None]:

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Preprocess function that tokenizes the text column of each dataset

In [None]:

def preprocess_function(examples):
    """Tokenize one batch of rows for DistilBERT.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``; only
            its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch. No padding is applied here.
    """
    # Truncate to the tokenizer's maximum length so that a long weighted text
    # can never exceed what the model accepts; this matches the
    # `truncation=True` that `predict_labelDistilbert` uses at inference time.
    return tokenizer(examples["text"], truncation=True)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/4995 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

### Trainer Preparation

In [None]:
# NOTE: despite its name this holds the largest encoded label id, i.e. the
# number of classes minus one; the model-loading cell adds 1 to it.
# It is taken over the full frame rather than only the training split so that
# a label which happened to land in the validation split is still covered.
numberOfClasses = int(generalDataframe['label'].max())

In [None]:
# Hugging Face view of the training split.
# NOTE(review): `myDataset` is not referenced again in the visible notebook;
# `train_dataset` is built from the same frame in the same way.
myDataset=Dataset.from_pandas(train_df)

In [None]:
class ConfiguredMetric:
    """Thin proxy around a metric object that pins extra ``compute`` arguments.

    The positional and keyword arguments captured at construction time are
    appended to whatever the caller passes to :meth:`compute`; every other
    member simply forwards to the wrapped metric.
    """

    def __init__(self, metric, *metric_args, **metric_kwargs):
        """Store the wrapped metric together with the arguments to pin."""
        self.metric, self.metric_args, self.metric_kwargs = metric, metric_args, metric_kwargs

    def add(self, *args, **kwargs):
        """Forward to ``metric.add`` unchanged and return its result."""
        wrapped = self.metric
        return wrapped.add(*args, **kwargs)

    def add_batch(self, *args, **kwargs):
        """Forward to ``metric.add_batch`` unchanged and return its result."""
        wrapped = self.metric
        return wrapped.add_batch(*args, **kwargs)

    def compute(self, *args, **kwargs):
        """Call ``metric.compute`` with the caller's arguments plus the pinned ones.

        Caller positionals come first, pinned positionals after them. The two
        keyword sets are unpacked separately (not merged), so a keyword given
        both here and at construction time raises ``TypeError``.
        """
        positional = (*args, *self.metric_args)
        return self.metric.compute(*positional, **kwargs, **self.metric_kwargs)

    @property
    def name(self):
        """Name reported by the wrapped metric."""
        wrapped = self.metric
        return wrapped.name

    def _feature_names(self):
        """Forward to the wrapped metric's ``_feature_names``."""
        wrapped = self.metric
        return wrapped._feature_names()

### Loading the DistilBERT Model

In [None]:
# `numberOfClasses` holds the largest encoded label id, so the classification
# head needs one more output than that value.
modelBert= AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=numberOfClasses+1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Padding for the dynamic range of text lengths

In [None]:
# Collator handed to the Trainer below; padding is left to it, which is why
# `preprocess_function` does not pad the tokenized examples itself.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
!pip install transformers[torch]
!pip install accelerate -U



In [None]:
!pip install transformers[torch]
!pip install accelerate -U



### Trainer

In [None]:

# Hyper-parameters for fine-tuning; checkpoints are written below ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# No `compute_metrics` callback is passed, so the accuracy/precision/recall/F1
# helpers imported at the top are not used by this Trainer.
trainer = Trainer(
    model=modelBert,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Training
# NOTE(review): the recorded training loss (~8.52) is about ln(5000), i.e. what
# a uniform guess over roughly 5000 classes scores. If each UserID occurs only
# once, the model has a single example per class — confirm it learns anything.
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,8.5282


TrainOutput(global_step=939, training_loss=8.52417642189164, metrics={'train_runtime': 675.6572, 'train_samples_per_second': 22.178, 'train_steps_per_second': 1.39, 'total_flos': 2027328280648080.0, 'train_loss': 8.52417642189164, 'epoch': 3.0})

### Evaluation of Distilbert Model

In [None]:

# Evaluation
# The Trainer was created without `compute_metrics`, so only loss and
# runtime figures are expected in the returned dictionary.
eval_results = trainer.evaluate(tokenized_val_dataset)
print(eval_results)


{'eval_loss': 8.553973197937012, 'eval_runtime': 0.0736, 'eval_samples_per_second': 67.954, 'eval_steps_per_second': 13.591, 'epoch': 3.0}


# Step 3: Machine Learning Models

### Train Test Split for Machine Learning Models


In [None]:
X_train, X_test, y_train, y_test = train_test_split(generalDataframe['text'], generalDataframe['label'], random_state=123)


### TF-IDF Vectorizing

In [None]:
# The default token_pattern (r"(?u)\b\w\w+\b") only keeps tokens of two or
# more characters, which silently drops every single-digit ID value
# (for example the "9" of "GoalID:9"). Accept one-character tokens as well.
vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='unicode',
    lowercase=True,
    token_pattern=r"(?u)\b\w+\b",
)
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)


### Model Training and Predictions

In [None]:
# The target must be the label column, never the raw text in `X_train`.
# The encoded ids in `y_train` are mapped back to the original IDs and cast to
# str, so each prediction is an ID string that the regex-based
# `extract_leading_integer_from_label` and the look-up in `generalDataframe2`
# can consume directly.
svm_targets = label_encoder.inverse_transform(y_train).astype(str)
svm_model_linear = SVC(kernel='linear', C=1).fit(X_train_transformed, svm_targets)
svm_predictions = svm_model_linear.predict(X_test_transformed)

In [None]:
# The target must be the label column, never the raw text in `X_train`.
# Original IDs are recovered from the encoded `y_train` and used as strings so
# that the predictions stay compatible with the regex-based label extraction
# and the look-up in `generalDataframe2` further down.
nb_targets = label_encoder.inverse_transform(y_train).astype(str)
modelNb = MultinomialNB()
modelNb.fit(X_train_transformed, nb_targets)
predictions = modelNb.predict(X_test_transformed)

In [None]:
# The target must be the label column, never the raw text in `X_train`.
# Original IDs are recovered from the encoded `y_train` and used as strings so
# that the predictions stay compatible with the regex-based label extraction
# and the look-up in `generalDataframe2` further down.
knn_targets = label_encoder.inverse_transform(y_train).astype(str)
knn = KNeighborsClassifier(n_neighbors=12).fit(X_train_transformed, knn_targets)
knn_predictions = knn.predict(X_test_transformed)

# Step 4: Testing

In [None]:
# Sample profile that every model is asked to classify.
# NOTE(review): this string writes "Column: value" with a space after each
# colon, whereas the training texts are built as "Column:value" — confirm that
# the tokenizer and the vectorizer treat both spellings the same way.
text_to_predict = """
Gender: Male Age: 94 Height: 170 Weight: 78 ActivityLevel: ModeratelyActive GoalID: 9 DietPreferenceID: 2 RegionID: 10 AllergenID: 10"""

### Predicting Functions


In [None]:
import torch
def predict_labelDistilbert(text):
    """Predict the original label of one text with the fine-tuned DistilBERT.

    Args:
        text: Raw input string. It is tokenized with the notebook's
            ``tokenizer`` and truncated to at most 512 tokens.

    Returns:
        The label obtained by passing the arg-max class index through
        ``label_encoder.inverse_transform``.
    """
    # Disable dropout explicitly: the result must not depend on whether an
    # earlier cell happened to leave the model in training or evaluation mode.
    modelBert.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(modelBert.device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = modelBert(**inputs).logits
    # Softmax is monotonic, so the arg-max of the logits is already the
    # arg-max of the probabilities.
    predicted_label = torch.argmax(logits, dim=-1).item()
    return label_encoder.inverse_transform([predicted_label])[0]

def _predict_label_id(model, text):
    """Vectorize ``text`` with the fitted TF-IDF ``vectorizer`` and return ``model``'s prediction for it."""
    return model.predict(vectorizer.transform([text]))[0]


def predict_label_id_multinomial_nb(text):
    """Return the Multinomial Naive Bayes prediction for a single text."""
    return _predict_label_id(modelNb, text)


def predict_label_id_svm(text):
    """Return the linear SVM prediction for a single text."""
    return _predict_label_id(svm_model_linear, text)


def predict_label_id_knn(text):
    """Return the k-nearest-neighbours prediction for a single text."""
    return _predict_label_id(knn, text)


### Predictions

In [None]:
# Assigned in one step: no empty-string placeholder, so the name only ever
# refers to an actual prediction.
predicted_labelDISTILBERT = predict_labelDistilbert(text_to_predict)

In [None]:
# Predict using Multinomial Naive Bayes (assigned in one step, without an
# empty-string placeholder).
predicted_labelMNB = predict_label_id_multinomial_nb(text_to_predict)
print(f"Multinomial Naive Bayes predicted label ID: {predicted_labelMNB}")


Multinomial Naive Bayes predicted label ID: 26424     ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 DietPreferenceID:4 RegionID:10 RegionID:10 RegionID:10 RegionID:10 RegionID:10 RegionID:10 RegionID:10 RegionID:10 

In [None]:

# Predict using the linear SVM (assigned in one step, without an empty-string
# placeholder).
predicted_labelSVM = predict_label_id_svm(text_to_predict)
print(f"SVM predicted label ID: {predicted_labelSVM}")


SVM predicted label ID: 74721     ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 Region

In [None]:
# Predict using KNN
predicted_labelKNN = predict_label_id_knn(text_to_predict)
print(f"KNN predicted label ID: {predicted_labelKNN}")

KNN predicted label ID: 10624     ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive ActivityLevel:ModeratelyActive GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 GoalID:10 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 DietPreferenceID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 RegionID:5 Region

### Label Extraction for Machine Learning Models

In [None]:
import re

# Example predicted label strings for SVM, KNN, and MNB models
predictedLabelSVM = predicted_labelSVM
predictedLabelKNN = predicted_labelKNN
predictedLabelMNB = predicted_labelMNB

# Function to extract the leading integer from a predicted label
def extract_leading_integer_from_label(predicted_label):
    """Return the run of digits at the very start of ``predicted_label`` as an int.

    Args:
        predicted_label: Usually a string. Any other value (for example a
            numeric class label) is converted with ``str()`` first instead of
            raising ``TypeError``.

    Returns:
        The leading integer, or None when the text does not start with a digit.
    """
    match = re.match(r'(\d+)', str(predicted_label))
    return int(match.group(1)) if match else None

# Extract leading integers for each predicted label
leading_integer_svm = extract_leading_integer_from_label(predictedLabelSVM)
leading_integer_knn = extract_leading_integer_from_label(predictedLabelKNN)
leading_integer_mnb = extract_leading_integer_from_label(predictedLabelMNB)

### Label Visualization

In [None]:

# Create a dictionary with model names as keys and their respective values
model_values_dict = {
    "DistilBERT": predicted_labelDISTILBERT,  # This is the predicted label for DistilBERT
    "SVM": leading_integer_svm,  # This is the leading integer extracted from the SVM predicted label
    "KNN": leading_integer_knn,  # This is the leading integer extracted from the KNN predicted label
    "MNB": leading_integer_mnb   # This is the leading integer extracted from the MNB predicted label
}

# Print the dictionary to verify its contents
for model, value in model_values_dict.items():
    print(f"{model}: {value}")


DistilBERT: 26253
SVM: 74721
KNN: 10624
MNB: 26424


In [None]:
# Label values used below to look up each model's predicted profile
filter_label_value_svm = leading_integer_svm  # leading integer of the SVM prediction
filter_label_value_distilbert = predicted_labelDISTILBERT  # label returned by predict_labelDistilbert
filter_label_value_mnb = leading_integer_mnb  # leading integer of the MNB prediction
filter_label_value_knn = leading_integer_knn  # leading integer of the KNN prediction

### Extracting relevant part of the text information

In [None]:
# Define a function to extract the text starting from "Gender"
def extract_text_from_gender(text):
    """Return ``text`` from its first ``"Gender:"`` marker to the end.

    Everything after the first marker is kept, including any further
    ``"Gender:"`` occurrences. When the marker is absent the text is returned
    unchanged.
    """
    _, marker, remainder = text.partition("Gender:")
    return marker + remainder if marker else text


### Filtering our Data to Find Corresponding Information To Labels

In [None]:
# Filter the DataFrame for each predicted label and retrieve the corresponding 'text' value

def _texts_for_label(label_value):
    """Return the 'text' entries of generalDataframe2 whose label equals ``label_value``."""
    return generalDataframe2.loc[generalDataframe2['label'] == label_value, 'text']


def _first_profile_text(matches):
    """Return the first matching text from "Gender:" onwards, or the not-found message when there is no match."""
    if matches.empty:
        return "No matching text found for the specified label"
    return extract_text_from_gender(matches.iloc[0])


# For predicted_labelSVM
filtered_df_svm = _texts_for_label(filter_label_value_svm)
corresponding_text_value_svm = _first_profile_text(filtered_df_svm)

# For predicted_labelDISTILBERT
filtered_df_distilbert = _texts_for_label(filter_label_value_distilbert)
corresponding_text_value_distilbert = _first_profile_text(filtered_df_distilbert)

# For predicted_labelMNB
filtered_df_mnb = _texts_for_label(filter_label_value_mnb)
corresponding_text_value_mnb = _first_profile_text(filtered_df_mnb)

# For predicted_labelKNN
filtered_df_knn = _texts_for_label(filter_label_value_knn)
corresponding_text_value_knn = _first_profile_text(filtered_df_knn)

# Print or use the corresponding text values as needed
print("SVM:", corresponding_text_value_svm)
print("DistilBERT:", corresponding_text_value_distilbert)
print("MNB:", corresponding_text_value_mnb)
print("KNN:", corresponding_text_value_knn)
print("Actual:", extract_text_from_gender(text_to_predict))

SVM: Gender:Female Age:47 Height:180 Weight:91 ActivityLevel:ModeratelyActive GoalID:10 DietPreferenceID:5 RegionID:5 AllergenID:10
DistilBERT: Gender:Male Age:40 Height:150 Weight:86 ActivityLevel:LightlyActive GoalID:1 DietPreferenceID:5 RegionID:4 AllergenID:4
MNB: Gender:Female Age:48 Height:196 Weight:50 ActivityLevel:ModeratelyActive GoalID:10 DietPreferenceID:4 RegionID:10 AllergenID:4
KNN: Gender:Other Age:58 Height:196 Weight:56 ActivityLevel:ModeratelyActive GoalID:10 DietPreferenceID:5 RegionID:5 AllergenID:10
Actual: Gender: Male Age: 94 Height: 170 Weight: 78 ActivityLevel: ModeratelyActive GoalID: 9 DietPreferenceID: 2 RegionID: 10 AllergenID: 10


In [None]:
!pip install sentence_transformers



# Step 5: Evaluation Of Models (Using Pre-Trained Similarity Calculation Model)

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Initialize the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Your info string
info_string = text_to_predict  # Make sure text_to_predict is defined with your actual text

# Placeholder produced by the look-up cell when a predicted label has no row;
# it is not a prediction and must not be scored.
_NOT_FOUND_TEXT = "No matching text found for the specified label"

# Dictionary mapping model names to their corresponding text values
model_texts = {
    "DistilBERT": corresponding_text_value_distilbert,
    "SVM": corresponding_text_value_svm,
    "MNB": corresponding_text_value_mnb,
    "KNN": corresponding_text_value_knn
}

# Filter out empty predicted texts and the "not found" placeholder
model_texts = {
    model_name: text
    for model_name, text in model_texts.items()
    if text.strip() and text != _NOT_FOUND_TEXT
}

# Defined up front so that later cells can read them even when nothing was scored
similarity_scores = {}
most_similar_model = None
most_similar_text = None
highest_similarity_score = None

# Proceed only if there are non-empty predicted labels
if model_texts:
    # Encode the info string to get its embedding
    info_embedding = model.encode(info_string, convert_to_tensor=True)

    # Compute and report the cosine similarity of every model's text
    for model_name, text in model_texts.items():
        text_embedding = model.encode(text, convert_to_tensor=True)
        similarity_score = util.pytorch_cos_sim(info_embedding, text_embedding).item()
        similarity_scores[model_name] = similarity_score

        # Print the model name and similarity score with the info_string
        print(f"Model: {model_name}, Similarity Score: {similarity_score}")

    # Highest score wins; on a tie the model that was scored first is kept
    most_similar_model = max(similarity_scores, key=similarity_scores.get)
    most_similar_text = model_texts[most_similar_model]
    highest_similarity_score = similarity_scores[most_similar_model]

    # Print the most similar model, its text, and similarity score
    print(f"\nThe most similar model to the info string is: {most_similar_model} with text: '{most_similar_text}' and a similarity score of: {highest_similarity_score}")
else:
    print("All predicted labels are empty.")


Model: DistilBERT, Similarity Score: 0.9644266963005066
Model: SVM, Similarity Score: 0.9711982607841492
Model: MNB, Similarity Score: 0.9668653607368469
Model: KNN, Similarity Score: 0.9599484205245972

The most similar model to the info string is: SVM with text: 'Gender:Female Age:47 Height:180 Weight:91 ActivityLevel:ModeratelyActive GoalID:10 DietPreferenceID:5 RegionID:5 AllergenID:10' and a similarity score of: 0.9711982607841492


### Checking manually

In [None]:
print(text_to_predict)
print(most_similar_text)


Gender: Male Age: 94 Height: 170 Weight: 78 ActivityLevel: ModeratelyActive GoalID: 9 DietPreferenceID: 2 RegionID: 10 AllergenID: 10
Gender:Female Age:47 Height:180 Weight:91 ActivityLevel:ModeratelyActive GoalID:10 DietPreferenceID:5 RegionID:5 AllergenID:10


In [None]:
import pickle

# Save the trained DistilBert model to a .pkl file
# NOTE: kept for backward compatibility. A pickle ties the file to the library
# versions installed here, and `pickle.load` must never be used on files from
# an untrusted source because it can execute arbitrary code.
with open('distilbert_trained_model.pkl', 'wb') as model_file:
    pickle.dump(modelBert, model_file)

# Also write the model in the native Hugging Face format so that it can be
# restored with `from_pretrained`.
modelBert.save_pretrained('distilbert_trained_model')

# The tokenizer is saved with its own save_pretrained method
tokenizer.save_pretrained('distilbert_trained_tokenizer')

# NOTE(review): `label_encoder` is not persisted by this cell, although
# `predict_labelDistilbert` needs it to turn a class index back into a label.

# Provide the paths for downloading
distilbert_model_pkl_path = 'distilbert_trained_model.pkl'
distilbert_tokenizer_path = 'distilbert_trained_tokenizer'

distilbert_model_pkl_path, distilbert_tokenizer_path

('distilbert_trained_model.pkl', 'distilbert_trained_tokenizer')