# Load Libraries

In [None]:
import re
import os
import nltk
import json
import emoji
import spacy
import torch
import gensim
import pickle
import hdbscan
import cleanlab
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

from umap import UMAP
from tqdm.auto import tqdm
from nltk.corpus import stopwords
from IPython.display import display
from torch.utils.data import DataLoader
from wordsegment import load, segment
from nltk.tokenize import word_tokenize
from datasketch import MinHash, MinHashLSH
from cleanlab.outlier import OutOfDistribution
from transformers import BertTokenizer, BertModel
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances
from concurrent.futures import ProcessPoolExecutor
from sklearn.metrics import davies_bouldin_score
from nltk.corpus import words

from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Fetch the NLTK stop-word corpus only when looking it up fails locally.
try:
    nltk.corpus.stopwords.words
except LookupError:
    nltk.download('stopwords')

# The word list and tokenizer data are requested on every run.
nltk.download('words')
nltk.download('punkt')
# Initialise wordsegment (imported above) before `segment` is used.
load()
nlp = spacy.load('en_core_web_sm')
# Set form of the NLTK word list, for fast membership checks.
english_words = set(words.words())

In [None]:
import random

SEED = 42
# Seed every RNG the notebook relies on. The stdlib `random` module is
# included because `random.shuffle` is used later when sampling cluster
# descriptions for the LLM prompt.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Widen pandas' display limits so long descriptions and large tables are shown in full.
pd.set_option("display.max_colwidth", 500)
pd.set_option("display.max_rows", 500)

# Load Data

In [None]:
"""
raw_data_path - path to a file containing applications raw descriptions
categs_defs_path - path to a file containing categories guidelines
current_category - name of the category to be studied for the presence of outliers
"""

# Placeholders: fill these in before running the cells below.
raw_data_path = ""
categs_defs_path = ""
current_category = ""

In [None]:
all_data = pd.read_csv(raw_data_path, index_col=False)
# Take an explicit copy: new columns are assigned to `category_data` below,
# and assigning into a filtered slice of `all_data` is ambiguous in pandas.
category_data = all_data[all_data["c"] == current_category].copy()
categories_definitions_df = pd.read_csv(categs_defs_path)

# Data Processing

## Description Cleaning

In [None]:
# Matches http(s)://, www. and ftp:// style links.
# NOTE(review): inside the character class `$-_` is a range, so it accepts
# far more characters than the individual symbols suggest; confirm intended.
url_pattern = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|'
    r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|'
    r'ftp://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Matches addresses of the form local-part@domain.tld (TLD of 2+ letters).
email_pattern = re.compile(
    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
)

In [None]:
def get_lower_description(description):
    """Return `description` converted to lower case."""
    lowered = description.lower()
    return lowered

In [None]:
def remove_special_characters(description):
    """Replace every character that is not an ASCII letter, digit or whitespace with a space."""
    return re.sub(r'[^A-Za-z0-9\s]', ' ', description)

In [None]:
def contains_emoji(text):
    """Return True if any single character of `text` is a key of `emoji.EMOJI_DATA`."""
    for symbol in text:
        if symbol in emoji.EMOJI_DATA:
            return True
    return False

In [None]:
def remove_emojis_emoji_lib(text):
    """Return `text` with each emoji replaced by a single space (via the `emoji` package)."""
    without_emojis = emoji.replace_emoji(text, replace=' ')
    return without_emojis

In [None]:
def contains_url(text):
    """Return True if `url_pattern` matches anywhere in `text`."""
    return url_pattern.search(text) is not None

In [None]:
def remove_urls(text):
    """Return `text` with every `url_pattern` match replaced by a single space."""
    without_urls = url_pattern.sub(' ', text)
    return without_urls

In [None]:
def contains_email(text):
    """Return True if `email_pattern` matches anywhere in `text`."""
    return email_pattern.search(text) is not None

In [None]:
def remove_emails(text):
    """Return `text` with every `email_pattern` match replaced by a single space."""
    without_emails = email_pattern.sub(' ', text)
    return without_emails

In [None]:
def remove_stopwords(text):
    """Tokenize `text` with NLTK and drop English stop words (case-insensitive).

    Returns the remaining tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def remove_multiple_white_spaces(text):
    """Collapse every run of whitespace in `text` into one space.

    Leading and trailing runs are collapsed too, not stripped.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed

In [None]:
def remove_numerical_values(text):
    """Delete every run of digits from `text` (nothing is inserted in their place)."""
    without_digits = re.sub(r'\d+', '', text)
    return without_digits

In [None]:
def process_descriptions(data):
    """Clean the raw descriptions stored in column "id" of `data`.

    Each description goes through, in order: lower-casing, emoji removal,
    URL removal, e-mail removal, digit removal, special-character removal
    and whitespace collapsing. Stop words are not removed here.

    Args:
        data: DataFrame with the raw description text in column "id".

    Returns:
        List of cleaned descriptions, one per row, in row order.
    """
    processed_descriptions = []
    for raw_description in tqdm(data["id"], total=len(data)):
        text = get_lower_description(raw_description)
        text = remove_emojis_emoji_lib(text)
        text = remove_urls(text)
        text = remove_emails(text)
        text = remove_numerical_values(text)
        text = remove_special_characters(text)
        text = remove_multiple_white_spaces(text)
        processed_descriptions.append(text)

    return processed_descriptions

In [None]:
# Store the cleaned descriptions in column "pd"; later steps rewrite this column in place.
category_data["pd"] = process_descriptions(category_data)

### Remove Segmented Words

In [None]:
def get_all_words(data):
    """Return the distinct whitespace-separated words found across `data`.

    `data` is a sequence of strings; the order of the returned list is arbitrary.
    """
    vocabulary = set()
    for text in tqdm(data, total=len(data)):
        vocabulary.update(text.split())
    return list(vocabulary)

In [None]:
# Vocabulary of the cleaned descriptions.
all_words = get_all_words(category_data["pd"].to_list())

#### Check for Segmented Words

In [None]:
def get_batches(lst, batch_size):
    """Split `lst` into consecutive slices of at most `batch_size` items."""
    batches = []
    for start in range(0, len(lst), batch_size):
        batches.append(lst[start:start + batch_size])
    return batches

In [None]:
# Batches of 1000 words, the unit of work handed to the process pool below.
all_words_batches = get_batches(all_words, 1000)

In [None]:
def check_segmentation(input_string: str) -> bool:
    """Return True when `segment` leaves `input_string` as one unchanged token.

    Returns False when the word is split into several tokens, altered, or
    when `segment` returns no tokens at all (previously an IndexError).
    """
    return segment(input_string) == [input_string]

In [None]:
def get_segmented_words(words):
    """Map each word of `words` that `segment` splits or alters to its segmented form.

    The result is built in a local dict, so each call returns only the
    entries for its own batch instead of a dict that keeps growing inside
    a worker process. Each word is segmented once.

    Args:
        words: iterable (with a length) of word strings.

    Returns:
        dict mapping word -> space-joined segmentation. Words that
        `segment` returns unchanged, or for which it returns no tokens,
        are left out.
    """
    batch_segmented = {}
    for word in tqdm(words, total=len(words)):
        pieces = segment(word)
        if pieces and pieces != [word]:
            batch_segmented[word] = ' '.join(pieces)
    return batch_segmented

In [None]:
# Accumulates word -> segmented form across all batches.
segmented_words = dict()

# NOTE(review): the pool size of 80 is hard-coded; confirm it suits the host machine.
with ProcessPoolExecutor(80) as exe:
    for response in exe.map(get_segmented_words, all_words_batches, chunksize=15):
        segmented_words.update(response)

In [None]:
print(len(all_words))

In [None]:
print(len(segmented_words))

In [None]:
len(all_words) - len(segmented_words)

#### Replace Segmented Words

In [None]:
def replace_segmented_words(data, segmented_words_dict):
    """Rewrite column "pd" of `data`, swapping words for their segmented form.

    Words that are keys of `segmented_words_dict` are replaced by the mapped
    value; all other words are kept. Returns one string per row.
    """
    rewritten = []
    for _, row in tqdm(data.iterrows(), total=len(data)):
        tokens = [segmented_words_dict[token] if token in segmented_words_dict else token
                  for token in row["pd"].split()]
        rewritten.append(' '.join(tokens))
    return rewritten

In [None]:
# Overwrite the cleaned descriptions with their segmented-word version.
category_data["pd"] = replace_segmented_words(
    category_data, 
    segmented_words
)

### Remove Non-English Words

In [None]:
from spellchecker import SpellChecker

In [None]:
# Spell checker whose vocabulary is used below as the English-word test.
spell = SpellChecker()

In [None]:
# Recompute the vocabulary after the segmentation replacement.
all_words = get_all_words(category_data["pd"].to_list())

In [None]:
len(all_words)

In [None]:
# Words the spell checker does not contain are treated as non-English.
non_eng_words = [word for word in all_words if word not in spell]

In [None]:
len(non_eng_words)

In [None]:
def remove_non_eng_words(data, non_eng_words):
    """Drop every word listed in `non_eng_words` from column "pd" of `data`.

    Args:
        data: DataFrame with the description text in column "pd".
        non_eng_words: iterable of words to remove.

    Returns:
        List with one filtered description per row, in row order.
    """
    # Build the lookup set once: membership tests against a long list
    # inside the per-word loop would make this quadratic.
    words_to_drop = set(non_eng_words)
    without_non_english_words = []
    for description in tqdm(data["pd"], total=len(data)):
        kept_words = [word for word in description.split() if word not in words_to_drop]
        without_non_english_words.append(' '.join(kept_words))
    return without_non_english_words

In [None]:
# Overwrite column "pd" with the descriptions stripped of non-English words.
category_data["pd"] = remove_non_eng_words(category_data, non_eng_words)

### Remove Stop Words

In [None]:
def remove_stopwords(text):
    """Return `text` without English stop words.

    The text is tokenized with NLTK's `word_tokenize`; tokens whose
    lower-cased form is an NLTK English stop word are dropped and the rest
    are joined by single spaces. This redefines the helper of the same name
    declared earlier in the notebook.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def remove_sw_from_data(data):
    """Remove English stop words from column "pd" of `data`.

    The stop-word set is loaded once here rather than once per row.

    Args:
        data: DataFrame with the description text in column "pd".

    Returns:
        List with one stop-word-free description per row, in row order.
    """
    stop_words = set(stopwords.words('english'))
    no_stop_words = []
    for description in tqdm(data["pd"], total=len(data)):
        kept_tokens = [token for token in word_tokenize(description)
                       if token.lower() not in stop_words]
        no_stop_words.append(' '.join(kept_tokens))
    return no_stop_words

In [None]:
# Overwrite column "pd" with the stop-word-free descriptions.
category_data["pd"] = remove_sw_from_data(category_data)

### Remove Words With <= 4 characters

In [None]:
# Keep only words longer than four characters in each description.
no_small_words = [
    " ".join(word for word in description.split(" ") if len(word) > 4)
    for description in category_data["pd"]
]

In [None]:
# Overwrite column "pd" with the descriptions stripped of short words.
category_data["pd"] = no_small_words

In [None]:
len(category_data)

In [None]:
# Keep only descriptions that still have more than two space-separated tokens.
token_counts = category_data['pd'].str.split(" ").str.len()
category_data_filtered = category_data[token_counts > 2]

In [None]:
len(category_data_filtered)

# Outliers Detection with OutOfDistribution

## Load Processed Data

In [None]:
"""
category_data_path - path to a file containing already preprocessed applications descriptions
"""
# Placeholder, like the other paths above: it must be defined (and filled in)
# because the next cell reads from it.
category_data_path = ""

In [None]:
# Replace the in-memory frame with the preprocessed descriptions stored on disk.
category_data = pd.read_csv(category_data_path, index_col=False)
# Lower-case the values of column "p" in the guidelines table.
categories_definitions_df["p"] = [category.lower() for category in categories_definitions_df["p"].to_list()]

## Prepare Environment

In [None]:
def ensure_directory_exists(path):
    """Create the directory `path` (with parents) if missing and report what happened."""
    if os.path.isdir(path):
        print(f"Directory already exists at: {path}")
        return
    os.makedirs(path)
    print(f"Directory created at: {path}")

In [None]:
# Output folder for this category's artifacts (the plots are saved here).
category_path = "data/" + current_category.lower()
ensure_directory_exists(category_path)

## Split the Dataset

In [None]:
def select_current_category_train_and_test_data(data_df):
    """Split `data_df` on its boolean "is_popular_app" column.

    Rows flagged True form the training set, rows flagged False the test
    set. Both frames are explicit copies because the notebook later adds a
    row and a column to them.

    Returns:
        (train_df, test_df), each keeping the row labels of `data_df`.
    """
    to_use_for_train_df = data_df[data_df["is_popular_app"] == True].copy()
    to_use_for_test_df = data_df[data_df["is_popular_app"] == False].copy()

    return to_use_for_train_df, to_use_for_test_df

In [None]:
# Popular apps form the training set; the remaining apps are scored as test data.
to_use_for_train_df, to_use_for_test_df = select_current_category_train_and_test_data(category_data)

In [None]:
# Assign to train data the category definition
# Column "p" of the guidelines table was lower-cased above, so compare
# against the lower-cased category name.
category_definition = categories_definitions_df[
    categories_definitions_df["p"] == current_category.lower()
]
# The training frame keeps the row labels of the unfiltered data, so
# `len(...)` may equal an existing label and overwrite that row. Use a
# label past the current maximum (assumes an integer index).
new_row_label = 0 if to_use_for_train_df.empty else to_use_for_train_df.index.max() + 1
# Append the category definition as one extra training row.
to_use_for_train_df.loc[new_row_label] = [current_category,
                                          category_definition["id"].tolist()[0],
                                          current_category,
                                          current_category,
                                          True,
                                          category_definition["pd"].tolist()[0]]

## Getting Data Embeddings

In [None]:
# BERT checkpoint used to embed the descriptions.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [None]:
# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Inference only: switch the model to evaluation mode.
model.eval()

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    `model_output[0]` holds the token embeddings; `attention_mask` is
    expanded to that size and used as weights. The divisor is clamped to
    1e-9 so a fully masked sequence yields zeros instead of dividing by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts

In [None]:
def get_embeddings(data):
    """Embed the "pd" text of every row of `data` with the loaded BERT model.

    Each description is tokenized on its own (with truncation), run through
    `model` without gradients and mean-pooled over its tokens.

    Args:
        data: DataFrame with the text in column "pd" and a key in column "p".

    Returns:
        (embeddings, pkns_and_embeddings):
        embeddings is an array with one row per input row (an empty 1-D
        array when `data` has no rows); pkns_and_embeddings maps each "p"
        value to its pooled embedding. If a "p" value repeats, the last
        row wins.
    """
    description_embeddings = []
    pkns_and_embeddings = dict()

    for _, row in tqdm(data.iterrows(), total=len(data)):
        description = row['pd']
        pkn = row['p']
        encoded_description = tokenizer(description,
                                        padding=True,
                                        truncation=True,
                                        return_tensors='pt').to(device)
        with torch.no_grad():
            model_description_output = model(**encoded_description)

        pooled_description = mean_pooling(model_description_output,
                                          encoded_description['attention_mask']).cpu().numpy()
        # `pooled_description` has one row per sequence; extend adds that row.
        description_embeddings.extend(pooled_description)
        pkns_and_embeddings[pkn] = pooled_description

    # Stack the per-description vectors. The previous single-array
    # `np.concatenate(..., axis=1)` was a no-op that failed on empty input.
    embeddings = np.array(description_embeddings)

    return embeddings, pkns_and_embeddings

In [None]:
# Embeddings of the training rows (popular apps plus the category definition).
train_embeddings, train_pkns_and_embeddings = get_embeddings(to_use_for_train_df)

In [None]:
# Embeddings of the test rows, which are scored for outliers below.
test_embeddings, test_pkns_and_embeddings = get_embeddings(to_use_for_test_df)

In [None]:
import gc
import torch

# The BERT model and tokenizer are no longer needed; free their memory.
del tokenizer
del model
gc.collect()
# Touch the CUDA helpers only when a GPU is present: the notebook supports
# a CPU fallback, and `torch.cuda.ipc_collect()` initialises CUDA.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

## Compute the Outliers

In [None]:
from cleanlab.rank import find_top_issues

In [None]:
def get_ood_scores(train_embeddings, test_embeddings):
    """Fit cleanlab's `OutOfDistribution` on the train embeddings and score both sets.

    Returns (train_scores, test_scores).
    """
    detector = OutOfDistribution()
    scores_on_train = detector.fit_score(features=train_embeddings)
    scores_on_test = detector.score(features=test_embeddings)
    return scores_on_train, scores_on_test

In [None]:
# Outlier scores for both splits; the detector is fitted on the training embeddings only.
train_outlier_scores, test_outliers_scores = get_ood_scores(train_embeddings, 
                                                            test_embeddings)

In [None]:
# Test samples scoring below this percentile of the train scores are flagged as outliers.
percentile_value = 5
threshold = np.percentile(train_outlier_scores, percentile_value)
threshold

In [None]:
# Derive the plotted cutoff from `percentile_value` so the red line always
# matches the threshold used to flag outliers. The variable name is
# unchanged in case other cells refer to it.
fifth_percentile = np.percentile(train_outlier_scores, percentile_value)

# Plot outlier_score distributions and the percentile cutoff
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# Shared x-range so the two histograms are directly comparable.
plt_range = [min(train_outlier_scores.min(), test_outliers_scores.min()),
             max(train_outlier_scores.max(), test_outliers_scores.max())]
axes[0].hist(train_outlier_scores, range=plt_range, bins=50)
axes[0].set(title='train_outlier_scores distribution', ylabel='Frequency')
axes[0].axvline(x=fifth_percentile, color='red', linewidth=2)
axes[1].hist(test_outliers_scores, range=plt_range, bins=50)
axes[1].set(title='test_outlier_scores distribution', ylabel='Frequency')
axes[1].axvline(x=fifth_percentile, color='red', linewidth=2)

plt.show()

In [None]:
# Sorting the argsort result gives the positions 0..n-1 in their original
# order, so the ids below follow the row order of the test set.
sorted_ids = np.arange(len(test_outliers_scores), dtype=np.intp)
outlier_scores = test_outliers_scores[sorted_ids]
below_threshold = outlier_scores < threshold
outlier_ids = sorted_ids[below_threshold] # descriptions flagged as outliers
good_samples_ids = sorted_ids[outlier_scores >= threshold]

In [None]:
# Summary of the split produced by the threshold.
print("Number of good samples in test:", len(good_samples_ids))
print("Number of identified outliers in test:", len(outlier_ids))

In [None]:
# The ids are positions within the test set, hence positional `iloc` indexing.
selected_outlier_subset = to_use_for_test_df.iloc[outlier_ids.tolist()]
selected_good_samples_subset = to_use_for_test_df.iloc[good_samples_ids.tolist()]

In [None]:
# Embeddings of the flagged outliers, in the same order as `selected_outlier_subset`.
selected_outlier_subset_embeddings = test_embeddings[outlier_ids]

# Cluster Outliers

In [None]:
def get_config_pairs(list1, list2, list3):
    """Return every (a, b, c) combination with a from list1, b from list2, c from list3.

    Tuples are ordered with list1 varying slowest and list3 fastest.
    """
    return [(nn, msv, msc)
            for nn in list1
            for msv in list2
            for msc in list3]

In [None]:
def cluster_outliers_no_plots(min_size, min_samples, metr, outliers_embeddings, 
                     train_outlier_scores, current_category):
    """Cluster `outliers_embeddings` with HDBSCAN and return the labels.

    `max_cluster_size` is set to half the number of embeddings.
    `train_outlier_scores` and `current_category` are accepted but not used.
    """
    hdbscan_params = {
        "min_cluster_size": min_size,
        "min_samples": min_samples,
        "max_cluster_size": len(outliers_embeddings)/2,
        "metric": metr,
    }
    fitted = hdbscan.HDBSCAN(**hdbscan_params)
    fitted.fit(outliers_embeddings)
    return fitted.labels_

In [None]:
def evaluate_umap(selected_outlier_subset_embeddings, 
                  n_neighbors_values,
                  metr,
                  already_did_pairs,
                  min_size_values=(2, 3, 5, 10, 15, 20, 30, 50, 100),
                  min_samples_values=(2, 3, 5, 10, 15, 20, 30, 50, 100)
                  ):
    """Grid-search UMAP + HDBSCAN settings and score each clustering.

    For every metric in `metr` and every (n_neighbors, min_cluster_size,
    min_samples) combination not listed in `already_did_pairs`, the
    embeddings are projected to 2-D with UMAP and clustered with HDBSCAN.
    When more than one label is produced, the silhouette score and the
    Davies-Bouldin index are printed and the silhouette score is stored.

    Args:
        selected_outlier_subset_embeddings: feature matrix to project and cluster.
        n_neighbors_values: UMAP `n_neighbors` candidates.
        metr: iterable of metric names, used for both UMAP and HDBSCAN.
        already_did_pairs: combinations to skip.
        min_size_values: HDBSCAN `min_cluster_size` candidates.
        min_samples_values: HDBSCAN `min_samples` candidates.

    Returns:
        dict mapping (n_neighbors, min_size, min_samples, metric) to the
        silhouette score.
    """
    silhouette_scores = dict()
    to_do_pairs_list = get_config_pairs(n_neighbors_values, min_size_values, min_samples_values)
    pairs_to_process = [pair for pair in to_do_pairs_list
                        if pair not in already_did_pairs]

    # The projection depends only on (n_neighbors, metric) and uses a fixed
    # random_state, so it is computed once and reused for every HDBSCAN setting.
    umap_cache = dict()

    for m in metr:
        for n, min_size, min_samples in tqdm(pairs_to_process, total=len(pairs_to_process)):
            if (n, m) not in umap_cache:
                reducer = UMAP(n_neighbors=n, 
                               n_components=2,
                               metric=m,
                               random_state=42)
                umap_cache[(n, m)] = reducer.fit_transform(selected_outlier_subset_embeddings)
            embedding_umap = umap_cache[(n, m)]

            # NOTE: `train_outlier_scores` and `current_category` are read
            # from the notebook's global scope.
            labels = cluster_outliers_no_plots(
                        min_size,
                        min_samples, 
                        m,
                        embedding_umap, 
                        train_outlier_scores,
                        current_category
            )

            # Both metrics need at least two distinct labels. The HDBSCAN
            # noise label (-1), if present, counts as one of them.
            if len(set(labels)) > 1:
                score = silhouette_score(embedding_umap, labels)
                db_index = davies_bouldin_score(embedding_umap, labels)
                print("n =", n, 
                      "| min_size =", min_size, 
                      "| min_samples =", min_samples,
                      "| metric =", m,
                      "| sil score =", score,
                      "| davies index =", db_index)
                silhouette_scores[(n, min_size, min_samples, m)] = score
            
    return silhouette_scores

In [None]:
# Search space for UMAP's n_neighbors and the metrics to try.
# NOTE: `metric` is rebound to a single metric name once the best configuration is unpacked below.
n_neighbors_values = [2, 3, 5, 10, 15, 20, 30, 50, 100]
metric = ["euclidean"]

In [None]:
# Configurations to skip. `pairs_list` was never defined in this notebook,
# so start with an empty list: nothing has been evaluated yet.
already_did_pairs = []
scores = evaluate_umap(selected_outlier_subset_embeddings, 
                       n_neighbors_values,
                       metric, 
                       already_did_pairs)

In [None]:
# Configuration with the highest silhouette score; `metric` now holds a single metric name.
max_score = max(scores, key=scores.get)
nn, min_size, min_samples, metric = max_score

In [None]:
max_score

In [None]:
scores[max_score]

In [None]:
# All evaluated configurations ordered from best to worst silhouette score.
sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

In [None]:
sorted_scores

# Get clusters based on the highest silhouette score

In [None]:
def plot_clusters(train_outlier_scores,
                  current_category,
                  min_size, min_samples,
                  outliers_embeddings,
                  clusterer):
    """Plot the score histogram, the clustered 2-D embedding and the HDBSCAN tree.

    The figure is saved as a JPEG and a PDF under `category_path`. The
    function also reads `threshold`, `category_path` and `percentile_value`
    from the notebook's global scope.

    Args:
        train_outlier_scores: scores drawn in the left-hand histogram.
        current_category: category name used in titles and file names.
        min_size, min_samples: HDBSCAN settings, used only in the file names.
        outliers_embeddings: array whose first two columns are plotted.
        clusterer: fitted HDBSCAN object providing `labels_` and `condensed_tree_`.
    """
    cluster_labels = clusterer.labels_
    
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(30, 15))
    plt_range = [train_outlier_scores.min(), train_outlier_scores.max()]

    axes[0].hist(train_outlier_scores, range=plt_range, bins=50)
    axes[0].set(title=current_category + " Outlier Scores Distribution", ylabel='Frequency')
    axes[0].axvline(x=threshold, color='red', linewidth=2)

    x_plot, y_plot = outliers_embeddings[:, 0], outliers_embeddings[:, 1]
    clusters_no = len(set(cluster_labels))
    cmap = plt.get_cmap('turbo', clusters_no)
    colors = cmap(np.linspace(0, 1, clusters_no))

    for i in np.unique(cluster_labels):
        if i != -1:
            in_cluster = cluster_labels == i
            # `color=` rather than `c=`: a bare RGBA row passed as `c` is
            # read as four per-point values when the cluster has exactly
            # four points.
            axes[1].scatter(x_plot[in_cluster], y_plot[in_cluster],
                            label=f"cluster {i}", color=colors[i])
    
    # Points HDBSCAN left unassigned (label -1) are drawn faintly in gray.
    is_noise = cluster_labels == -1
    axes[1].scatter(x_plot[is_noise], y_plot[is_noise],
                    label="outliers", color="gray", alpha=0.15)
    axes[1].set(title=current_category + " UMAP Clustered Outliers")
    axes[1].legend()

    clusterer.condensed_tree_.plot(select_clusters=True, 
                                   selection_palette=colors,
                                   axis=axes[2])

    axes[2].set(title=current_category + " HDBSCAN Condensed Tree")

    plt.tight_layout()
    # Both output files share the same stem.
    file_stem = category_path + "/" + current_category + "_" + str(min_size) + "_" + str(min_samples) + "_" + str(percentile_value) + "_plot"
    plt.savefig(file_stem + ".jpeg", format='jpeg', dpi=300) 
    plt.savefig(file_stem + ".pdf")  
    print("Plots and PDF saved!")

In [None]:
def cluster_outliers(min_size, min_samples, metr, outliers_embeddings, 
                     train_outlier_scores, current_category):
    """Cluster `outliers_embeddings` with HDBSCAN, plot the result, return the labels.

    `max_cluster_size` is set to half the number of embeddings. The plots
    are produced (and saved) by `plot_clusters`.
    """
    hdbscan_params = {
        "min_cluster_size": min_size,
        "min_samples": min_samples,
        "max_cluster_size": len(outliers_embeddings)/2,
        "metric": metr,
    }
    fitted = hdbscan.HDBSCAN(**hdbscan_params)
    fitted.fit(outliers_embeddings)

    plot_clusters(train_outlier_scores, current_category,
                  min_size, min_samples,
                  outliers_embeddings, fitted)

    return fitted.labels_

In [None]:
# Re-create the projection of the best configuration, including the metric
# selected by the grid search (`metric` holds a single metric name here).
umap_fit = UMAP(n_components=2, n_neighbors=nn, metric=metric, random_state=SEED)
selected_outlier_subset_embeddings_umap = umap_fit.fit_transform(selected_outlier_subset_embeddings)

In [None]:
# Cluster the projected outliers with the best configuration and save the plots.
# NOTE(review): the test scores are passed as the `train_outlier_scores`
# argument, so the histogram shows the test distribution; confirm intended.
cluster_labels = cluster_outliers(
    min_size, min_samples, metric,
    selected_outlier_subset_embeddings_umap, 
    test_outliers_scores, current_category)

In [None]:
# Attach the HDBSCAN label of each flagged description.
selected_outlier_subset["cluster_labels"] = cluster_labels

In [None]:
selected_outlier_subset["cluster_labels"].value_counts().to_dict()

# Inspect Outliers Clusters using LLMs

In [None]:
import torch, gc
# Free memory before loading the language model.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import time
import random
import deepspeed
import pandas as pd
from huggingface_hub import login
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map

# NOTE(review): 'login_token' looks like a placeholder; supply a real Hugging Face
# token here and avoid committing it with the notebook.
login(token = 'login_token')

In [None]:
# Copy of the flagged outliers without the "ic" and "is_popular_app" columns.
outliers_data_for_llm = selected_outlier_subset.drop(columns=["ic", "is_popular_app"])

In [None]:
# Maps each cluster label to its number of descriptions; its keys drive the generation loop.
cluster_samples_distribution = outliers_data_for_llm["cluster_labels"].value_counts().to_dict()

## Load Model & Tokenizer

In [None]:
# Instruction-tuned Llama checkpoint used to summarise each cluster.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

In [None]:
# Left padding is requested twice: once at construction and once on the attribute.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
tokenizer.padding_side = "left"

In [None]:
# Load the weights in bfloat16 and let `device_map="auto"` decide their placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

## Initialize System Prompt

In [None]:
# System message sent with every cluster request; it asks for one JSON
# object with a single "Summary" field.
SYSTEM_PROMPT = """
 You are an expert in summarizing Android applications' standard functionalities and usage scope based on their descriptions. Your goal is to evaluate the overall functions and scopes of applications from a given set of applications' descriptions, highlighting their standard functionalities in a concise and accessible manner.

 Guidelines: 
 1. Neutral Summary: 
 - For all applications descriptions provided, give one single functionality-based summary.
 - Provide a concise and unbiased summary of the applications' functionalities and usage scope. 
 - Avoid assumptions or unsupported conclusions. Maintain a neutral and constructive tone.

 2. Data Focus:
 - Analyze applications' descriptions to generate a general summary based on their standard functionalities.

 3. Applications Descriptions Weighting:
 - Consider each application description equally, regardless of the word count, and asses the described functionalities points consistently. Avoid giving disproportionate weight to longer application descriptions. 

 4. Tone: 
 - Adopt a neutral and slightly cautious tone when summarizing the application's general functionalities.
 - Ensure the overall summary remains professional, fair, and approachable.

 5. Simplicity:
 - Use simple, accessible language without technical jargon or complex vocabulary.

 6. Conciseness:
 - Provide insights concisely, focusing on essential details and functionalities.

 Output Requirements (JSON):
 Provide the overall application functionalities summary strictly in the following JSON format:
 {
 "Summary": "[Short and concise general description of applications' functionalities and scope capturing the essential information and their common scope, around 100-200 words]"
 }
"""

In [None]:
gc.collect()
# Touch the CUDA helpers only when a GPU is present;
# `torch.cuda.ipc_collect()` initialises CUDA.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [None]:
# At most this many descriptions of a cluster are placed in the prompt.
max_cluster_len = 600
# At most this many distinct words are kept per description.
max_words_no = 20

In [None]:
def process_cluster_data(cluster_data, max_words_no):
    """Shorten each description to its first `max_words_no` distinct words.

    Duplicate words are dropped while keeping first-occurrence order, so
    the output is the same on every run. The previous `set()`-based
    version picked words in hash order, which varies between interpreter
    sessions.

    Args:
        cluster_data: iterable of description strings.
        max_words_no: maximum number of distinct words kept per description.

    Returns:
        List of shortened descriptions, in input order.
    """
    new_cluster_data = []
    for data in cluster_data:
        # dict.fromkeys de-duplicates while preserving insertion order.
        distinct_words = list(dict.fromkeys(data.split(" ")))
        new_cluster_data.append(" ".join(distinct_words[:max_words_no]))
    return new_cluster_data

## Generate Responses 

In [None]:
# Maps each cluster label to the summary text extracted from the model's answer.
llama_responses = dict()

In [None]:
for cluster_id in tqdm(cluster_samples_distribution, total=len(cluster_samples_distribution)):
    print("Cluster ID:", cluster_id)
    gc.collect()
    # Touch the CUDA helpers only when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    start_time = time.time()
    
    cluster_data = selected_outlier_subset[selected_outlier_subset["cluster_labels"] == cluster_id]["pd"].tolist()
    processed_data = process_cluster_data(cluster_data, max_words_no)
    # Shuffle so the descriptions kept for the prompt are a random sample.
    random.shuffle(processed_data)

    USER_PROMPT = "\n".join(processed_data[:max_cluster_len])
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
    # add_generation_prompt=True appends the assistant header so the model
    # answers instead of continuing the user turn.
    formatted_text = tokenizer.apply_chat_template(messages,
                                                   tokenize=False,
                                                   add_generation_prompt=True)

    # The chat template already inserts the special tokens; do not add them twice.
    input_ids = tokenizer(formatted_text,
                          return_tensors="pt",
                          add_special_tokens=False).to(model.device)
    prompt_length = input_ids["input_ids"].shape[1]
    # Fall back to EOS when the tokenizer defines no padding token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    # NOTE(review): top_p/top_k presumably have no effect because sampling
    # is not enabled (do_sample is unset); confirm and drop them if so.
    output_ids = model.generate(input_ids["input_ids"], 
                                attention_mask=input_ids["attention_mask"],
                                max_new_tokens=256,
                                num_beams=5,
                                early_stopping=True,
                                top_p=0.9,
                                top_k=50,
                                eos_token_id=tokenizer.eos_token_id,
                                pad_token_id=pad_token_id)
    # Decode only the newly generated tokens. Decoding the whole sequence
    # also returns the prompt, whose own "Summary" template could be picked
    # up by the split below.
    response = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
    llama_responses[cluster_id] = response.split("\"Summary\": ")[-1]
    end_time = time.time()

In [None]:
# `cluster_data` is a plain list (built with `.tolist()`), so slice it
# instead of calling the DataFrame-only `.head()`.
cluster_data[:20]

In [None]:
# Print every cluster label with its summary, each followed by a separator line.
for cluster in llama_responses:
    print(cluster, llama_responses[cluster])
    print("______________________________________________________________________________________")

In [None]:
# Drop the references to the language model and its tokenizer.
del model 
del tokenizer

In [None]:
gc.collect()
# Touch the CUDA helpers only when a GPU is present;
# `torch.cuda.ipc_collect()` initialises CUDA.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()