# **Data Preparation**

In [1]:
!pip install transformers wandb gensim sentencepiece -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.4/311.4 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import json
import string
import re
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch.nn as nn
from torch.cuda.amp import autocast
import multiprocessing
from gensim.models import Word2Vec
import wandb
from google.colab import userdata
import gdown

In [3]:
file_id = '1LoIkGczZJZVTz88_xJg3aBDWpALEqGla'
url = f'https://drive.google.com/uc?export=download&id={file_id}'

gdown.download(url, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1LoIkGczZJZVTz88_xJg3aBDWpALEqGla
From (redirected): https://drive.google.com/uc?export=download&id=1LoIkGczZJZVTz88_xJg3aBDWpALEqGla&confirm=t&uuid=993236c9-e439-47b5-81b0-e599c4660555
To: /content/umid.json
100%|██████████| 381M/381M [00:07<00:00, 51.7MB/s]


'umid.json'

In [4]:
gdown.download_folder('https://drive.google.com/drive/folders/1dWP_krhq_jSZdxmYN4gCQXDpLfmAWN7l?usp=sharing', output='uzbek_xlm_roberta_model')

Retrieving folder contents


Processing file 1K6y8qGq-yPU32LeWhafyj9ZRmL3CeCGB config.json
Processing file 1v62TaVs1gWlWuyFYAC9k0QXdJq3mh-KU model.safetensors


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1K6y8qGq-yPU32LeWhafyj9ZRmL3CeCGB
To: /content/uzbek_xlm_roberta_model/config.json
100%|██████████| 709/709 [00:00<00:00, 2.27MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1v62TaVs1gWlWuyFYAC9k0QXdJq3mh-KU
From (redirected): https://drive.google.com/uc?id=1v62TaVs1gWlWuyFYAC9k0QXdJq3mh-KU&confirm=t&uuid=93f3a0a7-2675-47b8-b281-861d321c1f71
To: /content/uzbek_xlm_roberta_model/model.safetensors
100%|██████████| 2.24G/2.24G [00:47<00:00, 47.5MB/s]
Download completed


['uzbek_xlm_roberta_model/config.json',
 'uzbek_xlm_roberta_model/model.safetensors']

In [5]:
gdown.download_folder('https://drive.google.com/drive/folders/1UDLQbCEkdS5DWKokzl-NFqxHlBZ6MHK_?usp=sharing', output='uzbek_xlm_roberta_tokenizer')

Retrieving folder contents


Processing file 10obthWQsAOOTtr8Qx7O_qaJhQkn-upVG sentencepiece.bpe.model
Processing file 1Dao4duhJz3lGdjYMMTG1zztfXSJQWLPb special_tokens_map.json
Processing file 1wzGPT51CaZGBmGbUIW54Jf7LV6MBe0Mr tokenizer_config.json


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=10obthWQsAOOTtr8Qx7O_qaJhQkn-upVG
To: /content/uzbek_xlm_roberta_tokenizer/sentencepiece.bpe.model
100%|██████████| 5.07M/5.07M [00:00<00:00, 31.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Dao4duhJz3lGdjYMMTG1zztfXSJQWLPb
To: /content/uzbek_xlm_roberta_tokenizer/special_tokens_map.json
100%|██████████| 280/280 [00:00<00:00, 1.06MB/s]
Downloading...
From: https://drive.google.com/uc?id=1wzGPT51CaZGBmGbUIW54Jf7LV6MBe0Mr
To: /content/uzbek_xlm_roberta_tokenizer/tokenizer_config.json
100%|██████████| 1.17k/1.17k [00:00<00:00, 4.10MB/s]
Download completed


['uzbek_xlm_roberta_tokenizer/sentencepiece.bpe.model',
 'uzbek_xlm_roberta_tokenizer/special_tokens_map.json',
 'uzbek_xlm_roberta_tokenizer/tokenizer_config.json']

In [6]:
model_dir = "/content/uzbek_xlm_roberta_model"
tokenizer_dir = "/content/uzbek_xlm_roberta_tokenizer"

tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_dir)
model_embedding = XLMRobertaModel.from_pretrained(model_dir).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
model_embedding.eval()

Some weights of XLMRobertaModel were not initialized from the model checkpoint at /content/uzbek_xlm_roberta_model and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwi

In [7]:
# Load dataset
df_roberta = pd.read_json('umid.json')

In [8]:
# Remove unnecessary text in all rows
df_roberta['related_texts'] = df_roberta['related_texts'].apply(lambda texts: [t.replace('Hujjatga taklif yuborish Audioni tinglash', "") for t in texts])

In [15]:
def preprocess_okoz(df, number):
    """
    Function to preprocess 'okoz_text' in a dataframe based on a specified number.
    """
    # Check if 'okoz_text' contains the number followed by ".0"
    def contains_number_point_zero(okoz_list):
        return any(f'{number}.0' in item for item in okoz_list)

    # Filter by length, keep elements starting with number, and process text
    def process_okoz_text(text_list):
        filtered = [text for text in text_list if len(text) > 4 and text.startswith(f'{number}')]
        processed = [text.split('/')[1].strip() if '/' in text else text for text in filtered]
        return processed

    # General text preprocessing
    def preprocess_text(text):
        text = text.lower().translate(str.maketrans('', '', string.punctuation))
        return text

    # Remove duplicates and semicolons
    def clean_text(text_list):
        cleaned = list(set(text_list))
        return [text.replace(';', '') for text in cleaned]

    # Apply the filter and preprocessing functions
    df_filtered = df[df['okoz_text'].apply(contains_number_point_zero)].copy()  # Always use .copy() to avoid warnings
    df_filtered.loc[:, 'okoz_text'] = df_filtered['okoz_text'].apply(process_okoz_text)  # Use .loc to modify
    df_filtered = df_filtered[df_filtered['okoz_text'].apply(lambda x: len(x) > 0)].copy()  # Re-filter, then .copy()
    df_filtered.loc[:, 'okoz_text'] = df_filtered['okoz_text'].apply(clean_text)  # Use .loc for assignment
    df_filtered = df_filtered[df_filtered['okoz_text'].apply(len) == 1].copy()  # Keep rows where list has exactly 1 element
    df_filtered.loc[:, 'okoz_text'] = df_filtered['okoz_text'].apply(lambda x: ' '.join(x).replace("]", ""))   # Flatten the list to a string
    df_filtered = df_filtered.reset_index(drop=True)

    return df_filtered

In [16]:
df_04  = preprocess_okoz(df_roberta, "04")

In [32]:
df_04.okoz_text.value_counts(sort=df_04['okoz_text'])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [33]:
# prompt: sort df_04.okoz_text.value_counts alphabetically

sorted_counts = df_04.okoz_text.value_counts().sort_index()
print(sorted_counts)


okoz_text
04.01.00.00 Umumiy qoidalar                                                                                                33
04.02.00.00 Nikoh                                                                                                          54
04.03.00.00 Bolalarning kelib chiqishini belgilash                                                                          2
04.04.00.00 Ota-onalar va bolalarning hamda boshqa shaxslarning huquq va majburiyatlari. Aliment majburiyatlari            29
04.05.00.00 Ota-onalar qarovisiz qolgan bolalarni tarbiyalash shakllari                                                   113
04.06.00.00 Fuqarolik holati dalolatnomalarini qayd qilish (shuningdek, 03.02.08.00ga qarang)                             252
04.07.00.00 Familiya, ism va ota ismini o‘zgartirish                                                                        3
04.08.00.00 Oila, onalik, otalik va bolalikni himoya qilish va ijtimoiy qo‘llab-quvvatlash                  

In [None]:
def preprocess_all_numbers(df, start=1, end=21):
    """
    Function to preprocess the dataframe for all numbers from start to end.
    Concatenates results into one DataFrame.
    """
    numbers = [f'{i:02}' for i in range(start, end + 1)]
    df_list = [preprocess_okoz(df, number) for number in numbers]

    # Concatenate all processed DataFrames
    df_all = pd.concat(df_list, ignore_index=True)

    return df_all

In [None]:
df_all = preprocess_all_numbers(df_roberta)

In [18]:
def preprocess_text(df):
    """
    Preprocesses the 'related_texts' column in the DataFrame by cleaning and standardizing the text.
    """

    def clean_individual_text(text):
        # Lowercase the text and replace curly quotes/backticks with standard single quotes
        text = text.lower()
        text = text.replace('‘', "'").replace('’', "'").replace('`', "'")

        # Remove all non-alphabetical characters except periods, single quotes, and spaces
        text = re.sub(r'[^a-z\.\'\s]', '', text)

        # Replace multiple spaces with a single space and strip leading/trailing spaces
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    # Join list of texts into a single string, then apply the cleaning function
    df.loc[:, 'related_texts'] = df['related_texts'].apply(lambda x: ' '.join(x))  # Use .loc to avoid the warning
    df.loc[:, 'related_texts'] = df['related_texts'].apply(clean_individual_text)  # Apply cleaning function

    return df

In [19]:
df_new = preprocess_text(df_04)

In [20]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   okoz_text      587 non-null    object
 1   related_texts  587 non-null    object
dtypes: object(2)
memory usage: 9.3+ KB


# **Roberta Embedding**

In [24]:
def preprocess_data_embedd(df, model, tokenizer, device, batch_size=64, max_length=256):
    # Clean texts without adding a new column
    df['related_texts'] = df['related_texts'].fillna('')  # Ensure no NaN values

    # Function to embed the document
    def embed_document(df, model, tokenizer, device, batch_size=64, max_length=256):
        texts = df['related_texts'].tolist()

        # Tokenize texts and prepare DataLoader
        inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
        dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
        dataloader = DataLoader(dataset, batch_size=batch_size, pin_memory=True, num_workers=2)

        embeddings_list = []
        model.to(device)
        for batch in dataloader:
            input_ids, attention_mask = [t.to(device, non_blocking=True) for t in batch]
            with torch.amp.autocast(device_type='cuda'):  # Mixed precision for efficiency
                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=attention_mask)
                    embeddings = torch.mean(outputs.last_hidden_state, dim=1)
            embeddings_list.extend(embeddings.cpu().numpy())  # Move to CPU memory to free GPU memory

        df['embeddings'] = embeddings_list
        return df

        # Assign label 1 to "04.01.00.00 Umumiy qoidalar" and 0 to others
    df['label'] = (df['okoz_text'] == '04.01.00.00 Umumiy qoidalar').astype(int)

    # Embed document
    df = embed_document(df, model, tokenizer, device, batch_size, max_length)

    # Drop the unnecessary column
    df = df.drop(columns=['okoz_text'])

    # Print GPU memory usage stats
    print(f"Memory allocated: {torch.cuda.memory_allocated(device) / 1024**3:.2f} GB")
    print(f"Memory reserved: {torch.cuda.memory_reserved(device) / 1024**3:.2f} GB")

    return df

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [26]:
df_embed = preprocess_data_embedd(df_new, model_embedding, tokenizer, device)

Memory allocated: 2.09 GB
Memory reserved: 3.91 GB


In [82]:
df_embed.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,55
1,33


In [81]:
# prompt: only take 10 percent of label 0

import pandas as pd
# Separate the DataFrame based on label
df_label_0 = df_embed[df_embed['label'] == 0]
df_label_1 = df_embed[df_embed['label'] == 1]

# Sample 10% of label 0 data
df_label_0_sampled = df_label_0.sample(frac=0.1, random_state=42)

# Concatenate the sampled label 0 data with label 1 data
df_embed = pd.concat([df_label_0_sampled, df_label_1], ignore_index=True)


In [None]:
df_embed.to_csv('roberta_embedded_with_text.csv', index=False)
df_embed.to_json('roberta_embedded_with_texts.json', index=False)

!cp "/content/roberta_embedded_with_text.csv" "/content/drive/MyDrive/Lexuz Project/Embeddings/Roberta_Uzbek/Embedded Data"
!cp "/content/roberta_embedded_with_texts.json" "/content/drive/MyDrive/Lexuz Project/Embeddings/Roberta_Uzbek/Embedded Data"

In [None]:
df_embeded = df_embed[['embeddings', 'label']]

In [None]:
df_embeded.to_csv('roberta_embedded.csv', index=False)
df_embeded.to_json('roberta_embedding.json', index=False)

!cp "/content/roberta_embedded.csv" "/content/drive/MyDrive/Lexuz Project/Embeddings/Roberta_Uzbek/Embedded Data"
!cp "/content/roberta_embedding.json" "/content/drive/MyDrive/Lexuz Project/Embeddings/Roberta_Uzbek/Embedded Data"

In [None]:
label_df = pd.DataFrame(list(label_to_numeric.items()), columns=['okoz_text', 'label'])

In [None]:
label_df.to_csv('label_to_numeric.csv', index=False)

!cp "/content/label_to_numeric.csv" "/content/drive/MyDrive/Lexuz Project/Embeddings/Roberta_Uzbek"

In [83]:
def prep_model(df, batch_size):
    X = np.array(df['embeddings'].tolist())
    y = np.array(df['label'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
    test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [84]:
train_loader, test_loader = prep_model(df_embed, 32)

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = df_embed['label'].nunique()
input_dim = np.array(df_embed['embeddings'][0]).shape[0]

In [61]:
def train_model(train_loader, model, criterion, optimizer, scheduler, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # Convert labels to float
            labels = labels.unsqueeze(1).float() # Ensure labels are float and have shape [batch_size, 1]
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        scheduler.step(epoch_loss)
        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.6f}")

    print("Training Complete")



---



In [92]:
def train_model(train_loader, model, criterion, optimizer, scheduler, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1).float())  # Ensure labels are float and reshaped correctly
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)  # Accumulate the loss based on the batch size

        epoch_loss = running_loss / len(train_loader.dataset)
        scheduler.step(epoch_loss)

        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.6f}")

    print("Training Complete")

In [93]:
def evaluate_model(model, test_loader, criterion):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)

            # Apply sigmoid and threshold to convert logits to binary predictions
            predictions = torch.sigmoid(outputs) > 0.5  # Threshold at 0.5
            loss = criterion(outputs, labels.unsqueeze(1).float())

            total_loss += loss.item() * inputs.size(0)
            correct += (predictions.squeeze().long() == labels).sum().item()  # Count correct predictions
            total += labels.size(0)

    accuracy = 100 * correct / total
    avg_loss = total_loss / total
    print(f"Test Accuracy: {accuracy:.2f}%, Test Loss: {avg_loss:.4f}")

    return accuracy, avg_loss

In [94]:
import torch.optim as optim

class OptimizedNN(nn.Module):
    def __init__(self, input_dim):
        super(OptimizedNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.4)
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)

    def forward(self, x):
        x = torch.nn.functional.leaky_relu(self.bn1(self.fc1(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn2(self.fc2(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = self.fc3(x)  # No activation here, as BCEWithLogitsLoss will handle it
        return x

# Initialize model, criterion, optimizer, and scheduler
model = OptimizedNN(input_dim).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Training and evaluation functions remain the same
train_model(train_loader, model, criterion, optimizer, scheduler, num_epochs=100)
accuracy, avg_loss = evaluate_model(model, test_loader, criterion)

Epoch 1/100, Loss: 0.633812
Epoch 11/100, Loss: 0.164766
Epoch 21/100, Loss: 0.078261
Epoch 31/100, Loss: 0.027230
Epoch 41/100, Loss: 0.021581
Epoch 51/100, Loss: 0.039039
Epoch 61/100, Loss: 0.086096
Epoch 71/100, Loss: 0.015216
Epoch 81/100, Loss: 0.022515
Epoch 91/100, Loss: 0.027602
Training Complete
Test Accuracy: 92.86%, Test Loss: 0.3942


In [97]:

# prompt: show that which question it wrongly predict and what was correct label without embedding column

import pandas as pd
import torch

# Assuming 'df_embed' is your DataFrame and 'test_loader' is your DataLoader

def get_wrong_predictions(model, test_loader, df_embed):
    """
    Identifies wrongly predicted instances and returns a DataFrame with the incorrect predictions and true labels.
    """
    model.eval()
    wrong_predictions = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predictions = torch.sigmoid(outputs) > 0.5

            for i in range(len(labels)):
                if predictions[i].item() != labels[i].item():
                    # Get the index of the current instance in the original DataFrame
                    index = (len(labels) * test_loader.batch_size) + i
                    if index < len(df_embed):  # Ensure index is within bounds
                        # Get the related_texts and label from the original DataFrame
                        related_texts = df_embed.iloc[index]['related_texts']
                        true_label = df_embed.iloc[index]['label']
                        predicted_label = predictions[i].item()
                        wrong_predictions.append({'related_texts': related_texts, 'true_label': true_label, 'predicted_label': predicted_label})

    return pd.DataFrame(wrong_predictions)

# Get the DataFrame with wrong predictions
wrong_predictions_df = get_wrong_predictions(model, test_loader, df_embed)

# Display the results


In [98]:
wrong_predictions_df



---



In [95]:
# prompt: show that which question it wrongly predict and what was correct label without embedding column

import pandas as pd
import torch

# Assuming 'df_embed' is your DataFrame and 'test_loader' is your DataLoader

def get_wrong_predictions(model, test_loader, df_embed):
    """
    Identifies wrongly predicted instances and returns a DataFrame with the incorrect predictions and true labels.
    """
    model.eval()
    wrong_predictions = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predictions = torch.sigmoid(outputs) > 0.5

            for i in range(len(labels)):
                if predictions[i].item() != labels[i].item():
                    # Get the index of the current instance in the original DataFrame
                    index = (len(labels) * test_loader.batch_size) + i
                    if index < len(df_embed):  # Ensure index is within bounds
                        # Get the related_texts and label from the original DataFrame
                        related_texts = df_embed.iloc[index]['related_texts']
                        true_label = df_embed.iloc[index]['label']
                        predicted_label = predictions[i].item()
                        wrong_predictions.append({'related_texts': related_texts, 'true_label': true_label, 'predicted_label': predicted_label})

    return pd.DataFrame(wrong_predictions)

# Get the DataFrame with wrong predictions
wrong_predictions_df = get_wrong_predictions(model, test_loader, df_embed)

# Display the results


In [96]:
wrong_predictions_df

In [66]:
def evaluate_model(model, test_loader, criterion, df):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0
    incorrect_predictions = []

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(test_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # Ensure labels are float and have shape [batch_size, 1]
            labels = labels.unsqueeze(1).float()
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    avg_loss = total_loss / total
    print(f"Test Accuracy: {accuracy:.2f}%, Test Loss: {avg_loss:.4f}")

    return accuracy, avg_loss, incorrect_predictions

In [67]:
class AdvancedNNSecond(nn.Module):
    def __init__(self, input_dim):
        super(AdvancedNNSecond, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # Increased neurons
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 128)
        self.fc6 = nn.Linear(128, 1) # This layer outputs a single value
        self.dropout = nn.Dropout(0.4)  # Adjusted dropout rate
        self.bn1 = nn.BatchNorm1d(128)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(512)
        self.bn4 = nn.BatchNorm1d(256)
        self.bn5 = nn.BatchNorm1d(128)

    def forward(self, x):
        x = torch.nn.functional.leaky_relu(self.bn1(self.fc1(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn2(self.fc2(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn3(self.fc3(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn4(self.fc4(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn5(self.fc5(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = self.fc6(x) # Remove sigmoid activation. BCEWithLogitsLoss applies sigmoid internally
        return x

In [68]:
model = AdvancedNNSecond(input_dim).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [69]:
train_model(train_loader, model, criterion, optimizer,scheduler, num_epochs=100)
accuracy, avg_loss, incorrect_predictions = evaluate_model(model, test_loader, criterion, df_embed)

Epoch 1/100, Loss: 0.489243
Epoch 11/100, Loss: 0.128841
Epoch 21/100, Loss: 0.094373
Epoch 31/100, Loss: 0.091612
Epoch 41/100, Loss: 0.047859
Epoch 51/100, Loss: 0.063846
Epoch 61/100, Loss: 0.041919
Epoch 71/100, Loss: 0.047985
Epoch 81/100, Loss: 0.031439
Epoch 91/100, Loss: 0.032553
Training Complete
Test Accuracy: 2823.60%, Test Loss: 0.0471


In [None]:
# prompt: why Test Accuracy: 2823.60%,

# The issue likely stems from a mismatch between your model's output and the expected labels.
# BCEWithLogitsLoss applies sigmoid internally, so you don't need to apply sigmoid in your model's forward pass.
# Additionally, ensure that your labels are in the correct format (e.g., float values).
# Check if your labels are in the correct format.
# Review the model's architecture and ensure it's suitable for the binary classification task.
# Consider adding more layers or adjusting the number of neurons in your model.
# Try different activation functions.
# Experiment with different optimizers and learning rates.
# Adjust the batch size and number of epochs.


In [47]:
class AdvancedNNSecond(nn.Module):
    def __init__(self, input_dim):
        super(AdvancedNNSecond, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # Increased neurons
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 128)
        self.fc6 = nn.Linear(128, 1) # This layer outputs a single value
        self.dropout = nn.Dropout(0.4)  # Adjusted dropout rate
        self.bn1 = nn.BatchNorm1d(128)
        self.bn2 = nn.BatchNorm1d(256)
        self.bn3 = nn.BatchNorm1d(512)
        self.bn4 = nn.BatchNorm1d(256)
        self.bn5 = nn.BatchNorm1d(128)

    def forward(self, x):
        x = torch.nn.functional.leaky_relu(self.bn1(self.fc1(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn2(self.fc2(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn3(self.fc3(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn4(self.fc4(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = torch.nn.functional.leaky_relu(self.bn5(self.fc5(x)), negative_slope=0.01)
        x = self.dropout(x)
        x = self.fc6(x) # Remove sigmoid activation. BCEWithLogitsLoss applies sigmoid internally
        return x

In [None]:
# prompt: make MulticlassModel1 binary classification

class BinaryMulticlassModel1(nn.Module):
    def __init__(self, input_size):
        super(BinaryMulticlassModel1, self).__init__()
        self.layer1 = nn.Linear(input_size, 512)
        self.layer2 = nn.Linear(512, 1024)
        self.layer3 = nn.Linear(1024, 512)
        self.output = nn.Linear(512, 1)  # Output layer for binary classification
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.4)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.sigmoid = nn.Sigmoid()  # Add sigmoid activation for binary output

    def forward(self, x):
        x = self.relu(self.batch_norm1(self.layer1(x)))
        x = self.dropout(x)
        x = self.relu(self.batch_norm2(self.layer2(x)))
        x = self.dropout(x)
        x = self.relu(self.layer3(x))
        x = self.dropout(x)
        x = self.output(x)
        x = self.sigmoid(x)  # Apply sigmoid activation
        return x


In [42]:
import torch.optim as optim

In [None]:
model = MulticlassModel1(input_size=input_dim, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [None]:
train_model(train_loader, model, criterion, optimizer,scheduler, num_epochs=100)
accuracy, avg_loss, incorrect_predictions = evaluate_model(model, test_loader, criterion, df_embed)

Epoch 1/100, Loss: 1.905589
Epoch 11/100, Loss: 1.494001
Epoch 21/100, Loss: 1.408900
Epoch 31/100, Loss: 1.346037
Epoch 41/100, Loss: 1.305883
Epoch 51/100, Loss: 1.269428
Epoch 61/100, Loss: 1.239488
Epoch 71/100, Loss: 1.215465
Epoch 81/100, Loss: 1.197714
Epoch 91/100, Loss: 1.174214
Training Complete
Test Accuracy: 50.50%, Test Loss: 1.4939


In [None]:
from google.colab import drive
drive.mount('/content/drive')
torch.save(model.state_dict(), '/content/drive/MyDrive/Lexuz Project/Model/Roberta_Uzbek_model/model1.pth')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class MulticlassModel2(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MulticlassModel2, self).__init__()
        self.layer1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.layer2 = nn.Linear(512, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.layer3 = nn.Linear(1024, 2048)
        self.bn3 = nn.BatchNorm1d(2048)
        self.layer4 = nn.Linear(2048, 1024)
        self.bn4 = nn.BatchNorm1d(1024)
        self.layer5 = nn.Linear(1024, 512)
        self.bn5 = nn.BatchNorm1d(512)
        self.output = nn.Linear(512, num_classes)

        self.elu = nn.ELU()
        self.dropout1 = nn.Dropout(p=0.3)
        self.dropout2 = nn.Dropout(p=0.4)

    def forward(self, x):
        x = self.elu(self.bn1(self.layer1(x)))
        x = self.dropout1(x)
        x = self.elu(self.bn2(self.layer2(x)))
        x = self.dropout2(x)
        x = self.elu(self.bn3(self.layer3(x)))
        x = self.dropout2(x)
        x = self.elu(self.bn4(self.layer4(x)))
        x = self.dropout2(x)
        x = self.elu(self.bn5(self.layer5(x)))
        x = self.dropout1(x)
        x = self.output(x)
        return x

In [None]:
model = MulticlassModel2(input_size=input_dim, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [None]:
train_model(train_loader, model, criterion, optimizer,scheduler, num_epochs=100)
accuracy, avg_loss, incorrect_predictions = evaluate_model(model, test_loader, criterion, df_embed)

Epoch 1/100, Loss: 1.870201
Epoch 11/100, Loss: 1.474535
Epoch 21/100, Loss: 1.371375
Epoch 31/100, Loss: 1.299886
Epoch 41/100, Loss: 1.239641
Epoch 51/100, Loss: 1.186262
Epoch 61/100, Loss: 1.144104
Epoch 71/100, Loss: 1.107287
Epoch 81/100, Loss: 1.074353
Epoch 91/100, Loss: 1.041186
Training Complete
Test Accuracy: 47.98%, Test Loss: 1.5971


In [None]:
from google.colab import drive
drive.mount('/content/drive')
torch.save(model.state_dict(), '/content/drive/MyDrive/Lexuz Project/Model/Roberta_Uzbek_model/model2.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class MulticlassModel3(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MulticlassModel3, self).__init__()
        self.layer1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.layer2 = nn.Linear(512, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.layer3 = nn.Linear(1024, 2048)
        self.bn3 = nn.BatchNorm1d(2048)
        self.layer4 = nn.Linear(2048, 1024)
        self.bn4 = nn.BatchNorm1d(1024)
        self.layer5 = nn.Linear(1024, 512)
        self.bn5 = nn.BatchNorm1d(512)
        self.output = nn.Linear(512, num_classes)

        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.3)
        self.dropout2 = nn.Dropout(p=0.4)

    def forward(self, x):
        x = self.relu(self.bn1(self.layer1(x)))
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.layer2(x)))
        x = self.dropout2(x)
        x = self.relu(self.bn3(self.layer3(x)))
        x = self.dropout2(x)
        x = self.relu(self.bn4(self.layer4(x)))
        x = self.dropout2(x)
        x = self.relu(self.bn5(self.layer5(x)))
        x = self.dropout1(x)
        x = self.output(x)
        return x

In [None]:
model = MulticlassModel3(input_size=input_dim, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [None]:
train_model(train_loader, model, criterion, optimizer,scheduler, num_epochs=100)
accuracy, avg_loss, incorrect_predictions = evaluate_model(model, test_loader, criterion, df_embed)

Epoch 1/100, Loss: 1.917979
Epoch 11/100, Loss: 1.471885
Epoch 21/100, Loss: 1.393305
Epoch 31/100, Loss: 1.345253
Epoch 41/100, Loss: 1.311792
Epoch 51/100, Loss: 1.289091
Epoch 61/100, Loss: 1.268927
Epoch 71/100, Loss: 1.252443
Epoch 81/100, Loss: 1.240743
Epoch 91/100, Loss: 1.226028
Training Complete
Test Accuracy: 49.19%, Test Loss: 1.5185


In [None]:
from google.colab import drive
drive.mount('/content/drive')
torch.save(model.state_dict(), '/content/drive/MyDrive/Lexuz Project/Model/Roberta_Uzbek_model/model3.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@title Enter Text
text = "O‘zbekiston Respublikasi Vazirlar Mahkamasi: davlat hisobidan yuridik yordam ko‘rsatish sohasida yagona davlat siyosati amalga oshirilishini ta’minlaydi; davlat hisobidan yuridik yordam ko‘rsatish sohasidagi davlat dasturlarini tasdiqlaydi va ularning amalga oshirilishini ta’minlaydi; advokatlar tomonidan davlat hisobidan ko‘rsatilgan yuridik yordam uchun haq to‘lash miqdori va tartibini belgilaydi; davlat hisobidan yuridik yordam ko‘rsatish sohasidagi normativ-huquqiy hujjatlarni o‘z vakolatlari doirasida qabul qiladi." #@param {type:"string"}


In [None]:
import torch
import re
import torch.nn.functional as F

def predict_class(text, model, tokenizer, label_to_numeric, device):
    """
    Predicts the class of the given text using the trained model.

    Args:
        text: The input text.
        model: The trained model.
        tokenizer: The tokenizer used for the model.
        label_to_numeric: A dictionary mapping labels to numeric values.
        device: The device to use for computation (CPU or GPU).

    Returns:
        A list of top 3 predicted classes with their probabilities.
    """

    cleaned_text = text.lower()
    cleaned_text = cleaned_text.replace('‘', "'").replace('’', "'").replace('`', "'")
    cleaned_text = re.sub(r'[^a-z\.\'\s]', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    inputs = tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True, max_length=256)

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model_embedding(**inputs)
        embeddings = torch.mean(outputs.last_hidden_state, dim=1)

    with torch.no_grad():
        model.eval()
        logits = model(embeddings)
        probabilities = F.softmax(logits, dim=1)

    top3_probs, top3_indices = torch.topk(probabilities, 3)

    numeric_to_label = {v: k for k, v in label_to_numeric.items()}
    top3_predictions = [(numeric_to_label[idx.item()], prob.item()) for idx, prob in zip(top3_indices[0], top3_probs[0])]

    return top3_predictions

predictions = predict_class(text, model, tokenizer, label_to_numeric, device)

print("Top 3 Predictions:")
for label, prob in predictions:
    print(f"Class: {label}, Probability: {prob:.4f}")

# **Word2Vec Embedding**

In [None]:
processed_docs = df_new['related_texts'].tolist()

In [None]:
flattened_docs = [word for word in processed_docs]

In [None]:
sentences = [doc.split() for doc in flattened_docs]

In [None]:
model_word2vec = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=1, workers=multiprocessing.cpu_count())

if '<OOV>' not in model_word2vec.wv:
    model_word2vec.wv.add_vector('<OOV>', np.zeros(model_word2vec.vector_size))

model_word2vec.save("word2vec_uzbek.model")

model_word2vec = Word2Vec.load("word2vec_uzbek.model")



In [None]:
# prompt: free ram and gpu

import torch

# Clear GPU cache
torch.cuda.empty_cache()

# Garbage collection
import gc
gc.collect()


531

In [None]:
!cp "/content/word2vec_uzbek.model" "/content/drive/MyDrive/Lexuz Project/Embeddings/Word2Vec"

In [None]:
model_word2vec = Word2Vec.load("word2vec_uzbek.model")

def preprocess_data(df, model):

    def get_embedding(text, model, oov_token='<OOV>'):
        tokens = text.split()
        embeddings = []
        for token in tokens:
            if token in model.wv:
                embeddings.append(model.wv[token])
            else:
                embeddings.append(model.wv[oov_token])
        if embeddings:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(model.vector_size)

    # Apply the custom Word2Vec model to get embeddings
    df.loc[:, 'embeddings'] = df['related_texts'].apply(lambda text: get_embedding(text, model))

    return df

In [None]:
df_word_2_vec = preprocess_data(df_new, model_word2vec)

In [None]:
label_to_numeric = {label: idx for idx, label in enumerate(df_word_2_vec['okoz_text'].unique())}
df_word_2_vec['label'] = df_word_2_vec['okoz_text'].map(label_to_numeric)

In [None]:
label_df = pd.DataFrame(list(label_to_numeric.items()), columns=['okoz_text', 'label'])

label_df.to_csv('label_to_numerics.csv', index=False)

!cp "/content/label_to_numeric.csv" "/content/drive/MyDrive/Lexuz Project/Embeddings/Word2Vec"

cp: cannot stat '/content/label_to_numeric.csv': No such file or directory


In [None]:
!cp "/content/label_to_numerics.csv" "/content/drive/MyDrive/Lexuz Project/Embeddings/Word2Vec"

In [None]:
df_word_2_vec.drop(columns=["okoz_text"],inplace=True)

In [None]:
df_word_2_vec.to_csv('word2vec_embedded_with_text.csv', index=False)
df_word_2_vec.to_json('word2vec_embedded_with_texts.json', index=False)

In [None]:
!cp "/content/word2vec_embedded_with_text.csv" "/content/drive/MyDrive/Lexuz Project/Embeddings/Word2Vec/Embedded Data"
!cp "/content/word2vec_embedded_with_texts.json" "/content/drive/MyDrive/Lexuz Project/Embeddings/Word2Vec/Embedded Data"

In [None]:
df_word_2_vec = df_word_2_vec[['embeddings', 'label']]

In [None]:
df_word_2_vec.to_csv('word2vec_embedded.csv', index=False)
df_word_2_vec.to_json('word2vec_embedding.json', index=False)

In [None]:
!cp "/content/word2vec_embedded.csv" "/content/drive/MyDrive/Lexuz Project/Embeddings/Word2Vec/Embedded Data"
!cp "/content/word2vec_embedding.json" "/content/drive/MyDrive/Lexuz Project/Embeddings/Word2Vec/Embedded Data"

In [None]:
def prep_model(df, batch_size):
    X = np.array(df['embeddings'].tolist())
    y = np.array(df['label'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
    test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [None]:
train_loader, test_loader = prep_model(df_word_2_vec, 32)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = df_word_2_vec['label'].nunique()
input_dim = np.array(df_word_2_vec['embeddings'][0]).shape[0]

In [None]:
def train_model(train_loader, model, criterion, optimizer, scheduler, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        scheduler.step(epoch_loss)
        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.6f}")

    print("Training Complete")

In [None]:
def evaluate_model(model, test_loader, criterion, df):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0
    incorrect_predictions = []

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(test_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    avg_loss = total_loss / total
    print(f"Test Accuracy: {accuracy:.2f}%, Test Loss: {avg_loss:.4f}")

    return accuracy, avg_loss, incorrect_predictions

In [None]:
class MulticlassModel1(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MulticlassModel1, self).__init__()
        self.layer1 = nn.Linear(input_size, 512)
        self.layer2 = nn.Linear(512, 1024)
        self.layer3 = nn.Linear(1024, 512)
        self.output = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.4)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.batch_norm2 = nn.BatchNorm1d(1024)

    def forward(self, x):
        x = self.relu(self.batch_norm1(self.layer1(x)))
        x = self.dropout(x)
        x = self.relu(self.batch_norm2(self.layer2(x)))
        x = self.dropout(x)
        x = self.relu(self.layer3(x))
        x = self.dropout(x)
        x = self.output(x)
        return x

In [None]:
model = MulticlassModel1(input_size=input_dim, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [None]:
train_model(train_loader, model, criterion, optimizer,scheduler, num_epochs=100)
accuracy, avg_loss, incorrect_predictions = evaluate_model(model, test_loader, criterion, df_word_2_vec)

Epoch 1/100, Loss: 1.912522
Epoch 11/100, Loss: 1.540153
Epoch 21/100, Loss: 1.454182
Epoch 31/100, Loss: 1.403857
Epoch 41/100, Loss: 1.368512
Epoch 51/100, Loss: 1.338873
Epoch 61/100, Loss: 1.320523
Epoch 71/100, Loss: 1.301454
Epoch 81/100, Loss: 1.225904
Epoch 91/100, Loss: 1.207040
Training Complete
Test Accuracy: 50.48%, Test Loss: 1.5216


In [None]:
from google.colab import drive
drive.mount('/content/drive')
torch.save(model.state_dict(), '/content/drive/MyDrive/Lexuz Project/Model/Word2Vec_model/model1.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
