# **Model 1**

## **Libraries**

In [None]:
import pandas as pd
import json
import string
import re
import gdown
from gensim.models import Word2Vec
import multiprocessing
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import ast

In [None]:
# Fetch the raw corpus from Google Drive. gdown is already imported in the
# imports cell at the top of the notebook, so it is not imported again here.
file_id = '1rr8qaWBra7JtXFd5AcrtB3H147DtQ7VE'
url = f'https://drive.google.com/uc?export=download&id={file_id}'

# Name the local file explicitly: the loading cell reads gathered.json, so the
# filename must not depend on whatever name the remote file happens to carry.
gdown.download(url, output='gathered.json', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1rr8qaWBra7JtXFd5AcrtB3H147DtQ7VE
From (redirected): https://drive.google.com/uc?export=download&id=1rr8qaWBra7JtXFd5AcrtB3H147DtQ7VE&confirm=t&uuid=37a284b7-f99c-4e03-8605-56c7e9ce4a05
To: /content/gathered.json
100%|██████████| 382M/382M [00:10<00:00, 37.7MB/s]


'gathered.json'

## **Preprocessing text**

In [None]:
# The download cell saves gathered.json into the working directory; reading it
# from there (instead of a hard-coded /content/ prefix) also works outside Colab.
df = pd.read_json('gathered.json')

In [None]:
# Fixed caption that is removed from every text fragment
# (NOTE(review): looks like page UI text scraped along with the content — confirm).
text_to_remove = 'Hujjatga taklif yuborish Audioni tinglash'

# Rebuild each row's fragment list in one pass. This avoids chained indexing
# (df.related_texts[i][j] = ...), which relies on a 0..n-1 index and re-resolves
# the column on every inner iteration.
df['related_texts'] = df['related_texts'].apply(
    lambda fragments: [fragment.replace(text_to_remove, "") for fragment in fragments]
)

In [None]:
def preprocess(text_list):
    """Tokenise every text in ``text_list``.

    Each text is lower-cased, every run of non-word characters is collapsed
    into a single space, and the result is split on whitespace.

    Args:
        text_list: iterable of strings.

    Returns:
        A list holding one token list per input text, in input order.
    """
    return [
        re.sub(r'\W+', ' ', raw_text.lower()).split()
        for raw_text in text_list
    ]

# Replace each row's list of raw text fragments with the list of token lists
# returned by preprocess (one token list per fragment).
df.loc[:, 'related_texts'] = df['related_texts'].apply(preprocess)

In [None]:
# One entry per row of df: that row's `related_texts` value.
processed_docs = list(df['related_texts'])

In [None]:
# Flatten one level: every element of the result is a single entry of some
# row's `related_texts` list (a sentence for Word2Vec), not an individual word.
flattened_docs = [sentence for document in processed_docs for sentence in document]

## **Word2Vec**

In [None]:
# Train 300-dimensional embeddings on the flattened corpus with a context window of 5,
# using one worker per CPU core. min_count=1 means no token is dropped for being rare.
# NOTE(review): no seed is set and several workers are used, so vectors presumably
# differ from run to run — confirm whether reproducibility matters here.
model = Word2Vec(sentences=flattened_docs, vector_size=300, window=5, min_count=1, workers=multiprocessing.cpu_count())

# Register an all-zero vector under '<OOV>' so out-of-vocabulary lookups have a key to fall back on.
if '<OOV>' not in model.wv:
    model.wv.add_vector('<OOV>', np.zeros(model.vector_size))

# Persist the trained model (including the '<OOV>' entry) next to the notebook.
model.save("uzbek_law_word2vec.model")

# NOTE(review): reloading immediately after saving looks redundant, and the next
# code cell loads the same file again — confirm whether this line can be dropped.
model = Word2Vec.load("uzbek_law_word2vec.model")

## **Preparing labels and embeddings**

In [None]:
# Load the Word2Vec model saved by the training cell; it is passed to
# preprocess_data below for the embedding lookups.
model = Word2Vec.load("uzbek_law_word2vec.model")

def preprocess_data(df, model, min_class_count=1000, output_path='corrected.json'):
    """Turn raw documents into one (label, embedding) row per text fragment.

    Pipeline:
      1. Keep documents with at least one ``okoz_text`` entry containing '04.0'.
      2. Reduce each document's entries to cleaned section names and keep only
         documents that end up with exactly one name.
      3. Emit one record per element of the document's ``related_texts`` list
         and save the records as JSON to ``output_path``.
      4. Map section names to integer labels: names with fewer than
         ``min_class_count`` records share label 0, the others are numbered
         from 1 in the order ``value_counts()`` yields them (most frequent first).
      5. Embed every fragment as the mean of its tokens' Word2Vec vectors.

    Args:
        df: DataFrame with an ``okoz_text`` column (list of str per row) and a
            ``related_texts`` column (list of str per row).
        model: object exposing ``wv`` (supports ``in`` and ``[]`` and holds an
            '<OOV>' key) and ``vector_size``, e.g. the trained Word2Vec model.
        min_class_count: minimum number of records a section needs to get its
            own label. Defaults to 1000.
        output_path: where the exploded records are written. Defaults to
            'corrected.json'.

    Returns:
        ``(frame, label_to_numeric)`` where ``frame`` has the columns ``label``
        and ``embeddings`` and ``label_to_numeric`` maps section name -> label.
    """
    okoz_root = "04.00.00.00 Oila qonunchiligi /"

    def contains_04_0(okoz_list):
        return any('04.0' in item for item in okoz_list)

    def normalise_okoz(okoz_list):
        # Keep only entries longer than 4 characters that start with '04'.
        items = [text for text in okoz_list if len(text) > 4 and text.startswith('04')]
        # Drop the root-section prefix, surrounding brackets and whitespace.
        items = [text.replace(okoz_root, "").strip("[]").strip() for text in items]
        # Keep only the part before the first '/' (a no-op when there is none,
        # because the entry is already stripped).
        items = [text.split('/')[0].strip() for text in items]
        # De-duplicate BEFORE removing semicolons so "A;" and "A" still count as
        # two entries and the document is rejected by the exactly-one filter.
        items = list(dict.fromkeys(items))
        return [text.replace(';', '') for text in items]

    def preprocess_text(text):
        # Tokenise the same way `preprocess` does for the Word2Vec training
        # corpus (lower-case, non-word runs -> one space). Merely deleting ASCII
        # punctuation glues hyphenated or apostrophised words together and keeps
        # characters such as « » attached, sending those tokens to <OOV>.
        return re.sub(r'\W+', ' ', text.lower()).strip()

    def get_embedding(text, model, oov_token='<OOV>'):
        vectors = [
            model.wv[token] if token in model.wv else model.wv[oov_token]
            for token in text.split()
        ]
        if vectors:
            return np.mean(vectors, axis=0)
        # No tokens at all: fall back to an all-zero vector.
        return np.zeros(model.vector_size)

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the caller's DataFrame.
    df = df[df['okoz_text'].apply(contains_04_0)].copy()
    df['okoz_text'] = df['okoz_text'].apply(normalise_okoz)
    # Exactly one section name per document (this also drops empty lists).
    df = df[df['okoz_text'].apply(len) == 1].reset_index(drop=True)
    df['okoz_text'] = df['okoz_text'].apply(' '.join)

    # One record per text fragment, carrying its document's section name.
    data = [
        {"okoz_text": okoz_name, "related_texts": related_text}
        for okoz_name, related_list in zip(df['okoz_text'], df['related_texts'])
        for related_text in related_list
    ]
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    # Build the frame straight from the records instead of re-parsing the file
    # just written; explicit columns keep an empty result well-formed.
    df = pd.DataFrame(data, columns=['okoz_text', 'related_texts'])
    df['related_texts'] = df['related_texts'].apply(preprocess_text)

    class_counts = df['okoz_text'].value_counts()

    label_to_numeric = {}
    next_label = 1
    for label, count in class_counts.items():
        if count < min_class_count:
            # Rare sections are pooled into the shared label 0.
            label_to_numeric[label] = 0
        else:
            label_to_numeric[label] = next_label
            next_label += 1

    df['label'] = df['okoz_text'].map(label_to_numeric)

    # Apply the custom Word2Vec model to get embeddings
    df['embeddings'] = df['related_texts'].apply(lambda text: get_embedding(text, model))

    df = df.drop(columns=['related_texts', 'okoz_text'])

    return df, label_to_numeric

# NOTE(review): "full_okoz.json" is not created by any cell visible in this notebook
# (the download cell produces gathered.json) — confirm where this file comes from.
# `df` is rebound twice here: first to the raw records, then to the frame returned
# by preprocess_data (columns `label` and `embeddings`).
df = pd.read_json("full_okoz.json")
df, label_to_numeric = preprocess_data(df, model)


In [None]:
def prep_model(df, batch_size):
    """Split the embedded data 85/15 and wrap both parts in DataLoaders.

    Args:
        df: DataFrame with an ``embeddings`` column (one vector per row) and a
            ``label`` column.
        batch_size: batch size used by both loaders.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    features = np.array(df['embeddings'].tolist())
    targets = np.array(df['label'])

    # Fixed random_state keeps the split identical between runs.
    parts = train_test_split(features, targets, test_size=0.15, random_state=42)
    X_train, X_test, y_train, y_test = parts

    def as_dataset(inputs, labels):
        # float32 features / int64 class indices, as CrossEntropyLoss expects.
        return TensorDataset(
            torch.tensor(inputs, dtype=torch.float32),
            torch.tensor(labels, dtype=torch.long),
        )

    train_loader = DataLoader(as_dataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(as_dataset(X_test, y_test), batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

# Build the train/test loaders with mini-batches of 32 samples.
train_loader, test_loader = prep_model(df, batch_size=32)

## **Model Training**

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Labels are integers starting at 0, so the classifier needs max(label) + 1 outputs.
# Deriving this from the data keeps it in sync with the labelling threshold,
# which a hard-coded class count would not.
num_classes = int(df['label'].max()) + 1
# Embedding dimensionality, read from the first row by position.
input_dim = np.asarray(df['embeddings'].iloc[0]).shape[0]

In [None]:
def train_model(train_loader, model, criterion, optimizer, scheduler, num_epochs=100):
    """Train ``model`` in place for ``num_epochs`` epochs.

    Args:
        train_loader: DataLoader yielding ``(inputs, labels)`` batches.
        model: the ``nn.Module`` to optimise; it is switched to training mode.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimiser over ``model``'s parameters.
        scheduler: scheduler stepped once per epoch with the mean training loss
            (e.g. ``ReduceLROnPlateau``).
        num_epochs: number of passes over ``train_loader``.

    Returns:
        None. The sample-weighted mean loss is printed for the first epoch and
        every tenth epoch after it.
    """
    # Send batches to wherever the model lives instead of relying on a
    # notebook-level `device` variable; a parameter-free model falls back to CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the mean.
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)
        scheduler.step(epoch_loss)
        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.6f}")

    print("Training Complete")

In [None]:
def evaluate_model(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` and report accuracy and mean loss.

    Args:
        model: the ``nn.Module`` to evaluate; it is switched to eval mode.
        test_loader: DataLoader yielding ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(accuracy, avg_loss)``: accuracy as a percentage (0-100) and the
        sample-weighted mean loss. Both are also printed.
    """
    # Send batches to wherever the model lives (CPU or GPU) instead of relying
    # on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # Weight by batch size so a smaller final batch does not skew the mean.
            total_loss += loss.item() * inputs.size(0)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    avg_loss = total_loss / total
    print(f"Test Accuracy: {accuracy:.2f}%, Test Loss: {avg_loss:.4f}")
    return accuracy, avg_loss

## **Model 1 (Accuracy: 87.4%)**

In [None]:
class MulticlassModel1(nn.Module):
    """Feed-forward classifier: input -> 512 -> 1024 -> 512 -> num_classes.

    The first two hidden layers use batch normalisation; every hidden layer is
    followed by ReLU and dropout (p=0.4). The output is raw logits.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        narrow, wide = 512, 1024

        # Creation order fixes parameter order and the order in which the
        # weight initialisers draw random numbers, so it is kept stable.
        self.layer1 = nn.Linear(input_size, narrow)
        self.layer2 = nn.Linear(narrow, wide)
        self.layer3 = nn.Linear(wide, narrow)
        self.output = nn.Linear(narrow, num_classes)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.4)
        self.batch_norm1 = nn.BatchNorm1d(narrow)
        self.batch_norm2 = nn.BatchNorm1d(wide)

    def forward(self, x):
        """Return the class logits for a batch ``x``."""
        hidden = self.layer1(x)
        hidden = self.dropout(self.relu(self.batch_norm1(hidden)))
        hidden = self.layer2(hidden)
        hidden = self.dropout(self.relu(self.batch_norm2(hidden)))
        # The third hidden layer has no batch normalisation.
        hidden = self.dropout(self.relu(self.layer3(hidden)))
        return self.output(hidden)

In [None]:
# Model 1 training setup: AdamW with light weight decay; the learning rate is
# halved after 3 epochs without improvement of the monitored loss.
model = MulticlassModel1(input_size=input_dim, num_classes=num_classes)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [None]:
# Train model 1 for 100 epochs, then score it on the held-out split.
train_model(
    train_loader=train_loader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=100,
)
evaluate_model(model=model, test_loader=test_loader, criterion=criterion)

Epoch 1/100, Loss: 0.957749
Epoch 11/100, Loss: 0.479379
Epoch 21/100, Loss: 0.379683
Epoch 31/100, Loss: 0.313936
Epoch 41/100, Loss: 0.263371
Epoch 51/100, Loss: 0.253834
Epoch 61/100, Loss: 0.225275
Epoch 71/100, Loss: 0.160433
Epoch 81/100, Loss: 0.150791
Epoch 91/100, Loss: 0.152734
Training Complete
Test Accuracy: 87.40%, Test Loss: 0.5379


(87.40068104426788, 0.537924875691674)

## **Model 2 (Accuracy: 84.9%)**

In [None]:
class MulticlassModel2(nn.Module):
    """Deeper feed-forward classifier: input -> 512 -> 1024 -> 2048 -> 1024 -> 512 -> num_classes.

    Every hidden layer is Linear -> BatchNorm1d -> ELU -> Dropout. The first and
    last hidden layers use dropout p=0.3, the three middle ones p=0.4. The
    output is raw logits.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Creation order fixes parameter order and the order in which the
        # weight initialisers draw random numbers, so it is kept stable.
        self.layer1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.layer2 = nn.Linear(512, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.layer3 = nn.Linear(1024, 2048)
        self.bn3 = nn.BatchNorm1d(2048)
        self.layer4 = nn.Linear(2048, 1024)
        self.bn4 = nn.BatchNorm1d(1024)
        self.layer5 = nn.Linear(1024, 512)
        self.bn5 = nn.BatchNorm1d(512)
        self.output = nn.Linear(512, num_classes)

        self.elu = nn.ELU()
        self.dropout1 = nn.Dropout(p=0.3)
        self.dropout2 = nn.Dropout(p=0.4)

    def forward(self, x):
        """Return the class logits for a batch ``x``."""
        # (linear, batch norm, dropout) for each hidden stage, in execution order.
        stages = (
            (self.layer1, self.bn1, self.dropout1),
            (self.layer2, self.bn2, self.dropout2),
            (self.layer3, self.bn3, self.dropout2),
            (self.layer4, self.bn4, self.dropout2),
            (self.layer5, self.bn5, self.dropout1),
        )
        for linear, norm, drop in stages:
            x = drop(self.elu(norm(linear(x))))
        return self.output(x)

In [None]:
# Model 2 training setup: plain Adam; ReduceLROnPlateau is left at its default
# factor and patience and monitors a loss to be minimised.
model = MulticlassModel2(input_size=input_dim, num_classes=num_classes)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, mode='min')

In [None]:
# Train model 2 for 31 epochs, then score it on the held-out split.
train_model(
    train_loader=train_loader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=31,
)
evaluate_model(model=model, test_loader=test_loader, criterion=criterion)

Epoch 1/31, Loss: 0.985254
Epoch 11/31, Loss: 0.594767
Epoch 21/31, Loss: 0.462706
Epoch 31/31, Loss: 0.377728
Training Complete
Test Accuracy: 84.90%, Test Loss: 0.4525


(84.90351872871737, 0.45248965185517576)