# **Installing libraries with its latest version**

In [1]:
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, confusion_matrix

# **Importing Dataset**

In [2]:
file_path='/content/drive/MyDrive/Colab Notebooks/archive.zip'

In [3]:
df=pd.read_csv(file_path,compression='zip')

In [4]:
df.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


# **Preprocessing the data**

In [5]:
df['case_text_sum'] = df["case_title"] + df["case_text"]

In [6]:
df.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text,case_text_sum
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,Dr Martens Australia Pty Ltd v Figgins Holding...


In [7]:
df=df.drop(columns=['case_text'])

In [8]:
df=df.rename(columns={'case_text_sum':'case_text'})

In [9]:
df=df.drop(columns=['case_id','case_title'])

In [10]:
df.head()

Unnamed: 0,case_outcome,case_text
0,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...
1,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...
2,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...
3,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...
4,cited,Dr Martens Australia Pty Ltd v Figgins Holding...


In [11]:
df.describe()

Unnamed: 0,case_outcome,case_text
count,24985,24809
unique,10,24764
top,cited,Gudjala People # 2 v Native Title Registrar [2...
freq,12219,5


In [12]:
df.case_outcome.value_counts()

case_outcome
cited            12219
referred to       4384
applied           2448
followed          2256
considered        1712
discussed         1024
distinguished      608
related            113
affirmed           113
approved           108
Name: count, dtype: int64

In [13]:
df.isna().sum()

case_outcome      0
case_text       176
dtype: int64

In [14]:
df = df.fillna('') #filling of null values with blanks

In [15]:
df.isna().sum()

case_outcome    0
case_text       0
dtype: int64

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# **Defining Custom Stopwords**

In [18]:
#defining custom stopwords
custom_stopwords=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', "mightn't", 'mustn', 'needn', 'shan', "shan't", 'shouldn', 'wasn', 'weren', 'won']

In [19]:
import re
from nltk.stem import PorterStemmer

def text_preprocessing(text, custom_stopwords):
    if not isinstance(text, str):
        # Handle non-string elements (e.g., return empty string, skip the sample)
        return ""

    # Lowercase the text (if it's a string)
    text = text.lower()

    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)

    #Remove numbers
    text = re.sub(r'\d+','',text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Tokenize the text (split into words)
    words = text.split()

    # Remove custom stopwords from the tokenized list
    filtered_words = [word for word in words if word not in custom_stopwords]

    # Stemming
    stemmer = PorterStemmer()
    filtered_words = [stemmer.stem(word) for word in filtered_words]

    # Join tokens back into single string
    text = ' '.join(filtered_words)

    return text
df['clean_case_text']=df['case_text'].apply(lambda x: text_preprocessing(x,custom_stopwords))

In [20]:
df.head()

Unnamed: 0,case_outcome,case_text,clean_case_text
0,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,alpin hardwood aust pti ltd v hardi pti ltd no...
1,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,black v lipovac fca alr gener principl govern ...
2,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,colgat palmol co v cusson pti ltd fcr ordinari...
3,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,dai studio pti ltd v bullett creativ pti ltd f...
4,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,dr marten australia pti ltd v figgin hold pti ...


In [21]:
df1=df.drop(columns=['case_text'])

In [22]:
df1.head()

Unnamed: 0,case_outcome,clean_case_text
0,cited,alpin hardwood aust pti ltd v hardi pti ltd no...
1,cited,black v lipovac fca alr gener principl govern ...
2,cited,colgat palmol co v cusson pti ltd fcr ordinari...
3,cited,dai studio pti ltd v bullett creativ pti ltd f...
4,cited,dr marten australia pti ltd v figgin hold pti ...


In [23]:
df1=df1.rename(columns={'clean_case_text':'case_text'})

In [24]:
df1.head()

Unnamed: 0,case_outcome,case_text
0,cited,alpin hardwood aust pti ltd v hardi pti ltd no...
1,cited,black v lipovac fca alr gener principl govern ...
2,cited,colgat palmol co v cusson pti ltd fcr ordinari...
3,cited,dai studio pti ltd v bullett creativ pti ltd f...
4,cited,dr marten australia pti ltd v figgin hold pti ...


# **Train and Test Split**

In [25]:
from sklearn.model_selection import train_test_split

df_train,df_test = train_test_split(df1,test_size=0.2, shuffle=True,random_state=42)

df_train.shape,df_test.shape

((19988, 2), (4997, 2))

In [26]:
df_train['case_outcome'].unique()

array(['applied', 'cited', 'referred to', 'considered', 'followed',
       'discussed', 'distinguished', 'affirmed', 'approved', 'related'],
      dtype=object)

# **Preprocessing the label classes**

In [27]:
# Calculate percentage of data for each category before label encoding
category_counts = df_train['case_outcome'].value_counts()
total_samples = len(df_train)

# Calculate percentage for each category
percentages = {}
for category, count in category_counts.items():
    percentage = (count / total_samples) * 100
    percentages[category] = percentage

# Print percentages
for category, percentage in percentages.items():
    print(f"{category}: {percentage:.2f}%")


cited: 48.84%
referred to: 17.64%
applied: 9.67%
followed: 9.11%
considered: 6.94%
discussed: 4.10%
distinguished: 2.43%
approved: 0.45%
related: 0.43%
affirmed: 0.41%




1.   We observe that the out of 10 classes, 5 classes (Less than 5% of data) have minimal distribution so we exclude the 5 classes having minimal data(Discussed, distinguished,approved,related,affirmed)
2.   These classes which are less than 5% in the total data set would create noise and deviate the model from proper classification.



In [28]:
# Assuming df_train contains your training data
# Calculate the percentage of each category label
label_counts = df_train['case_outcome'].value_counts(normalize=True) * 100

# Filter out labels with less than 5% representation
threshold = 5
labels_to_keep = label_counts[label_counts >= threshold].index

# Filter the training data based on the selected labels
df_train_filtered = df_train[df_train['case_outcome'].isin(labels_to_keep)]

# Optionally, you can also filter the test data using the same labels
df_test_filtered = df_test[df_test['case_outcome'].isin(labels_to_keep)]


In [29]:
df_train_filtered['case_outcome'].value_counts()

case_outcome
cited          9762
referred to    3525
applied        1933
followed       1820
considered     1388
Name: count, dtype: int64

In [30]:
df_test_filtered['case_outcome'].value_counts()

case_outcome
cited          2457
referred to     859
applied         515
followed        436
considered      324
Name: count, dtype: int64

In [31]:
df_train_filtered.isna().sum()

case_outcome    0
case_text       0
dtype: int64

# **Define label encoding**

In [32]:
encoded_dict = {"cited": 0, "referred to": 1, "applied": 2, "followed": 3,
                 "considered": 4, "discussed": 5, 'distinguished': 6,
                 "approved": 7, "related": 8, "affirmed": 9}

In [33]:
# df_train['case_outcome'] = df_train_filtered['case_outcome'].map(encoded_dict)
# df_test['case_outcome'] = df_test_filtered['case_outcome'].map(encoded_dict)

In [34]:
# Map labels to encoded values
df_train_filtered['case_outcome'] = df_train_filtered['case_outcome'].map(encoded_dict)
df_test_filtered['case_outcome'] = df_test_filtered['case_outcome'].map(encoded_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_filtered['case_outcome'] = df_train_filtered['case_outcome'].map(encoded_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_filtered['case_outcome'] = df_test_filtered['case_outcome'].map(encoded_dict)


In [35]:
df_train_filtered['case_outcome'].value_counts()

case_outcome
0    9762
1    3525
2    1933
3    1820
4    1388
Name: count, dtype: int64

In [36]:
df_test_filtered['case_outcome'].value_counts()

case_outcome
0    2457
1     859
2     515
3     436
4     324
Name: count, dtype: int64

In [37]:
# Calculate percentage for each category
percentages = (category_counts / total_samples) * 100

In [39]:
df_test_filtered['case_outcome'].value_counts(normalize=True)

case_outcome
0    0.535178
1    0.187105
2    0.112176
3    0.094968
4    0.070573
Name: proportion, dtype: float64

In [40]:
threshold = 5
labels_to_keep = label_counts[label_counts >= threshold].index

In [41]:
# Filter the training data based on the selected labels
df_train_filtered = df_train[df_train['case_outcome'].isin(labels_to_keep)]
df_test_filtered = df_test[df_test['case_outcome'].isin(labels_to_keep)]

In [42]:
df_test_filtered['case_outcome'].value_counts()

case_outcome
cited          2457
referred to     859
applied         515
followed        436
considered      324
Name: count, dtype: int64

In [43]:
# Calculate percentage of data for each category after filtering
category_counts_filtered = df_train_filtered['case_outcome'].value_counts()
total_samples_filtered = len(df_train_filtered)

In [44]:
# Calculate percentage for each category
percentages_filtered = {}
for category, count in category_counts_filtered.items():
    percentage = (count / total_samples_filtered) * 100
    percentages_filtered[category] = percentage

In [45]:

# Create smaller dataset same distributed as original data based on case_outcome percentage
subset_size = 100
subset_data = pd.DataFrame()
for label, percentage in percentages_filtered.items():
    num_samples = max(1, int((percentage / 100) * subset_size))
    print(f"Label: {label}, Percentage: {percentage}, Num Samples: {num_samples}")
    subset_data = pd.concat([subset_data, df_train_filtered[df_train_filtered['case_outcome'] == label].sample(num_samples)])

Label: cited, Percentage: 52.97373561970914, Num Samples: 52
Label: referred to, Percentage: 19.128500108530496, Num Samples: 19
Label: applied, Percentage: 10.489472541784242, Num Samples: 10
Label: followed, Percentage: 9.876275233340568, Num Samples: 9
Label: considered, Percentage: 7.5320164966355545, Num Samples: 7




1.   As we observe that the data set is distributed based on the percentage of the classes mentioned above.
2.   We create a subset of training and testing data set which is alligned to the distribution of actual dataset



In [46]:
df_test_filtered['case_outcome'].value_counts()

case_outcome
cited          2457
referred to     859
applied         515
followed        436
considered      324
Name: count, dtype: int64

In [47]:
test_data_equally_distributed = pd.DataFrame()
for label, percentage in percentages_filtered.items():
    # Check if the label is present in df_test_filtered
    if label in df_test_filtered['case_outcome'].unique():
        num_samples = max(1, int((percentage / 100) * subset_size))
        print(f"Label: {label}, Percentage: {percentage}, Num Samples: {num_samples}")
        test_data_equally_distributed = pd.concat([test_data_equally_distributed, df_test_filtered[df_test_filtered['case_outcome'] == label].sample(num_samples)])
    else:
        print(f"Label {label} not found in df_test_filtered. Skipped")

Label: cited, Percentage: 52.97373561970914, Num Samples: 52
Label: referred to, Percentage: 19.128500108530496, Num Samples: 19
Label: applied, Percentage: 10.489472541784242, Num Samples: 10
Label: followed, Percentage: 9.876275233340568, Num Samples: 9
Label: considered, Percentage: 7.5320164966355545, Num Samples: 7


In [79]:
df_test_filtered['case_outcome'].value_counts(normalize=True)

case_outcome
cited          0.535178
referred to    0.187105
applied        0.112176
followed       0.094968
considered     0.070573
Name: proportion, dtype: float64

In [48]:
df_test_filtered['case_outcome'].value_counts()

case_outcome
cited          2457
referred to     859
applied         515
followed        436
considered      324
Name: count, dtype: int64

In [81]:
list_label=["cited","referred to","applied","followed","considered "]


**Creating equally distributed test data based on the original data**

In [82]:

# Create equally distributed test data based on the original data
test_data_equally_distributed = pd.DataFrame()
for label, count in category_counts.items():
  # Ensure minimum of 1 sample even if count is less than total_samples/len(category_counts)
  num_samples = max(1, min(count, int(total_samples / len(category_counts))))
  if num_samples > 0:
    if(label in list_label):
      print(num_samples,label)
      sample_data = df_test_filtered[df_test_filtered['case_outcome'] == label].sample(num_samples, replace=True)
      test_data_equally_distributed = pd.concat([test_data_equally_distributed, sample_data])


1998 cited
1998 referred to
1933 applied
1820 followed


**Creating different sample of test data from the original data**

In [83]:
test_data_different_distribution = df_test_filtered.sample(len(df_test_filtered))


In [84]:
subset_data['encoded_outcome'] = subset_data['case_outcome'].map(encoded_dict)
test_data_equally_distributed['encoded_outcome'] = test_data_equally_distributed['case_outcome'].map(encoded_dict)
test_data_different_distribution['encoded_outcome'] = test_data_different_distribution['case_outcome'].map(encoded_dict)

In [85]:
import torch

In [53]:
# # Convert labels to tensors (PyTorch)

# y_train = torch.tensor(df_train_filtered['case_outcome'].tolist())
# y_test = torch.tensor(df_test_filtered['case_outcome'].tolist())


In [86]:
# Load tokenizer and pre-trained model (PyTorch)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)  # 5 for 5 classes


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Tokenize the input**

In [89]:
# Tokenize the input (PyTorch)
max_len = 70
encoding_train = tokenizer(
    text=subset_data.case_text.tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='pt'
)

# **Tokenize testing data**

In [90]:
encoding_test = tokenizer(
    text=test_data_equally_distributed.case_text.tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='pt'
)

# **Tokenize different set of test data**

In [91]:
encoding_test_different_distribution = tokenizer(
    text=test_data_different_distribution.case_text.tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='pt'
)

In [92]:
input_ids_train = encoding_train['input_ids']
attention_mask_train = encoding_train['attention_mask']
input_ids_test = encoding_test['input_ids']
attention_mask_test = encoding_test['attention_mask']
input_ids_test_different_distribution = encoding_test_different_distribution['input_ids']
attention_mask_test_different_distribution = encoding_test_different_distribution['attention_mask']

# **Convert Labels to Tensors(Pytorch)**

In [93]:
# Convert labels to tensors (PyTorch)
y_train = torch.tensor(subset_data['encoded_outcome'].tolist())
y_test_equally_distributed = torch.tensor(test_data_equally_distributed['encoded_outcome'].tolist())
y_test_different_distribution = torch.tensor(test_data_different_distribution['encoded_outcome'].tolist())

# **Sample a very minimal subset of the data for training**

In [94]:
subset_size = min(100, len(y_train))  # Ensure subset_size is not larger than the dataset size
indices = np.random.choice(len(y_train), subset_size, replace=False)
input_ids_train_subset = input_ids_train[indices]
attention_mask_train_subset = attention_mask_train[indices]
y_train_subset = y_train[indices]

# **Define training function**

In [95]:
# Define training function
def train(model, optimizer, train_loader, criterion, epochs=3):
    model.train()  # Set model to training mode
    for epoch in range(epochs):
        for input_ids, attention_mask, labels in train_loader:
            optimizer.zero_grad()  # Clear gradients
            outputs = model(input_ids, attention_mask=attention_mask)  # Forward pass
            loss = criterion(outputs.logits, labels)  # Calculate loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

In [96]:
batch_size = 32


In [97]:
import torch.nn as nn

In [68]:
from torch.utils.data import TensorDataset, DataLoader

# **Train the model with the minimal subset of data**

In [69]:
# Train the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # AdamW optimizer with learning rate
batch_size = 8
train_dataset = TensorDataset(input_ids_train, attention_mask_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()  # Loss function for multi-class classification
epochs = 3
train(model, optimizer, train_loader, criterion, epochs=epochs)  # Train for 3 epochs

In [70]:
for batch in train_loader:
    print(batch)
    break

[tensor([[  101,  8715,  2050,   170, 12207, 24874,  1179,  7033,   191,   185,
          4820, 24611,  9872,  6696,   181,  1204,  1181,   177,  2599,   172,
          1233,  1197,  1218,  4586,  8050, 25685,  2050,  1197,  1260, 21349,
          1336, 24034,  9610,  1179,  1260, 21349, 11166,  4930,  1321,  5318,
          3300,  2519,  1260, 21349,  8693,  2608,   188, 19756,  3818,  1352,
          6848,  2646,  5318,  1538,  1678,  3300,  1260, 21349,  1293,  6348,
          1231, 23403,  5318,  1136,  6848,  2646,  1352,  1538,  1260,   102],
        [  101,  8715,  2050, 13280,  3080,  1403,  1197,  4321, 10182,  6066,
          2149,  7033,   191, 22904,  8613,  1183,  8508,  3848,  1182,   175,
          2599,   175,  1665,  1197,  1321,  3971,  8715,  2050,  1747,  2100,
          5767,  1141,  1289,  2351,  4586,   189,  2047,  7925,  1179,  1185,
          5565,  1197,   184,  1830,  2646,  1403,  1107, 17030,  4035, 18276,
          2047,  1294, 12647,  8031,  1692,  2351,

In [71]:
# Make predictions on the training data
predictions_train = []
true_labels_train = []
with torch.no_grad():
    model.eval()
    for input_ids, attention_mask, labels in train_loader:
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        predictions_train.extend(predictions.tolist())
        true_labels_train.extend(labels.tolist())

# Calculate classification report
class_names = list(encoded_dict.keys())
classification_report_train = classification_report(true_labels_train, predictions_train, target_names=class_names, labels=range(len(class_names)))
print("Classification Report for Training Data:")
print(classification_report_train)


Classification Report for Training Data:
               precision    recall  f1-score   support

        cited       0.54      1.00      0.70        52
  referred to       0.00      0.00      0.00        19
      applied       0.00      0.00      0.00        10
     followed       0.00      0.00      0.00         9
   considered       0.00      0.00      0.00         7
    discussed       0.00      0.00      0.00         0
distinguished       0.00      0.00      0.00         0
     approved       0.00      0.00      0.00         0
      related       0.00      0.00      0.00         0
     affirmed       0.00      0.00      0.00         0

    micro avg       0.54      0.54      0.54        97
    macro avg       0.05      0.10      0.07        97
 weighted avg       0.29      0.54      0.37        97



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Evaluate the model**

In [72]:
# Evaluate the model
def evaluate_model(model, input_ids, attention_mask, labels):
    model.eval()  # Set model to evaluation mode
    predictions = []
    with torch.no_grad():
        for i in range(0, len(input_ids), batch_size):
            batch_input_ids = input_ids[i:i+batch_size]
            batch_attention_mask = attention_mask[i:i+batch_size]
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
            predictions.extend(torch.argmax(outputs.logits, axis=1).tolist())
    print("Classification Report:")
    print(classification_report(labels.tolist(), predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(labels.tolist(), predictions))

In [73]:
from sklearn.metrics import classification_report, confusion_matrix

In [98]:
evaluate_model(model, input_ids_test, attention_mask_test, y_test_equally_distributed)


Classification Report:
              precision    recall  f1-score   support

           0       0.26      0.91      0.40      1998
           1       0.00      0.00      0.00      1998
           2       0.27      0.10      0.14      1933
           3       0.00      0.00      0.00      1820
           4       0.00      0.00      0.00         0

    accuracy                           0.26      7749
   macro avg       0.10      0.20      0.11      7749
weighted avg       0.13      0.26      0.14      7749

Confusion Matrix:
[[1816    0  170    0   12]
 [1848    0  148    0    2]
 [1743    0  184    0    6]
 [1631    0  189    0    0]
 [   0    0    0    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
evaluate_model(model, input_ids_test_different_distribution, attention_mask_test_different_distribution, y_test_different_distribution)


Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.91      0.67      2457
           1       0.00      0.00      0.00       859
           2       0.11      0.09      0.10       515
           3       0.00      0.00      0.00       436
           4       0.08      0.01      0.01       324

    accuracy                           0.50      4591
   macro avg       0.15      0.20      0.16      4591
weighted avg       0.31      0.50      0.37      4591

Confusion Matrix:
[[2234    0  205    0   18]
 [ 789    0   68    0    2]
 [ 468    0   45    0    2]
 [ 391    0   45    0    0]
 [ 286    0   36    0    2]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
