In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
!pip install datasets
from datasets import Dataset




In [2]:
data = pd.read_csv('dataset.csv')
print(data.head())
print(data.shape)

categories = data['Category'].unique()
category_to_id = {category: i for i, category in enumerate(categories)}
data['label'] = data['Category'].map(category_to_id)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(f"Train Data Rows: {train_data.shape[0]}")
print(f"Train Data Columns: {train_data.shape[1]}")
print(f"Test Data Rows: {test_data.shape[0]}")
print(f"Test Data Columns: {test_data.shape[1]}")


train_dataset = Dataset.from_pandas(train_data[['Query', 'label']])
test_dataset = Dataset.from_pandas(test_data[['Query', 'label']])


print(train_dataset)
print(test_dataset)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['Query'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])



   Query ID                                              Query  \
0         1          Can I join a trade union at my workplace?   
1         2  How do I start collective bargaining with my e...   
2         3          What protections do trade unions provide?   
3         4  Are there laws governing union formation in my...   
4         5  What are the maximum working hours allowed per...   

                                      Category  
0  Right to Unionize and Collective Bargaining  
1  Right to Unionize and Collective Bargaining  
2  Right to Unionize and Collective Bargaining  
3  Right to Unionize and Collective Bargaining  
4                  Limitation on Working Hours  
(840, 3)
Train Data Rows: 672
Train Data Columns: 4
Test Data Rows: 168
Test Data Columns: 4
Dataset({
    features: ['Query', 'label', '__index_level_0__'],
    num_rows: 672
})
Dataset({
    features: ['Query', 'label', '__index_level_0__'],
    num_rows: 168
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/672 [00:00<?, ? examples/s]

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(categories))

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

results = trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.606137
2,No log,0.10113
3,No log,0.032817
4,No log,0.022504
5,No log,0.020156


In [5]:
print("Results:", results)

Results: {'eval_loss': 0.020156225189566612, 'eval_runtime': 4.8349, 'eval_samples_per_second': 34.747, 'eval_steps_per_second': 2.275, 'epoch': 5.0}


In [6]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

def predict(query):

    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class = logits.argmax().item()

    return categories[predicted_class]


query = "What are the benefits of a higher minimum wage for workers?"
print("Predicted category:", predict(query))

Predicted category: Minimum Wage


In [7]:
import zipfile
import os


zip_file_path = 'summaries.zip'
extract_folder = '/content/summaries'

os.makedirs(extract_folder, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print(f'Folder extracted to: {extract_folder}')


Folder extracted to: /content/summaries


In [8]:
import os
from transformers import pipeline

!pip install sacremoses

def load_summaries(directory):
    summaries = {}
    categories = [
        "Right to Unionize and Collective Bargaining",
        "Limitation on Working Hours",
        "Protection from Forced Labor",
        "Minimum Wage",
    ]

    for category in categories:
        try:
            file_path = f"{directory}/{category.replace(' ', '_').replace('&', 'and')}.txt"
            with open(file_path, 'r') as file:
                summaries[category] = file.read()
        except FileNotFoundError:
            summaries[category] = "Summary not available."

    return summaries


def predict(query):

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)

    # Tokenize the input query
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the logits and make predictions
    logits = outputs.logits
    predicted_class = logits.argmax().item()

    return categories[predicted_class]


def translate_summary(summary, target_language):
    model_name = f"Helsinki-NLP/opus-mt-en-{target_language}"
    translator = pipeline("translation", model=model_name)

    # Define maximum length for the translation model
    max_length = 512
    # Split the summary into chunks if necessary
    chunks = [summary[i:i+max_length] for i in range(0, len(summary), max_length)]

    translated_chunks = []
    for chunk in chunks:
        try:
            translation = translator(chunk, max_length=max_length)[0]['translation_text']
            translated_chunks.append(translation)
        except Exception as e:
            print(f"Translation error: {e}")
            translated_chunks.append("Translation error.")

    # Combine the translated chunks
    return " ".join(translated_chunks)


def print_summary_for_query(query, summaries):
    # Get the predicted category
    predicted_category = predict(query)

    # Print the summary content
    summary = summaries.get(predicted_category, "Summary not available.")
    print(f"Summary for category '{predicted_category}':")
    print(summary)

    translate = input("\nDo you want to translate this summary? (yes/no): ").strip().lower()
    if translate == "yes":
        language_mapping = {
            "french": "fr",
            "german": "de",
            "spanish": "es",
            "hindi": "hi",
            "japanese": "ja"
        }
        print("Supported languages: French, German, Spanish, Hindi, Japanese")
        selected_language = input("Enter the language you want to translate to: ").strip().lower()
        target_language = language_mapping.get(selected_language)

        if target_language:
            translated_summary = translate_summary(summary, target_language)
            print(f"\nTranslated Summary in {selected_language.capitalize()}:\n")
            print(translated_summary)
        else:
            print("Sorry, the selected language is not supported.")
    else:
        print("Translation skipped.")


# Load summaries from the directory
summaries_folder = '/content/summaries/summaries'  # Replace with your actual path
summaries = load_summaries(summaries_folder)

# Test with an example
query = "What are the benefits of a higher minimum wage for workers?"
print_summary_for_query(query, summaries)


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m36.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Summary for category 'Minimum Wage':
related links

https://labourbureau.gov.in/reports-on-minimum-wages-act-1948
https://labourbureau.gov.in/uploads/pdf/MWA_Report_2019.pdf (Table number-5 , page no.- 86)


Minimum Wages Act, 1948
Background
Initiative Origin: The movement for setting minimum wages began with a resolution by Shri K. G. R. Chaudhary in 1920, advocating for Boards to determine minimum w