## Load model

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

model_name = "Qwen/Qwen2.5-7B-Instruct-AWQ"

# don't use device_map="auto", if model can't fully load on GPU it throws error for AWQ models
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=98304,
    do_sample=True,
    temperature=0.75,
    top_p=0.95,
    top_k=50,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.97s/it]
Device set to use cuda


In [8]:
prompt = "tell me about the GSoC.how to apply, how to make proposal etc. explain it in long-response. take as many text as required. "
messages = [
    {"role": "system", "content": "You are a helpful assistant. whose response should be docs compatible. i.e. writable in docx via python-docx. you will only answer in german."},
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=98304
)

generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
# output = pipe(messages, max_length=98304, do_sample=True, temperature=0.75, top_p=0.95, top_k=50)
# response = output[0]['generated_text'][-1]['content']
torch.cuda.empty_cache()


In [9]:
print(response)


Gemeinsame Soziale Campagne (GSoC) ist ein Programm der Google Summer of Code, das darauf abzielt, talentierte Studierende weltweit zu fördern und ihnen die Möglichkeit zu geben, an freier Software-Projekten zu arbeiten. Das Programm wird von Google initiiert und unterstützt, während es von der Free Software Foundation Europe (FSFE) und anderen Organisationen verwaltet wird.

### Was ist GSoC?

GSoC ist eine global ausgerichtete Programmiersommerschule, in der Studenten an freier Software-Projekten arbeiten. Es bietet nicht nur praktische Erfahrungen im Programmieren und Teamarbeit, sondern auch wertvolle Kontakte innerhalb der freien Software-Bewegung. Teilnehmer arbeiten an spezifischen Projekten, die von den Projekten selbst vorgeschlagen werden.

### Wie kann man sich für GSoC bewerben?

Die Bewerbungsprozedur für GSoC umfasst mehrere Schritte:

1. **Projekt-Untersuchung**: Vor der Anmeldung sollte man sich intensiv mit den Projekten vertraut machen, die am GSoC teilnehmen. Dies ka

In [13]:
SYSTEM_PROMPT = "You are a helpful assistant that generates tables based on the provided prompt. " \
            "You will receive a prompt and you need to generate a table in text format. " \
            "The table should be well-structured and easy to read. " \
            "Please ensure that the table is formatted correctly and includes all necessary information. " \
            "The table should be docx compatible, it needs to be in a format that can be easily converted to a docx file. "

In [15]:
system_prompt = (
            "You are a helpful assistant that generates content for tables based on headers and description. "
            "Respond with only the table content in a JSON array format, no explanation or markdown."
        )

In [16]:
system_prompt

'You are a helpful assistant that generates content for tables based on headers and description. Respond with only the table content in a JSON array format, no explanation or markdown.'

In [3]:
import awq

## UnLoad

In [11]:
# del model, tokenizer, pipe
import gc
gc.collect()
torch.cuda.empty_cache()

In [27]:
import subprocess

def convert_docx_to_pdf(docx_path, pdf_path):
    command = ['libreoffice', '--headless', '--convert-to', 'pdf', docx_path, '--outdir', pdf_path]
    subprocess.run(command, check=True)

# Example usage
docx_file = 'data/A_2.docx'
output_dir = 'data/A_2.pdf'
convert_docx_to_pdf(docx_file, output_dir)


convert /home/arpbansal/code/esoc/esoc2025-challenge-ecospecs/data/A_2.docx -> /home/arpbansal/code/esoc/esoc2025-challenge-ecospecs/data/A_2.pdf/A_2.pdf using filter : writer_pdf_Export
