In [1]:
%pip install bitsandbytes



In [2]:
import re
import json
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [3]:

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# Load tokenizer + model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_4bit=True
    )

text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [12]:
import pandas as pd
df = pd.read_csv("/content/sample.csv")
# df = pd.read_csv("/content/final_cleaned_content_and_hashtags.csv")  # columns: Content, Hashtags

# Clean hashtags column
df["hashtags"] = df["hashtags"].apply(lambda x: eval(x) if isinstance(x, str) and x.startswith("[") else [])

df.head()

Unnamed: 0,content,hashtags
0,Robert Lerman writes that achieving a healthy...,"[#workbasedlearning, #USA, #apprenticeship, #U..."
1,"National disability advocate Sara Hart Weir, ...",[]
2,Exploring in this months Talent Management & H...,"[#careerplanning, #apprenticeships, #workbased..."
3,I count myself fortunate to have spent time wi...,"[#verifiedresumes, #research, #training]"
4,Online job platforms are a different way of wo...,"[#careers, #coaching, #businessandmanagement, ..."


In [37]:
import warnings
warnings.filterwarnings("ignore")

def infer_theme(post, hashtags, generator):
    # Using atmost first 3 hashtags directly as themes
    if hashtags and isinstance(hashtags, list) and len(hashtags) > 0:
        themes = [tag.lstrip("#").replace("_", " ").strip().lower() for tag in hashtags[:3]]
        return ", ".join(themes)

    # No hashtags – use LLM to infer theme
    prompt = f"""What is the main theme of the following LinkedIn post?

Post: "{post.strip()}"

Respond with only the theme in 1 to 4 words. Do not explain.
"""
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        temperature=0.5,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    theme_only = decoded[len(prompt):].strip().split("\n")[0]
    theme = re.sub(r"[^a-zA-Z0-9 _-]", "", theme_only).strip().lower()
    return theme


In [38]:
def create_prompt(post, theme):
    return f"""You are an AI assistant.

Given this LinkedIn post:
---
"{post.strip()}"
---

Theme: {theme}

Generate an instruction that a user might give to an AI to create a similar post."""


In [39]:
import warnings
warnings.filterwarnings("ignore")

def generate_instruction(post, theme):
    prompt = create_prompt(post, theme)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    output = model.generate(
        input_ids,
        max_new_tokens=40,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        top_k=50,
        repetition_penalty=1.1,
    )

    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated[len(prompt):].strip()


In [30]:


records = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    post = row["content"]
    hashtags = row["hashtags"]
    if not isinstance(post, str) or len(post.strip()) < 20:
        continue
    theme = infer_theme(post, hashtags, text_generator)
    instruction = generate_instruction(post, theme)
    records.append({
        "instruction": instruction,
        "output": post.strip()
    })

  0%|          | 0/9 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 11%|█         | 1/9 [00:04<00:36,  4.59s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 22%|██▏       | 2/9 [00:10<00:37,  5.34s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 33%|███▎      | 3/9 [00:15<00:30,  5.11s/it]The attention mask and the pad token id were not set. As a consequence, you may observ

In [31]:
# Save to JSONL
with open("instruction_dataset.jsonl", "w", encoding="utf-8") as f:
    for rec in records:
        json.dump(rec, f)
        f.write("\n")

print(f"✅ Saved {len(records)} instruction-output pairs to instruction_dataset.jsonl")

✅ Saved 9 instruction-output pairs to instruction_dataset.jsonl


In [40]:
all_files = ["chunk_1.csv", "chunk_2.csv", "chunk_3.csv", "chunk_4.csv", "chunk_5.csv", "chunk_6.csv",
             "chunk_7.csv", "chunk_8.csv", "chunk_9.csv", "chunk_10.csv", "chunk_11.csv", "chunk_12.csv",
             "chunk_13.csv", "chunk_14.csv", "chunk_15.csv", "chunk_16.csv", ]

In [None]:
import warnings
warnings.filterwarnings("ignore")

ct = 1
for file_chunk in all_files:
    df = pd.read_csv(file_chunk)  # columns: Content, Hashtags

    # Clean hashtags column
    df["hashtags"] = df["hashtags"].apply(lambda x: eval(x) if isinstance(x, str) and x.startswith("[") else [])

    records = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        post = row["content"]
        hashtags = row["hashtags"]
        if not isinstance(post, str) or len(post.strip()) < 20:
            continue
        theme = infer_theme(post, hashtags, text_generator)
        instruction = generate_instruction(post, theme)
        records.append({
            "instruction": instruction,
            "output": post.strip()
        })

    op_file = "ins_dataset_"+str(ct)+".jsonl"
    with open(op_file, "w", encoding="utf-8") as f:
        for rec in records:
            json.dump(rec, f)
            f.write("\n")

    print(f"✅ Saved {len(records)} instruction-output pairs to instruction_dataset.jsonl")
    print("File "+str(ct)+"  completed")
    ct += 1


  0%|          | 0/2000 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/2000 [00:04<2:18:36,  4.16s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/2000 [00:09<2:32:48,  4.59s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/2000 [00:13<2:29:11,  4.48s/it]The attention mask and the pad token id were not set. As a consequen