<a href="https://colab.research.google.com/github/Chandrashekar0123/linkedin-llm-postgen/blob/main/LLM_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1️⃣ Mount Google Drive (the dataset and model paths used below live under it)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# ╔════════════════════════════════════════════════════════════════════════╗
# ║  LinkedIn‑Post‑LLM – Extract post text content from CSV to .txt file   ║
# ╚════════════════════════════════════════════════════════════════════════╝
import os, csv, random, subprocess, pathlib
import pandas as pd

# 2️⃣ Define base and data directories (created on Drive if they do not exist yet)
BASE_DIR = pathlib.Path("/content/drive/MyDrive/linkedin_post_llm")
DATA_DIR = BASE_DIR / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# 3️⃣ Download CSV if not already downloaded
CSV_URL = "https://raw.githubusercontent.com/mhlieu/LinkedIn-post-analysis/main/LINKEDIN_POSTS_all.csv"
CSV_PATH = DATA_DIR / "posts.csv"
if not CSV_PATH.exists():
    try:
        # -f makes curl exit non-zero on HTTP errors instead of saving the
        # server's error page as if it were the CSV.
        subprocess.run(["curl", "-fL", "-o", str(CSV_PATH), CSV_URL], check=True)
    except subprocess.CalledProcessError:
        # Drop any partial file so the next run retries the download
        # (the `exists()` guard above would otherwise skip it).
        CSV_PATH.unlink(missing_ok=True)
        raise

# 4️⃣ Load CSV and print column names
df = pd.read_csv(CSV_PATH)
print("Available columns:", list(df.columns))

# 5️⃣ Set correct column name for post content
TEXT_COLUMN = "TEXT"  # ✅ Corrected from "post_content"

if TEXT_COLUMN not in df.columns:
    raise ValueError(f"Column '{TEXT_COLUMN}' not found in CSV. Please check the column names printed above.")

# 6️⃣ Save post contents to TXT file (one post per line)
TXT_PATH = DATA_DIR / "linkedin_posts.txt"
SHUFFLE_SEED = 42  # fixed seed so the post order is reproducible across runs

# Each post must fit on one line. Collapse every whitespace run (including
# newlines) into a single space; deleting the newline outright would glue the
# last word of one line to the first word of the next.
lines = df[TEXT_COLUMN].dropna().astype(str).map(lambda post: " ".join(post.split()))
lines = lines[lines != ""].tolist()  # Remove posts that were only whitespace
random.Random(SHUFFLE_SEED).shuffle(lines)  # private RNG: global random state is untouched

with open(TXT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

# 7️⃣ Preview
print(f"\nSaved {len(lines)} posts → {TXT_PATH}")
print("First 3 lines:\n" + "-"*40)
for preview in lines[:3]:
    print(preview)


In [None]:
!pip install -U transformers datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/494.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/494.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 req

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd
import os

# Directory that receives Trainer checkpoints and the final model + tokenizer.
MODEL_DIR = "/content/drive/MyDrive/linkedin_post_llm/finetuned_model"

# Load data: one post per line, blank lines skipped
file_path = "/content/drive/MyDrive/linkedin_post_llm/data/linkedin_posts.txt"
with open(file_path, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

if not lines:
    # Fail early with a clear message instead of a confusing error inside Trainer.
    raise ValueError(f"No posts found in {file_path}")

df = pd.DataFrame({"text": lines})
dataset = Dataset.from_pandas(df)

# Load GPT tokenizer and model
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 doesn't have pad_token by default

def tokenize_function(example):
    """Tokenize a batch of posts, padded/truncated to a fixed 128 tokens."""
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# mlm=False -> causal language modelling (labels are derived from input_ids).
# NOTE(review): no EOS token is appended to the posts and pad == eos here, so
# the model is presumably never trained to end a post on its own - TODO confirm
# and consider a dedicated pad token plus an explicit EOS per post.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=1,
    logging_steps=100,
    # Replaces the deprecated WANDB_DISABLED environment variable:
    # turn off every logging integration (W&B, TensorBoard, ...).
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Save model + tokenizer
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


('/content/drive/MyDrive/linkedin_post_llm/finetuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/linkedin_post_llm/finetuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/linkedin_post_llm/finetuned_model/vocab.json',
 '/content/drive/MyDrive/linkedin_post_llm/finetuned_model/merges.txt',
 '/content/drive/MyDrive/linkedin_post_llm/finetuned_model/added_tokens.json',
 '/content/drive/MyDrive/linkedin_post_llm/finetuned_model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Path to fine-tuned model (the directory the training cell saved into)
model_path = "/content/drive/MyDrive/linkedin_post_llm/finetuned_model"

# Load the tokenizer and model from disk; this re-binds the notebook-global
# `tokenizer` and `model` that the generation helper below reads.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

In [None]:
import torch

def generate_linkedin_post(prompt, max_length=100, temperature=0.8, top_p=0.95, top_k=50):
    """Sample one continuation of `prompt` from the notebook-global `model`.

    Args:
        prompt: Text the generated post should start with.
        max_length: Maximum total length in tokens, prompt included.
        temperature: Sampling temperature passed to `generate`.
        top_p: Nucleus-sampling probability mass passed to `generate`.
        top_k: Top-k sampling cutoff passed to `generate`.

    Returns:
        The decoded text (prompt plus continuation) with special tokens removed.
    """
    # Calling the tokenizer (instead of `.encode`) also yields the attention
    # mask. It cannot be inferred by `generate` because pad and EOS share the
    # same token id, which is what triggers the "attention mask is not set"
    # warning. Inputs are moved to wherever the model currently lives.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate text
    output = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )

    # Decode the single returned sequence
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
from transformers import Trainer

# Score the model with the trainer created in the training cell.
# NOTE: `tokenized_dataset` is the data the model was trained on (there is no
# held-out split), so this is a training-set perplexity.
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset)
perplexity = torch.exp(torch.tensor(eval_results["eval_loss"]))
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 27.53


In [None]:
# A few typical LinkedIn openers; each gets one sampled completion.
prompts = [
    "Excited to share my new project on AI: ",
    "Just completed an amazing internship experience at ",
    "Here’s what I learned from my last side project: "
]

# Sampling is stochastic (do_sample=True in the helper), so the text differs on every run.
for prompt in prompts:
    generated_post = generate_linkedin_post(prompt)
    print("\nPrompt:", prompt)
    print("Generated:", generated_post)


Prompt: Excited to share my new project on AI: 
Generated: Excited to share my new project on AI:  I've been working on it since its launch, and while it's not quite finished yet, I've had some time to write up a couple of quick notes on what I'm working on. The main focus is to make sure that the AI experience is intuitive (which I'm excited about), and that it's able to understand you. In addition, I'm hoping that the AI will help you develop your own personal AI (since I

Prompt: Just completed an amazing internship experience at 
Generated: Just completed an amazing internship experience at vernacular.com with the aim of helping many of the world's most popular writers get their work published. I am now working as a freelance writer and have started writing, and will start working for my next project soon.

Prompt: Here’s what I learned from my last side project: 
Generated: Here’s what I learned from my last side project:  I can't really speak for myself, but I do have some feedb

In [None]:
# Generate a single post from a short opener, using the helper's default sampling settings.
prompt = "Thrilled to announce"
generated_text = generate_linkedin_post(prompt)
print("🔹 Generated LinkedIn Post:\n", generated_text)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


🔹 Generated LinkedIn Post:
 Thrilled to announce that I’ve been doing this since 2009,”because’it’s actually been a while since I’ve started blogging,”because’it’s actually been a long time since I’ve started blogging,”because’it’s actually been a long time since I’ve started blogging,”because’it’s actually been a long time since I’ve


In [None]:
# Same prompt three times: each call samples independently, so the versions differ.
for i in range(3):
    print(f"📝 Version {i+1}:\n{generate_linkedin_post('Excited to share')} \n")

📝 Version 1:
Excited to share how I got to this point (and how we got there).

The reason I say this is because I'm trying to explain something so simple, so it's hard to read at times. I mean, it's not like my favorite books are all bad, or anything. I'm not going to say what's good or bad about them all, but I am just trying to give you some context.

In fact, I have some of the best stories of 

📝 Version 2:
Excited to share my story and my journey with you today.

I am so happy that you are looking forward to hearing what I have to say.

If you're interested in learning more about this topic, there are some important things to note:

I am not a registered professional. If you are interested in joining me in writing your story, you must first register by the 12th of February 2017 at www.bitch-journal.com.

You must have 

📝 Version 3:
Excited to share my story with you guys!


This is my story of how I started to develop my own website and start a blog. I started on my own and I hav

# **Method 2: Baseline generation with pretrained GPT-2 (no fine-tuning)**

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Stock GPT-2 with no fine-tuning, used as a baseline. It is bound to `base_*`
# names on purpose: reusing `tokenizer` / `model` would silently replace the
# fine-tuned objects that other cells in this notebook read and save.
base_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
base_model = GPT2LMHeadModel.from_pretrained("gpt2")

prompt = "Career Growth Tip:"
# The tokenizer returns both input_ids and the matching attention_mask.
inputs = base_tokenizer(prompt, return_tensors="pt")

outputs = base_model.generate(
    **inputs,
    max_new_tokens=80,  # generate up to 80 tokens beyond the prompt
    do_sample=True,
    top_p=0.9,
    temperature=0.85,
    top_k=50,
    no_repeat_ngram_size=3,  # never repeat the same 3-token sequence
    pad_token_id=base_tokenizer.eos_token_id
)

generated_text = base_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Simple way to add some hashtags (can be made smarter)
hashtags = " #CareerGrowth #Learning #ProfessionalDevelopment #LinkedInTips"

print("Generated LinkedIn Post:")
print(generated_text + hashtags)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generated LinkedIn Post:
Career Growth Tip:

It's not easy to find great teachers, so it's important to know who is the best fit. As a general rule, people in their 20s who are starting out in the workforce are better qualified than people who are in their 30s or 40s. This is especially true for women, who are generally considered to be the "best fit."

To learn more about how #CareerGrowth #Learning #ProfessionalDevelopment #LinkedInTips


In [None]:
# After training: persist the fine-tuned model.
# Save through `trainer` rather than the notebook-global `model` / `tokenizer`:
# those names can be re-bound by other cells (e.g. to a stock GPT-2 loaded for
# comparison), and saving them here would overwrite the fine-tuned weights.
# `save_model` also writes the tokenizer that was handed to the Trainer.
trainer.save_model("/content/drive/MyDrive/linkedin_post_llm/finetuned_model")

In [None]:
!pip install streamlit pyngrok transformers



In [None]:
# Copy the whole project folder (data/ and finetuned_model/) from Drive to
# /content/linkedin_post_llm on the local disk.
!cp -r /content/drive/MyDrive/linkedin_post_llm /content/

In [None]:
%%writefile app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Page config must be the first Streamlit command in the script, so it is
# issued before the cached loader below is invoked.
st.set_page_config(page_title="LinkedIn Post Generator", layout="centered")


# Load model and tokenizer only once
@st.cache_resource
def load_model():
    """Load the fine-tuned tokenizer and model from the local project copy.

    Returns:
        A `(tokenizer, model)` tuple; cached by Streamlit across reruns.
    """
    # The project folder is copied next to app.py as ./linkedin_post_llm; the
    # saved model files live in its finetuned_model/ subfolder, not at the top level.
    model_path = "linkedin_post_llm/finetuned_model"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    return tokenizer, model

tokenizer, model = load_model()

st.title("💼 LinkedIn Post Generator")

user_input = st.text_area("Enter keywords or prompt", height=150)

temperature = st.slider("Creativity (Temperature)", 0.5, 1.5, 1.0)
max_length = st.slider("Max Length", 50, 300, 150)

if st.button("🚀 Generate Post"):
    if not user_input.strip():
        st.warning("Please enter a prompt.")
    else:
        # Tokenizer call returns input_ids plus the attention mask, which
        # cannot be inferred by `generate` when pad and EOS share a token id.
        inputs = tokenizer(user_input, return_tensors="pt")
        prompt_tokens = inputs["input_ids"].shape[1]
        if prompt_tokens >= max_length:
            # max_length counts the prompt too; nothing could be generated.
            st.warning(
                f"Your prompt is already {prompt_tokens} tokens long. "
                "Increase Max Length or shorten the prompt."
            )
        else:
            with torch.no_grad():
                output = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=max_length,
                    temperature=temperature,
                    do_sample=True,
                    top_k=50,
                    top_p=0.95,
                    pad_token_id=tokenizer.eos_token_id
                )
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
            st.subheader("📝 Generated LinkedIn Post")
            st.success(generated_text)

st.markdown("---")
st.markdown("Built with ❤️ using Hugging Face + Streamlit")

Overwriting app.py


In [None]:
# Never commit the ngrok authtoken to the notebook. Read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it (input is hidden).
# SECURITY: the token that was previously hardcoded in this cell is exposed in
# the notebook's history and should be revoked in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))


In [None]:
from pyngrok import ngrok
import time
import os

import socket
import subprocess

STREAMLIT_PORT = 8501
STARTUP_TIMEOUT_S = 60

# Kill any existing Streamlit instances. pkill exits non-zero when nothing
# matched, which is expected on a first run, hence check=False.
subprocess.run(["pkill", "streamlit"], check=False)

# Run Streamlit app as a real background process. `!cmd &` is not reliably
# backgrounded by the notebook shell, whereas Popen returns immediately.
# Headless mode and a closed stdin keep Streamlit from waiting on a prompt.
streamlit_log = open("streamlit.log", "w")
streamlit_proc = subprocess.Popen(
    [
        "streamlit", "run", "app.py",
        "--server.port", str(STREAMLIT_PORT),
        "--server.headless", "true",
    ],
    stdin=subprocess.DEVNULL,
    stdout=streamlit_log,
    stderr=subprocess.STDOUT,
)

# Wait for the app to load: poll the port instead of sleeping a fixed time
# (model loading can take longer than a few seconds).
deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if streamlit_proc.poll() is not None:
        raise RuntimeError("Streamlit exited during startup; see streamlit.log")
    try:
        socket.create_connection(("localhost", STREAMLIT_PORT), timeout=1).close()
        break
    except OSError:
        if time.monotonic() >= deadline:
            streamlit_proc.terminate()
            raise TimeoutError(
                f"Streamlit did not open port {STREAMLIT_PORT} within {STARTUP_TIMEOUT_S}s; see streamlit.log"
            ) from None
        time.sleep(0.5)

# Open ngrok tunnel and print its URL (not the tunnel object's repr)
tunnel = ngrok.connect(STREAMLIT_PORT)
public_url = tunnel.public_url
print("🔗 App is live at:", public_url)



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.106.179.79:8501[0m
[0m
