In [None]:
from dotenv import load_dotenv
# Load environment variables before anything else runs; a later cell reads
# GROQ_API_KEY via os.getenv (presumably supplied by a local .env file).
load_dotenv()
 




In [3]:
# take url as input and verify it
import re



# Compiled once and shared by both helpers so that validation and username
# extraction can never disagree about what counts as a profile URL.
_REDDIT_USER_URL_RE = re.compile(
    r"https?://(?:www\.)?reddit\.com/user/(?P<username>[A-Za-z0-9_-]+)/?"
)


def _match_reddit_user_url(url):
    """Return the regex match for a Reddit profile URL, or None if it is not one."""
    if not isinstance(url, str):
        return None
    # Surrounding whitespace (e.g. a stray newline from input()) is ignored.
    return _REDDIT_USER_URL_RE.fullmatch(url.strip())


def is_valid_reddit_user_url(url: str) -> bool:
    """Check whether ``url`` is a Reddit user-profile URL.

    Accepted form: ``http(s)://[www.]reddit.com/user/<name>[/]`` where
    ``<name>`` consists of ASCII letters, digits, ``_`` or ``-``.
    Leading/trailing whitespace is ignored.

    Args:
        url: The text to validate.

    Returns:
        True if the text matches the profile URL form, otherwise False
        (including for non-string input).
    """
    return _match_reddit_user_url(url) is not None


def extract_username_from_url(url: str):
    """Extract the username from a Reddit user-profile URL.

    Args:
        url: The text to parse; surrounding whitespace is ignored.

    Returns:
        The username as a string, or None when ``url`` is not a valid
        Reddit user-profile URL.
    """
    match = _match_reddit_user_url(url)
    if match is None:
        return None
    # Take the name from the regex group rather than splitting on "/", so
    # trailing whitespace or a trailing slash cannot leak into the result.
    return match.group("username")

input_url = input("Enter Url")
# Store the cleaned URL itself rather than the boolean validation result:
# the loading cell passes `url` to the web loader, so it must be the address
# (or None when the input is not a Reddit profile URL).
url = input_url.strip() if is_valid_reddit_user_url(input_url) else None

In [4]:
# data extraction using webbaseloader

from langchain_community.document_loaders import WebBaseLoader
import bs4
from typing import List


if not url:
    # Fail loudly here; otherwise `data` is never assigned and the splitting
    # cell breaks later (or silently reuses a stale value from the kernel).
    raise ValueError("No valid Reddit user URL was provided; nothing to load.")

# `url` may be only a truthy validation flag instead of the address itself,
# so fall back to the text the user actually entered in that case.
page_url = url if isinstance(url, str) else input_url.strip()
loader = WebBaseLoader(web_path=(page_url,))
data = loader.load()

In [54]:
#Split the data into chunks

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 0

# Break the loaded page documents into fixed-size, non-overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = splitter.split_documents(data)

In [None]:
# Convert the data into a vector database

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

import uuid

# Sentence-embedding model used to vectorise each chunk.
embedding = HuggingFaceEmbeddings(model="all-MiniLM-L6-V2")

# Give every run its own collection. Without an explicit name Chroma uses a
# single default collection, so re-running this cell in the same kernel (or
# analysing a second user) would append to the previous chunks and mix
# unrelated content into retrieval.
collection_name = f"reddit-persona-{uuid.uuid4().hex}"

vectorestore = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name=collection_name,
)

In [None]:
#retriever to access database

# Number of chunks fetched from the vector store per query.
RETRIEVER_TOP_K = 10

retriever = vectorestore.as_retriever(
    search_kwargs={"k": RETRIEVER_TOP_K},
)

In [None]:
#Prompt 

# System instructions for the persona-building LLM. Keep this text free of
# curly braces: it is passed to ChatPromptTemplate, which would treat them
# as template variables.
system_prompt = """
You are a helpful AI that builds detailed user personas from Reddit data.
The information you retrieve includes Reddit comments written by a specific user.
Based on this content, extract:
- Personal Information
- Motivations
- Personality
- Behaviour and Habits
- Goals and needs
- Frustrations

For each characteristic in the user persona, cite the comments you used to
extract that specific piece of persona information.

Use Reddit text as evidence for each characteristic. Write the persona in a structured format like this:
[User Persona]
- Personal Information : it includes Age, Occupation, Status, Location, Tier, Archetype, etc.
- Motivations : it includes convenience, speed, wellness, preferences, comfort, dietary needs, etc., with percentage.
- Personality : it includes introvert or extrovert, intuition or sensing, feeling or thinking, perceiving or judging, etc., with percentage.
- Behaviour and Habits : describe with 3 - 4 bullet points (cite the comment)
- Goals and needs : describe with 3 - 4 bullet points (cite the comment)
- Frustrations : describe with 3 - 4 bullet points (cite the comment)
"""

In [None]:
# Build and store the persona in the response variable



from langchain_core.prompts import ChatPromptTemplate
import os
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA



# Combine the persona instructions with the retrieved comments. The default
# "stuff" chain built below fills {context} with the retrieved documents and
# {question} with the query text.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt + "\n\nRetrieved Reddit content:\n{context}"),
    ("user", "{question}"),
])

groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # Surface a missing key here with a clear message instead of a less
    # obvious failure inside the client.
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to the environment or the .env file."
    )
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)

rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    # Without this the chain falls back to its generic QA prompt and the
    # persona instructions above are never sent to the model.
    chain_type_kwargs={"prompt": prompt},
    verbose=True,
)

# invoke() returns a dict; the generated persona text is under "result",
# so `response` remains a plain string for the file-writing cell.
response = rag_chain.invoke(
    {"query": "Generate a complete user persona using the retrieved Reddit data."}
)["result"]


In [None]:
#write the response into txt file


# Persist the generated persona as UTF-8 text, replacing any earlier file.
with open("persona.txt", mode="w", encoding="utf-8") as persona_file:
    persona_file.write(response)