In [None]:
print('welcome')

welcome


In [3]:
import http.client
import json
import random
import re
from langchain.schema import Document
from dotenv import load_dotenv
import os 


In [4]:
# Load variables from a local .env file (if any) into the process environment
# before the key below is looked up.
load_dotenv()

# RapidAPI credential used as the default `api_key` of fetch_jobs;
# this is None when RAPIDAPI_KEY is not set.
api_key = os.environ.get('RAPIDAPI_KEY')

# Pool of cities that fetch_jobs samples from when the requested location is "India".
INDIAN_CITIES = [
    "Mumbai",
    "Delhi",
    "Bangalore",
    "Hyderabad",
    "Ahmedabad",
    "Chennai",
    "Kolkata",
    "Surat",
    "Pune",
    "Jaipur",
    "Lucknow",
    "Kanpur",
    "Nagpur",
    "Visakhapatnam",
    "Indore",
    "Thane",
    "Bhopal",
    "Patna",
    "Vadodara",
    "Ghaziabad",
]

_JOBS_API_HOST = "jobs-search-api.p.rapidapi.com"
_REQUIRED_JOB_FIELDS = ("title", "company", "description")


def _request_jobs(query, search_location, count, api_key, extra_fields=None):
    """POST one search to /getjobs and return the list stored under "jobs"."""
    payload = {
        "search_term": query,
        "location": search_location,
        "results_wanted": count,
        "site_name": ["indeed", "linkedin", "zip_recruiter", "glassdoor"],
        "distance": 50,
        "job_type": "fulltime",
        "is_remote": False,
        "linkedin_fetch_description": True,
        "hours_old": 72,
    }
    if extra_fields:
        payload.update(extra_fields)

    headers = {
        'x-rapidapi-key': api_key,
        'x-rapidapi-host': _JOBS_API_HOST,
        'Content-Type': "application/json",
    }

    # Use a fresh connection per request and always close it: a connection
    # whose request failed part-way can refuse further requests, and the
    # socket would otherwise stay open after the function returns.
    conn = http.client.HTTPSConnection(_JOBS_API_HOST)
    try:
        conn.request("POST", "/getjobs", body=json.dumps(payload), headers=headers)
        data = conn.getresponse().read().decode("utf-8")
    finally:
        conn.close()

    # "jobs" may be missing or null in the response; both mean "no results".
    return json.loads(data).get("jobs") or []


def _has_required_fields(job):
    """True when the raw job dict carries every key the output record needs."""
    return all(key in job for key in _REQUIRED_JOB_FIELDS)


def _format_job(job, searched_city):
    """Project a raw API job dict onto the record shape returned by fetch_jobs."""
    return {
        "job title": job["title"],
        "company": job["company"],
        "location": job.get("location", "N/A"),
        "searched_city": searched_city,
        "description": job["description"],
    }


def fetch_jobs(query, location="India", results_wanted=5,api_key=api_key):
    """Search the jobs API and return simplified job records.

    Args:
        query: Search term sent as ``search_term``.
        location: Location to search. The value "India" (case-insensitive)
            switches to multi-city mode: a random sample of ``INDIAN_CITIES``
            is queried one city at a time. Any other value is sent to the
            API as-is in a single request.
        results_wanted: Number of jobs requested. In multi-city mode the
            result is trimmed to at most this many records; in single-location
            mode the value is only forwarded to the API.
        api_key: Value for the ``x-rapidapi-key`` header.

    Returns:
        A list of dicts with the keys ``"job title"``, ``"company"``,
        ``"location"``, ``"searched_city"`` and ``"description"``. Jobs that
        lack a title, company or description are dropped.

    Raises:
        In single-location mode, connection and JSON decoding errors
        propagate to the caller. In multi-city mode a failing city is
        reported with ``print`` and skipped.
    """
    if location.lower() == "india":
        # Nothing to fetch; this also keeps random.sample from raising on a negative size.
        if results_wanted <= 0 or not INDIAN_CITIES:
            return []

        city_count = min(len(INDIAN_CITIES), results_wanted)
        # Ceiling division so the sampled cities can jointly supply
        # results_wanted jobs even when it is not a multiple of city_count.
        jobs_per_city = -(-results_wanted // city_count)

        collected = []
        for city in random.sample(INDIAN_CITIES, city_count):
            try:
                city_jobs = _request_jobs(query, f"{city}, India", jobs_per_city, api_key)
            except Exception as e:
                print(f"Error fetching jobs for {city}: {str(e)}")
                continue

            # Filter before counting so incomplete records neither stop the
            # loop early nor take up slots in the final trim.
            collected.extend(
                _format_job(job, city) for job in city_jobs if _has_required_fields(job)
            )

            # Stop if we've collected enough usable jobs
            if len(collected) >= results_wanted:
                break

        return collected[:results_wanted]

    # Single-location search. "show_requirements" is only sent in this mode,
    # matching the original request body.
    jobs = _request_jobs(
        query, location, results_wanted, api_key, {"show_requirements": True}
    )
    searched_city = location.split(",")[0].strip()
    return [_format_job(job, searched_city) for job in jobs if _has_required_fields(job)]
        
def clean_text(text):
    """Remove markdown bold markers and collapse runs of blank lines.

    Args:
        text: Raw description text. ``None`` or an empty string is accepted,
            because a job record can carry a ``description`` key with no value.

    Returns:
        The text without ``**`` markers, with every run of three or more
        newlines reduced to two, and with leading/trailing whitespace
        stripped. Returns ``""`` for ``None`` or empty input.
    """
    if not text:
        return ""
    text = re.sub(r'\*\*', '', text)  # Remove **bold** markers
    text = re.sub(r'\n{3,}', '\n\n', text)  # Replace 3+ newlines with double newlines
    return text.strip()

def documentation(job_details):
    """Convert job records into ``Document`` objects.

    Each record's ``"description"`` is passed through ``clean_text`` and used
    as the page content; title, company, location and searched city are
    copied into the metadata together with a fixed ``"language": "en"`` entry.

    Args:
        job_details: Iterable of dicts shaped like the records returned by
            ``fetch_jobs``.

    Returns:
        A list with one ``Document`` per record, in input order.
    """
    def _as_document(record):
        return Document(
            page_content=clean_text(record["description"]),
            metadata={
                "job_title": record["job title"],
                "company": record["company"],
                "location": record["location"],
                "searched_city": record["searched_city"],
                "language": "en",
            },
        )

    return [_as_document(record) for record in job_details]

In [5]:
inputs='Data Science'



In [6]:
# NOTE(review): this rebinds the name `fetch_jobs` from the function to the list it
# returns. Re-running this cell (or calling fetch_jobs(...) anywhere later) then raises
# "TypeError: 'list' object is not callable" until the defining cell is executed again.
# The following cells read `fetch_jobs` as the result list, so a rename (e.g. to
# `job_listings`) has to be applied to this cell and those cells together.
fetch_jobs=fetch_jobs(inputs)

In [7]:
fetch_jobs

[{'job title': 'Consultant (Credit Risk Modelling), Data Science & Analytics',
  'company': 'TransUnion',
  'location': 'Pune',
  'searched_city': 'Pune',
  'description': 'TransUnion\'s Job Applicant Privacy Notice\n\n\n**What We\'ll Bring:**\n\nThis position is responsible for supporting the development of credit risk management and business intelligence analytic solutions through consulting engagements and research serving TransUnion’s clients.\n**What You\'ll Bring:**\n\nWhat we’ll bring:\n  \n\n* A work environment that encourages collaboration and innovation. We consistently explore new technologies and tools to be agile.\n* Flexible time off, workplace flexibility, an environment that welcomes continued professional growth through support of tuition reimbursement, conferences and seminars.\n* Our culture encourages our people to hone current skills and build new capabilities while discovering their genius.\n* We provide a modern computing environment based on best\\-in\\-class "

In [8]:
docs=documentation(fetch_jobs)

In [9]:
docs[0]

Document(metadata={'job_title': 'Consultant (Credit Risk Modelling), Data Science & Analytics', 'company': 'TransUnion', 'location': 'Pune', 'searched_city': 'Pune', 'language': 'en'}, page_content='TransUnion\'s Job Applicant Privacy Notice\n\nWhat We\'ll Bring:\n\nThis position is responsible for supporting the development of credit risk management and business intelligence analytic solutions through consulting engagements and research serving TransUnion’s clients.\nWhat You\'ll Bring:\n\nWhat we’ll bring:\n  \n\n* A work environment that encourages collaboration and innovation. We consistently explore new technologies and tools to be agile.\n* Flexible time off, workplace flexibility, an environment that welcomes continued professional growth through support of tuition reimbursement, conferences and seminars.\n* Our culture encourages our people to hone current skills and build new capabilities while discovering their genius.\n* We provide a modern computing environment based on bes

In [129]:
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate

# Chat model shared by is_job_role and by the retrieval chains built in later cells.
# NOTE(review): no credentials are passed here; presumably the Groq API key is
# picked up from the environment populated by load_dotenv() — confirm.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    
    
)

# Single-turn classification prompt; the {text} placeholder is filled in by is_job_role.
prompt = ChatPromptTemplate.from_template(
    """Determine if the following text describes a job role. 
    Answer strictly 'Yes' or 'No'.
    
    Text: {text}
    """
)

def is_job_role(text):
    """Ask the module-level ``llm`` whether ``text`` describes a job role.

    The text is substituted into the module-level ``prompt`` template and the
    formatted prompt is sent to the model.

    Returns:
        The ``content`` of the model's response, unmodified. The prompt asks
        for 'Yes' or 'No', but the reply is not validated here.
    """
    return llm.invoke(prompt.format(text=text)).content


In [11]:
is_job_role(inputs)

'Yes'

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [62]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1500,chunk_overlap=200)

text_chunks = text_splitter.split_documents(docs)

In [15]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [17]:
# NOTE(review): `embedding` is not referenced again in the visible cells — the vector
# store and embed_query calls use `embeddings` from the next cell. This cell looks like
# a leftover that only costs a model load; confirm and remove.
embedding=HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [63]:
vectore_store=FAISS.from_documents(text_chunks,embeddings)

In [58]:
sample=embeddings.embed_query("Hello World")

In [59]:
len(sample)

768

In [64]:
retriever_sample=vectore_store.as_retriever(search_type='similarity',search_kwargs={'k':3})

In [65]:
retriever_sample.invoke("what are skills required for data science job role ")

[Document(id='013abf22-a78a-4004-ac88-a2972a685b24', metadata={'job_title': 'Data Analyst', 'company': 'Jaipur Rugs', 'location': 'Jaipur, Rajasthan, India', 'searched_city': 'Jaipur', 'language': 'en'}, page_content='• Proficiency with Microsoft Office applications, with expertise in Excel (e.g., pivot tables, advanced functions, formulas, filtering, etc.) and database skills (e.g., SQL)\n \n\n • Ability to collect and synthesize information, making it relevant, understandable, and actionable for key stakeholders\n \n\n • Ability to balance multiple projects with competing deadlines\n \n\n • Generate insights that improve the business through linking various data sources\n \n\n • Strong understanding of Analytics and Visualization techniques'),
 Document(id='dde3b1e9-dc9d-43e1-a53f-a54e1db062f0', metadata={'job_title': 'Data Analyst', 'company': 'Jaipur Rugs', 'location': 'Jaipur, Rajasthan, India', 'searched_city': 'Jaipur', 'language': 'en'}, page_content='• Working knowledge in dat

In [66]:
vectore_store.save_local('job_vector_db')

In [120]:
system_prompt = ("""You are a friendly and knowledgeable AI career mentor.  Your role is to provide insightful and helpful career
        advice to users based on real-world job market data. You will use information on job skills, and required qualifications to 
        provide assistance. If you don't know the answer, provide alternative options and be honest about what you don't know.

        Instructions:
        1.  Carefully analyze the context provided, which contains relevant job descriptions and extracted skills.
        2.  Based on the context, answer the user's question in a clear, concise, and easy-to-understand manner.
        3.  If the query cannot be accurately answered based on the context, admit that you lack sufficient information and suggest 
        rephrasing the query or providing more details.
        4.  Avoid making up information or providing speculative answers.

        Context: {context}
        """)
        

        

In [121]:
prompt1=ChatPromptTemplate.from_messages(
    [
        ('system',system_prompt), 
        ('human',"{input}"),
    ]
)

In [35]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain,create_history_aware_retriever

In [122]:
document_chain=create_stuff_documents_chain(llm,prompt1)
retriever=vectore_store.as_retriever(search_type='similarity',search_kwargs={'k':3})
retriever_chain=create_retrieval_chain(retriever,document_chain)


In [123]:
inputs2='How can I transition from a data analyst to a data scientist role?'

In [124]:
response=retriever_chain.invoke({'input':inputs2})

In [125]:
print(response['answer'])

Transitioning from a data analyst to a data scientist role requires a combination of skills, experience, and dedication. Based on the job brief provided, here are some key takeaways to help you make this transition:

1. **Develop advanced analytical skills**: As a data analyst, you likely have a strong foundation in data analysis. To become a data scientist, you'll need to develop skills in machine learning, programming languages like Python, R, or SQL, and experience with data manipulation and visualization tools.
2. **Gain experience with machine learning techniques**: The job brief mentions specific machine learning algorithms like Classification, Regression, Clustering, and Decision Trees. Familiarize yourself with these techniques and practice implementing them using popular libraries like scikit-learn or TensorFlow.
3. **Improve your programming skills**: Data scientists need to be proficient in programming languages like Python, R, or SQL. Focus on developing your skills in one 

In [126]:
input2='What is malaria?'
response2=retriever_chain.invoke({'input':input2})
print(response2['answer'])

I don't have any information about malaria in the provided context, which appears to be a job description for a Consultant, Data Science and Analytics position at TransUnion. The context discusses roles and responsibilities related to mobile app development, team management, and data science, but it does not mention malaria.

If you're looking for information about malaria, I suggest searching for it on a reliable health or medical website, such as the World Health Organization (WHO) or the Centers for Disease Control and Prevention (CDC). They should have accurate and up-to-date information about malaria, its causes, symptoms, treatment, and prevention.


In [102]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory,InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import MessagesPlaceholder

In [127]:
# System message for the question-rewriting step of the history-aware retriever.
# The pieces are adjacent string literals, which Python joins with nothing in
# between, so each piece carries its own trailing space to keep the sentences apart.
contextualize_q_system_prompt  = (
    "Given a chat history and the latest user question which might reference context in the chat history, "
    "Formulate a standalone query which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and otherwise return as it is."
)

In [128]:
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system",contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ('human',"{input}"),
    ]
)

In [130]:
history_aware_retriever = create_history_aware_retriever(llm,retriever,contextualize_q_prompt)


In [131]:
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [132]:
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [133]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

question = "What are the key skills I need to become a Data Scientist in the current job market?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

second_question = "Okay, that's helpful. You mentioned Python. Which specific Python libraries are most in-demand for Data Science roles right now?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

Based on the context provided, the following Python libraries are highly in-demand for Data Science roles:

1. **NumPy**: The NumPy library is a fundamental library for numerical computing in Python, and is widely used in Data Science for tasks such as data manipulation and analysis.
2. **Pandas**: The Pandas library is a powerful library for data manipulation and analysis, and is widely used in Data Science for tasks such as data cleaning, filtering, and grouping.
3. **Matplotlib** and **Seaborn**: These libraries are widely used for data visualization, and are essential for creating informative and engaging visualizations.
4. **Scikit-learn**: The Scikit-learn library is a widely used library for machine learning, and provides a wide range of algorithms for tasks such as classification, regression, clustering, and more.
5. **TensorFlow** or **PyTorch**: These libraries are widely used for deep learning, and provide a wide range of tools and frameworks for building and training neural

In [134]:
chat_history

[HumanMessage(content='What are the key skills I need to become a Data Scientist in the current job market?', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Based on the context provided, to become a Data Scientist in the current job market, you should possess the following key skills:\n\n1. **Data Management and Analytics**: Working knowledge of data management and analytics platforms such as SQL Server, SSIS, Power BI, and Google Analytics.\n2. **Statistical Analysis and Machine Learning**: Demonstrated ability in statistical analysis, predictive modeling, and machine learning techniques, including algorithms like Classification, Regression, Clustering, Feature Engineering, Decision Trees, and Gradient Boosting.\n3. **Data Visualization and Communication**: Ability to communicate trends and insights clearly and concisely to stakeholders at all levels of the organization, using techniques such as data visualization and presentation skills.\n4. **Programming Skills**:

In [135]:
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Histories live in the module-level ``store`` dict, keyed by session id;
    an unknown id gets a new empty ``ChatMessageHistory`` that is stored and
    returned.
    """
    try:
        history = store[session_id]
    except KeyError:
        history = store[session_id] = ChatMessageHistory()
    return history


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [136]:
conversational_rag_chain.invoke(
    {"input": "What are the key skills I need to become a Data Scientist in the current job market?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

"Based on the context provided, to become a Data Scientist in the current job market, you'll need to possess a combination of technical, business, and soft skills. Here are the key skills required:\n\n**Technical Skills:**\n\n1. **Programming skills**: Proficiency in languages such as Python, R, SQL, and Java.\n2. **Data management and analytics**: Experience with data management and analytics platforms like SQL Server, SSIS, Power BI, and Google Analytics.\n3. **Machine learning and statistical analysis**: Knowledge of machine learning techniques, statistical analysis, and predictive modeling.\n4. **Data visualization**: Familiarity with data visualization tools and techniques to effectively communicate insights.\n5. **Database skills**: Understanding of database concepts and experience with database management systems.\n\n**Business and Quantitative Skills:**\n\n1. **Quantitative field**: A degree in a quantitative field such as statistics, applied mathematics, financial mathematics,

In [137]:
store

{'abc123': InMemoryChatMessageHistory(messages=[HumanMessage(content='What are the key skills I need to become a Data Scientist in the current job market?', additional_kwargs={}, response_metadata={}), AIMessage(content="Based on the context provided, to become a Data Scientist in the current job market, you'll need to possess a combination of technical, business, and soft skills. Here are the key skills required:\n\n**Technical Skills:**\n\n1. **Programming skills**: Proficiency in languages such as Python, R, SQL, and Java.\n2. **Data management and analytics**: Experience with data management and analytics platforms like SQL Server, SSIS, Power BI, and Google Analytics.\n3. **Machine learning and statistical analysis**: Knowledge of machine learning techniques, statistical analysis, and predictive modeling.\n4. **Data visualization**: Familiarity with data visualization tools and techniques to effectively communicate insights.\n5. **Database skills**: Understanding of database conce

In [138]:
conversational_rag_chain.invoke(
    {"input": "Okay, that's helpful. You mentioned Python. Which specific Python libraries are most in-demand for Data Science roles right now?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'Based on the context provided, the following Python libraries are in high demand for Data Science roles:\n\n1. **NumPy**: The NumPy library is a fundamental library for numerical computing in Python, and is widely used in Data Science for tasks such as data manipulation and analysis.\n2. **Pandas**: The Pandas library is a powerful library for data manipulation and analysis, and is widely used in Data Science for tasks such as data cleaning, filtering, and grouping.\n3. **Scikit-learn**: The Scikit-learn library is a popular library for machine learning in Python, and is widely used in Data Science for tasks such as classification, regression, clustering, and model selection.\n4. **TensorFlow** or **PyTorch**: Both TensorFlow and PyTorch are popular deep learning libraries in Python, and are widely used in Data Science for tasks such as building and training neural networks.\n5. **Matplotlib** and/or **Seaborn**: Both Matplotlib and Seaborn are popular data visualization libraries in 

In [2]:
import spacy


In [3]:
nlp=spacy.load('en_core_web_sm')