# Chapter 09: Working with Unstructured Data

In today's data-driven world, companies are inundated with vast amounts of unstructured data, including documents, slides, emails, and more. Transforming this unstructured data into structured formats is crucial for unlocking its full potential. Leveraging LLMs enables businesses to efficiently convert unstructured information into datasets that can be stored in databases, CSV files, and other structured formats. This transformation not only facilitates data management but also allows for the training of machine learning models on the structured datasets. Ultimately, structured data can be seamlessly integrated into various workflows, functions, and processes, enabling more informed decision-making across the organization.

In [None]:
import pandas as pd
from pathlib import Path
from pydantic import BaseModel, Field
from sklearn.metrics import accuracy_score
from language_models.models.llm import OpenAILanguageModel
from language_models.agent import (
    Agent,
    Workflow,
    WorkflowLLMStep,
    OutputType,
    PromptingStrategy,
)
from language_models.proxy_client import ProxyClient
from language_models.settings import settings

In [None]:
# Client handed to every OpenAILanguageModel below; all four connection
# values are read from the imported `settings` object.
proxy_client = ProxyClient(
    client_id=settings.CLIENT_ID,
    client_secret=settings.CLIENT_SECRET,
    auth_url=settings.AUTH_URL,
    api_base=settings.API_BASE,
)

## Sentiment Analysis

For sentiment analysis of tweets, we can automate the process using an LLM to classify sentiment and store its reasoning. Instead of a chat-based application where users manually input tweets, the LLM can analyze tweets in bulk, classify sentiment, and record reasoning. Additionally, the LLM can extract properties such as tagged users and hashtags, providing a comprehensive dataset.

In [None]:
# Load the gzip-compressed tweets CSV. The file is read without a header row,
# so the column names are supplied here.
df_tweets = pd.read_csv(
    "./assets/datasets/tweets.csv.gz",
    compression="gzip",
    encoding="latin-1",
    names=["sentiment", "id", "date", "query", "user", "tweet"],
)
df_tweets = df_tweets.dropna()
# Boolean indexing actually removes the rows labelled 2. DataFrame.where would
# keep them as all-NaN rows, which could then be sampled and would break the
# accuracy computation later on. `.copy()` makes the column assignment below
# operate on an independent frame.
df_tweets = df_tweets[df_tweets.sentiment != 2].copy()
# Re-encode the labels to the 1/0 scheme the prompt asks the model to answer with.
df_tweets["sentiment"] = df_tweets["sentiment"].map({4: 1, 0: 0})
# Random sample of 20 tweets (no seed is set, so it differs between runs).
df_tweets_sampled = df_tweets.sample(20)
df_tweets_sampled.head()

In [None]:
# Instructions for the sentiment agent: answer 1 (positive), 0 (negative) or
# -1 (no tweet received), give the reason, and list the mentioned users.
# The wording now consistently refers to a tweet (it previously said "review").
system_prompt = """Take the following tweet and determine the sentiment of the tweet.

Respond with 1 (positive) or 0 (negative).

If you don't receive a tweet, respond with -1.

Additionally, provide the reason for your choice and extract all mentioned users."""

# Model used for sentiment classification, reached through the proxy client.
llm = OpenAILanguageModel(
    proxy_client=proxy_client,
    model='gpt-4',
    max_tokens=250,
    temperature=0.2,
)

# Output schema of the sentiment agent: one parsed answer per tweet.
# NOTE(review): the Field descriptions are runtime strings, presumably passed
# to the model together with the schema — confirm before rewording them.
class Tweet(BaseModel):
    # Integer label; the system prompt asks for 1, 0 or -1.
    sentiment: int = Field(description="The sentiment of the tweet")
    # Free-text justification for the chosen label.
    reason: str = Field(description="The reason why you chose the sentiment")
    # Users tagged in the tweet.
    tagged: list[str] = Field(description="A list of people that are tagged, e.g. @user")

# Agent that classifies a single tweet. The `{tweet}` placeholder in the prompt
# is declared in `prompt_variables`, and the answer is requested as a `Tweet`
# object (OutputType.OBJECT) using the SINGLE_COMPLETION prompting strategy.
sentiment_analysis_agent = Agent.create(
    llm=llm,
    system_prompt=system_prompt,
    prompt="Tweet:\n{tweet}",
    prompt_variables=["tweet"],
    output_type=OutputType.OBJECT,
    output_schema=Tweet,
    prompting_strategy=PromptingStrategy.SINGLE_COMPLETION,
    verbose=True,
)

In [None]:
def classify_sentiment(row: pd.Series) -> pd.Series:
    """Classify one tweet with the sentiment agent.

    Args:
        row: A DataFrame row that exposes the tweet text as ``row.tweet``.

    Returns:
        A Series indexed by ``prediction``, ``reason`` and ``tagged``. Falsy
        fields in the agent's answer fall back to ``0``, ``""`` and ``[]``
        respectively; a ``-1`` sentiment is passed through unchanged.
    """
    try:
        output = sentiment_analysis_agent.invoke({"tweet": row.tweet})
        answer = output.final_answer
        values = [answer.sentiment or 0, answer.reason or "", answer.tagged or []]
    finally:
        # Reset the agent's chat even when the call fails or the cell is
        # interrupted, so a re-run does not start from leftover chat state
        # (assumes reset() clears the conversation — confirm against Agent).
        sentiment_analysis_agent.chat.reset()
    return pd.Series(values, index=["prediction", "reason", "tagged"])

In [6]:
# Run the agent once per sampled tweet (row-wise apply) and store the three
# returned values as new columns on the sample.
df_tweets_sampled[["prediction", "reason", "tagged"]] = df_tweets_sampled.apply(classify_sentiment, axis=1)

KeyboardInterrupt: 

In [None]:
# Show up to 20 rows — the whole sample — including the new prediction columns.
df_tweets_sampled.head(20)

In [None]:
# Compare the mapped ground-truth labels with the agent's predictions.
print(f"Accuracy: {accuracy_score(df_tweets_sampled.sentiment, df_tweets_sampled.prediction)}")

## Comparing Jobs



In [None]:
def get_job(path: str) -> str:
    """Return the full text of the job posting stored at *path*.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of raising ``UnicodeDecodeError``, matching how the
    bulk loader further down reads the same directory.

    Args:
        path: Location of the job posting text file.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as file:
        return file.read()

# Raw text of the two postings that the comparison workflow below receives as input.
job1 = get_job("./assets/datasets/jobs/ELECTRICAL ENGINEERING ASSOCIATE 7525 093016 REV 100416.txt")
job2 = get_job("./assets/datasets/jobs/ELECTRICAL MECHANIC 3841 012017.txt")

In [None]:
# Rebinds `system_prompt` and `llm` for the job-extraction agents; the
# sentiment agent keeps the objects it was created with.
system_prompt = "Take the following job and extract data about the job"

# Same model as before, with a larger completion budget (500 tokens).
llm = OpenAILanguageModel(
    proxy_client=proxy_client,
    model='gpt-4',
    max_tokens=500,
    temperature=0.2,
)

# Output schema of the two per-job extraction agents used in the comparison
# workflow. The name `Job` is rebound to a richer model later in the notebook.
class Job(BaseModel):
    title: str = Field(description="The job title.")
    duties: str = Field(description="The duties of the job.")
    # Each range is kept as one formatted string rather than as numeric fields.
    salary: list[str] = Field(description="A list of salary ranges. Format: 'min salary to max salary'.")

# Extracts a `Job` object from the first posting; the whole prompt is the
# `{job1}` variable.
job_agent1 = Agent.create(
    llm=llm,
    system_prompt=system_prompt,
    prompt="{job1}",
    prompt_variables=["job1"],
    output_type=OutputType.OBJECT,
    output_schema=Job,
    prompting_strategy=PromptingStrategy.SINGLE_COMPLETION,
    verbose=True,
)

# Same configuration as `job_agent1`, but reading the `{job2}` variable.
job_agent2 = Agent.create(
    llm=llm,
    system_prompt=system_prompt,
    prompt="{job2}",
    prompt_variables=["job2"],
    output_type=OutputType.OBJECT,
    output_schema=Job,
    prompting_strategy=PromptingStrategy.SINGLE_COMPLETION,
    verbose=True,
)

# Rebinds `system_prompt` for the comparison agent; the extraction agents
# above were already created with the previous value.
system_prompt = "Take the following 2 job descriptions and respond with the similarities and differences of the jobs."

# Prompt template for the comparison. The placeholders read the `title`,
# `duties` and `salary` attributes of `job1` and `job2`, i.e. the field names
# of the `Job` schema the extraction agents return.
# NOTE(review): presumably filled with the outputs of the workflow steps named
# "job1" and "job2" — confirm against Workflow.
task_prompt = """Compare the 2 given job descriptions:

Job 1:
Job title: {job1.title}
Job duties:
{job1.duties}
Salary:
{job1.salary}

Job 2:
Job title: {job2.title}
Job duties:
{job2.duties}
Salary:
{job2.salary}"""

# Output schema of the comparison agent: two free-text fields.
class JobComparison(BaseModel):
    similarities: str = Field(description="The job similarities.")
    differences: str = Field(description="The job differences.")

# Agent that fills `task_prompt` from the `job1` and `job2` variables and
# answers with a `JobComparison` object.
job_comparison_agent = Agent.create(
    llm=llm,
    system_prompt=system_prompt,
    prompt=task_prompt,
    prompt_variables=["job1", "job2"],
    output_type=OutputType.OBJECT,
    output_schema=JobComparison,
    prompting_strategy=PromptingStrategy.SINGLE_COMPLETION,
    verbose=True,
)

In [None]:
# Input schema of the workflow: the raw text of the two postings to compare.
class CompareJobs(BaseModel):
    job1: str = Field(description="The first job")
    job2: str = Field(description="The second job")

# Three-step workflow: extract each posting with its own agent, then compare.
# The workflow output is taken from "job_comparison", the name of the last step.
# NOTE(review): the steps "job1"/"job2" share their names with the workflow
# inputs; presumably each step's result replaces the raw text under that name
# before the comparison prompt is rendered — confirm against Workflow.
workflow = Workflow(
    name="Compare Jobs",
    description="Allows you to compare key information of 2 jobs",
    inputs=CompareJobs,
    output="job_comparison",
    steps=[
        WorkflowLLMStep(name="job1", agent=job_agent1),
        WorkflowLLMStep(name="job2", agent=job_agent2),
        WorkflowLLMStep(name="job_comparison", agent=job_comparison_agent),
    ],
)

In [None]:
# Run the workflow on the two postings loaded earlier.
output = workflow.invoke({"job1": job1, "job2": job2})

In [None]:
# `output.output` carries the fields declared on `JobComparison`.
print(output.output.similarities)

In [None]:
# Second field of the comparison result.
print(output.output.differences)

## Transforming Unstructured Data into Structured Formats

In [None]:
# Read every regular file in the jobs directory into memory as text.
# Undecodable bytes are replaced rather than raising, so every posting loads.
path = Path("./assets/datasets/jobs")
filenames = [entry.name for entry in path.iterdir() if entry.is_file()]
jobs = [
    (path / filename).read_text(encoding="utf-8", errors="replace")
    for filename in filenames
]

In [None]:
# Rebinds `system_prompt` for the bulk extraction agent. The bullet list names
# the same eight pieces of information that the `Job` schema below declares.
system_prompt = """Take the following job and extract data about the job.

Respond with the job information:
- job title: title of the job.
- job class no: class number as an integer.
- job duties: duties of the job.
- open date: when the position was created. Use DD-MM-YYYY.
- salary: the salary ranges.
- deadline: when the application deadline is. Use DD-MM-YYYY.
- application form: online or email or fax.
- where to apply: url or location."""

# Rebinds `llm` to the 'gpt-4-32k' model with a 1000-token completion budget;
# the agents created earlier keep the model they were given.
llm = OpenAILanguageModel(
    proxy_client=proxy_client,
    model='gpt-4-32k',
    max_tokens=1000,
    temperature=0.2,
)

# One salary range of a posting, used as the element type of `Job.salary`.
class Salary(BaseModel):
    description: str = Field(description="The description of the level")
    # Bounds are numeric, so they can be compared or aggregated after export.
    min_salary: float = Field(description="The minimum salary for this position or level")
    max_salary: float = Field(description="The maximum salary for this position or level")

# Structured record extracted from one job posting. This rebinds the name
# `Job`; the comparison agents created earlier keep the previous model.
# Documented with comments rather than a docstring so the schema description
# generated from this model stays untouched.
class Job(BaseModel):
    job_title: str = Field(description="The job title.")
    job_class_no: int = Field(description="The job class number as an integer.")
    job_duties: str = Field(description="The duties of the job.")
    # Dates stay strings in the DD-MM-YYYY format requested by the system prompt.
    open_date: str = Field(description="When the position was opened. Format: DD-MM-YYYY.")
    # Each entry is a structured `Salary`, so the description no longer asks
    # for the 'min salary to max salary' string format of the earlier model.
    salary: list[Salary] = Field(description="A list of salary ranges, one entry per position or level.")
    deadline: str = Field(description="The application deadline. Format: DD-MM-YYYY")
    application_form: str = Field(description="The form of the application (e.g. online, fax, email).")
    where_to_apply: str = Field(description="The url to apply at or location to send the fax or email address.")

# Agent that turns one raw posting (`{job}`) into the fields of the `Job`
# schema. Unlike the agents above it uses OutputType.STRUCT and the
# CHAIN_OF_THOUGHT prompting strategy.
# NOTE(review): STRUCT presumably yields a plain dict-like result that
# pd.DataFrame can consume directly — confirm against Agent.
job_data_agent = Agent.create(
    llm=llm,
    system_prompt=system_prompt,
    prompt="{job}",
    prompt_variables=["job"],
    output_type=OutputType.STRUCT,
    output_schema=Job,
    prompting_strategy=PromptingStrategy.CHAIN_OF_THOUGHT,
    verbose=True,
)

In [None]:
def extract_jobs(jobs: list[str]) -> pd.DataFrame:
    """Extract structured data from each job posting.

    Args:
        jobs: Raw text of the job postings, one string per posting.

    Returns:
        A DataFrame built from the agent's final answers, one per posting, in
        input order. An empty input yields an empty DataFrame.
    """
    data = []
    for job in jobs:
        try:
            output = job_data_agent.invoke({"job": job})
            data.append(output.final_answer)
        finally:
            # Reset the agent's chat even when the call fails or is
            # interrupted, so a re-run does not start from leftover chat state
            # (assumes reset() clears the conversation — confirm against Agent).
            job_data_agent.chat.reset()
    return pd.DataFrame(data)

In [None]:
# One agent call per posting loaded from the jobs directory.
df_jobs = extract_jobs(jobs)

In [None]:
# Preview the first extracted records.
df_jobs.head()

In [None]:
# Persist the structured result as CSV, without the DataFrame index column.
df_jobs.to_csv("./assets/datasets/jobs.csv", index=False)