## Import libraries and set up Azure OpenAI and LangChain

In [1]:
import os
import sys
from datetime import datetime, timezone
from random import randint
import openai
from langchain_openai import AzureOpenAI
from langchain_openai import AzureChatOpenAI
from dotenv import load_dotenv
load_dotenv()
from langchain.chains import LLMChain
import requests
import concurrent.futures
import pandas as pd
import json
import time
from langchain import PromptTemplate
from langchain.chains import LLMChain
import concurrent.futures


In [2]:
# Azure OpenAI connection settings taken from environment variables; each
# value is None when the corresponding variable is not set.
endpoint = os.environ.get("ENDPOINT")
key = os.environ.get("KEY")
deployment_name = os.environ.get("DEPLOYMENT_NAME")
# Version string passed to the client as `api_version`.
model_preview = "2023-06-01-preview"
 

In [None]:
# Chat-model client built from the endpoint, key, deployment name and API
# version defined in the configuration cell.
llm = AzureChatOpenAI(
    azure_endpoint=endpoint,
    api_key=key,
    api_version=model_preview,
    deployment_name=deployment_name,
)
print(llm)

## Data exploration

In [4]:
# Relative path of the JSON file from which both the feedbacks and the
# categories are read in the cells below.
path_to_json_data = "data/feedbacks.json"

In [5]:

def get_random_rows(df, n, random_state=None):
    """
    Return up to ``n`` randomly chosen rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from; it is left unmodified.
    n : int
        Number of rows requested. When ``n`` exceeds the number of rows in
        ``df`` every row is returned (in random order) instead of letting
        ``DataFrame.sample`` raise ``ValueError``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``; pass a fixed value to get the
        same sample on every run. ``None`` (the default) keeps the sampling
        non-deterministic.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    # Sampling without replacement cannot return more rows than exist.
    sample_size = min(n, len(df))
    return df.sample(n=sample_size, random_state=random_state)


def create_dataframe_from_json(file_path):
    """
    Load the feedback entries of a JSON file into a DataFrame.

    The file is expected to hold a top-level ``"feedbacks"`` entry whose items
    are objects with ``"id"`` and ``"feedback"`` fields. The entry may be a
    mapping (only its values are used) or a plain list of such objects.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file.

    Returns
    -------
    pandas.DataFrame
        One row per feedback with the columns ``id`` and ``feedback``. A field
        missing from an entry becomes ``None``; a missing or empty
        ``"feedbacks"`` entry yields an empty frame with both columns.
    """
    # Decode explicitly as UTF-8 so non-ASCII feedback text does not depend on
    # the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    feedbacks = data.get('feedbacks') or {}
    # The mapping keys are not needed; only the feedback objects are.
    records = feedbacks.values() if isinstance(feedbacks, dict) else feedbacks

    return pd.DataFrame({
        'id': [record.get('id') for record in records],
        'feedback': [record.get('feedback') for record in records],
    })



In [None]:

# Load every feedback from the data file, then keep a random sample of 150
# rows as the working set.
df = get_random_rows(create_dataframe_from_json(path_to_json_data), 150)
print("df", df)

In [7]:
def extract_categories_from_json_file(json_file_path):
    """
    Return the value stored under ``"categories"`` in a JSON file.

    Parameters
    ----------
    json_file_path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever the file stores under the top-level ``"categories"`` key,
        returned unchanged.

    Raises
    ------
    KeyError
        If the file has no top-level ``"categories"`` key.
    """
    # Decode explicitly as UTF-8 so non-ASCII category names do not depend on
    # the platform's default encoding.
    with open(json_file_path, encoding='utf-8') as f:
        json_dict = json.load(f)

    return json_dict['categories']

In [8]:
def unique_items_in_a_list(my_list):
    """
    Collect the distinct comma-separated names found in a list of strings.

    Each string is split on ``,``; surrounding whitespace is stripped from
    every piece so that ``"a, b"`` and ``"b"`` do not produce two different
    entries, and empty pieces (e.g. from ``"a,,b"``) are ignored.

    Parameters
    ----------
    my_list : iterable of str
        Strings that each hold one or more comma-separated names.

    Returns
    -------
    list of str
        The distinct names in order of first appearance.
    """
    seen = set()  # O(1) membership test; `unique` preserves the order
    unique = []
    for item in my_list:
        for sub_item in item.split(','):
            name = sub_item.strip()
            if name and name not in seen:
                seen.add(name)
                unique.append(name)
    return unique

In [None]:
# Read the raw category entries from the data file, then print them followed
# by their count.
categories = extract_categories_from_json_file(path_to_json_data)
print(categories, len(categories), sep="\n")

# Reduce the entries to the distinct category names and print those the same way.
unique_categories = unique_items_in_a_list(categories)
print(unique_categories, len(unique_categories), sep="\n")

In [10]:
# Time the de-duplication of the categories. perf_counter() is a monotonic,
# high-resolution clock meant for measuring short intervals; time.time() is a
# wall clock that can be adjusted while the measurement is running.
start_time = time.perf_counter()
unique_categories = unique_items_in_a_list(categories)
end_time = time.perf_counter()
processing_time = end_time - start_time
print("Processing time:", processing_time, "seconds")


Processing time: 0.0038220882415771484 seconds


### Sentiment Analysis

In [None]:
# Prompt text for sentiment classification. `{feedback}` is the only
# placeholder; create_classification_prompt() declares it as the template's
# input variable.
sentiment_classification_prompt = """
Act as a highly intelligent feedback analyser and classify the given feedbacks text into one of the following sentiments only 1. positive 2.negative 3.neutral 4. other
Do not code. Return only one word answer with only the sentiment that the given feedback text belongs to
feedback: {feedback}

"""


def create_classification_prompt() -> PromptTemplate:
    """
    Build the LangChain prompt used for sentiment classification.

    Returns
    -------
    PromptTemplate
        Template wrapping ``sentiment_classification_prompt`` with
        ``feedback`` as its single input variable.
    """
    return PromptTemplate(
        template=sentiment_classification_prompt,
        input_variables=["feedback"],
    )

# Build the sentiment prompt and combine it with the LLM into a chain; both
# objects are printed for inspection.
llm_prompt = create_classification_prompt()
print(llm_prompt)
llm_chain = LLMChain(
    prompt=llm_prompt,
    llm=llm,
    memory=None,
    verbose=False,
    output_key="response",
)
print(llm_chain)

In [None]:
# Try the sentiment chain on a single feedback: the one at positional row 132.
predicted_sentiment = llm_chain.run(df['feedback'].iloc[132])
predicted_sentiment

In [13]:

def add_sentiment_column(df):
    """
    Return a copy of ``df`` with a ``sentiment`` column of empty strings.

    The frame passed in is not modified; if it already has a ``sentiment``
    column, the copy's column is overwritten with empty strings.
    """
    return df.assign(**{"sentiment": ""})

# Sequential labelling: run the chain once per row and store the answers in the dataframe
def add_column_predicted_labels(df, llm_chain):
    """
    Fill ``df["sentiment"]`` with the lower-cased chain output for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``feedback`` column; it is modified in place.
    llm_chain
        Object whose ``run(text)`` method returns a string.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for label, text in df["feedback"].items():
        prediction = llm_chain.run(text)
        df.at[label, "sentiment"] = prediction.lower()
    return df

In [None]:
# Display the sampled DataFrame as the cell output.
df

In [None]:
# Classify the sentiment of every feedback, running up to 4 chain calls at a
# time in a thread pool, and time the whole pass.
start_time = time.time()


# define a function to make the LLM request
def make_llm_request_lower(llm_chain, feedback):
    """Run ``llm_chain`` on one feedback text and return the lower-cased answer."""
    return llm_chain.run(feedback).lower()


# The context manager shuts the pool down once every submitted call has
# finished, so its worker threads do not outlive this cell.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # One future per dataframe row, created in row order.
    futures = [
        executor.submit(make_llm_request_lower, llm_chain, row.feedback)
        for row in df.itertuples()
    ]
    # Collect results in *submission* order. as_completed() yields futures in
    # the order they finish, which would attach labels to the wrong rows.
    results = [future.result() for future in futures]

# add the predicted labels to the dataframe (results[i] belongs to row i)
df["sentiment"] = results


end_time = time.time()
processing_time = end_time - start_time
print("Processing time:", processing_time, "seconds")

In [None]:
# Display the DataFrame again, now with the `sentiment` column assigned in the previous cell.
df

### Category classification

In [17]:

# Prompt text for multi-label category classification. `{categories}` receives
# the allowed category names and `{feedback}` the text to classify. The wording
# refers to the input consistently as feedback text so the instruction matches
# what is actually placed inside the <feedback> tags.
category_classification_prompt = """
Act as a highly intelligent feedback classifier and classify the given feedback text into one or more of the following categories:
<feedback_categories>
{categories}
</feedback_categories>.
Do not code. Return only the category names that the given feedback text belongs to.
<feedback>
{feedback}
</feedback>.
Categories:
"""

In [18]:

def create_category_classification_prompt(categories) -> PromptTemplate:
    """
    Build the LangChain prompt used for category classification.

    Parameters
    ----------
    categories
        Value bound to the ``{categories}`` placeholder of
        ``category_classification_prompt``.

    Returns
    -------
    PromptTemplate
        Template with ``categories`` already filled in, leaving ``feedback``
        as the only variable still to be supplied.
    """
    base_prompt = PromptTemplate(
        template=category_classification_prompt,
        input_variables=["feedback", "categories"],
    )
    # Pre-bind the categories so callers only need to provide the feedback.
    return base_prompt.partial(categories=categories)



In [None]:

# Build the category prompt from the distinct category names and combine it
# with the LLM into a second chain; both objects are printed for inspection.
llm_prompt2 = create_category_classification_prompt(unique_categories)
print(llm_prompt2)
llm_chain2 = LLMChain(
    prompt=llm_prompt2,
    llm=llm,
    memory=None,
    verbose=False,
    output_key="response",
)
print(llm_chain2)

In [None]:
# Show the feedback text stored at positional row 132.
df['feedback'].iloc[132]

In [None]:
# Try the category chain on a single feedback: the one at positional row 132.
predicted_category = llm_chain2.run(df['feedback'].iloc[132])
predicted_category

In [23]:
# Classify the categories of every feedback, running up to 4 chain calls at a
# time in a thread pool, and time the whole pass.
start_time = time.time()


def make_llm_request(llm_chain, feedback):
    """Run ``llm_chain`` on one feedback text and return its answer unchanged."""
    return llm_chain.run(feedback)


# The context manager shuts the pool down once every submitted call has
# finished, so its worker threads do not outlive this cell.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # One future per dataframe row, created in row order.
    futures = [
        executor.submit(make_llm_request, llm_chain2, row.feedback)
        for row in df.itertuples()
    ]
    # Collect results in *submission* order. as_completed() yields futures in
    # the order they finish, which would attach categories to the wrong rows.
    results = [future.result() for future in futures]

# add the predicted labels to the dataframe (results[i] belongs to row i)
df["category"] = results


end_time = time.time()
processing_time = end_time - start_time
print("Processing time:", processing_time, "seconds")

In [None]:
# List the distinct values that ended up in the sentiment column.
unique_sentiment = df["sentiment"].unique()
print(unique_sentiment)

In [None]:
# Write the DataFrame to a CSV file; index=False leaves the row index out of the file.
df.to_csv('data/my_dataframe.csv', index=False)