In [37]:
import pandas as pd
import json
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # type: ignore
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import tqdm as notebook_tqdm

In [5]:
# Load the chat history JSON file
with open('./data/chat_history.json', 'r') as file:
    chat_data = json.load(file)

In [35]:
chat_data

[{'customer_id': 'C001',
  'chat_history': [{'timestamp': '2024-07-01T10:00:00Z',
    'message': "Hi, I'm looking for a summer dress.",
    'cleaned_text': "hi, i'm looking for a summer dress."},
   {'timestamp': '2024-07-01T10:01:00Z',
    'message': 'Do you have anything in yellow?',
    'cleaned_text': 'do you have anything in yellow?'},
   {'timestamp': '2024-07-01T10:02:00Z',
    'agent_message': 'Hello! Yes, we have a few options. Would you like a casual or formal dress?',
    'cleaned_text': 'hello! yes, we have a few options. would you like a casual or formal dress?'},
   {'timestamp': '2024-07-01T10:03:00Z',
    'message': 'A casual dress, please.',
    'cleaned_text': 'a casual dress, please.'},
   {'timestamp': '2024-07-01T10:04:00Z',
    'agent_message': 'Great! Here are a few yellow casual dresses. [Link to products]',
    'cleaned_text': 'great! here are a few yellow casual dresses. [link to products]'}]},
 {'customer_id': 'C002',
  'chat_history': [{'timestamp': '2024-07

In [8]:
# Clean the text data
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercase
    return text

In [14]:
# Sentiment analysis
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

In [10]:
# Adding cleaned text to chat_data 
for customer in chat_data:
    for message in customer['chat_history']:
        if 'message' in message:
            message['cleaned_text'] = clean_text(message['message'])
        elif 'agent_message' in message:
            message['cleaned_text'] = clean_text(message['agent_message'])

In [41]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)



OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct.
401 Client Error. (Request ID: Root=1-668a7e86-7d8e9df6709f07673d4ecc1d;6f3d7f7f-4666-4953-bfe2-96ba35732cb8)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B-Instruct is restricted. You must be authenticated to access it.

# Playground 

In [15]:
# Topic modeling
all_texts = [message['cleaned_text'] for customer in chat_data for message in customer['chat_history'] if 'cleaned_text' in message]


In [16]:
all_texts

["hi, i'm looking for a summer dress.",
 'do you have anything in yellow?',
 'hello! yes, we have a few options. would you like a casual or formal dress?',
 'a casual dress, please.',
 'great! here are a few yellow casual dresses. [link to products]',
 'hello, do you have size  in black jeans?',
 'hi! let me check that for you.',
 'yes, we have size  in stock for black jeans. [link to product]',
 'thank you! can i place an order here?',
 "sure! i'll assist you with placing the order.",
 'hi, can you recommend a jacket for cold weather?',
 'hello! of course. do you have a preference for style or color?',
 'something stylish and in blue, please.',
 'we have several stylish blue jackets. here are some options. [link to products]',
 'these look great, thank you!',
 "hello, i'm looking for a gift for my wife. any suggestions?",
 'hi! sure, can you tell me a bit about her style?',
 'she loves elegant dresses.',
 'we have some beautiful elegant dresses that would make a perfect gift. [link to

In [17]:

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(all_texts)


In [19]:
X

<25x62 sparse matrix of type '<class 'numpy.int64'>'
	with 107 stored elements in Compressed Sparse Row format>

In [20]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

In [31]:
def print_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))



In [29]:
lda.components_[3]

0.20001196670682994

In [34]:
print_topics(lda, vectorizer.get_feature_names_out(), 3)

Topic 0:
thank hello style
Topic 1:
hi summer blue
Topic 2:
yes hello black
Topic 3:
looking hi gift
Topic 4:
dresses link great


In [11]:
message['cleaned_text']

'thank you!'

In [12]:
customer

{'customer_id': 'C005',
 'chat_history': [{'timestamp': '2024-07-05T16:45:00Z',
   'message': 'Hi, do you have any promotions or discounts available?',
   'cleaned_text': 'hi, do you have any promotions or discounts available?'},
  {'timestamp': '2024-07-05T16:46:00Z',
   'agent_message': 'Hello! Yes, we currently have a 20% discount on all summer wear.',
   'cleaned_text': 'hello! yes, we currently have a % discount on all summer wear.'},
  {'timestamp': '2024-07-05T16:47:00Z',
   'message': "That's great! How can I apply the discount?",
   'cleaned_text': "that's great! how can i apply the discount?"},
  {'timestamp': '2024-07-05T16:48:00Z',
   'agent_message': 'You can use the code SUMMER20 at checkout to avail the discount.',
   'cleaned_text': 'you can use the code summer at checkout to avail the discount.'},
  {'timestamp': '2024-07-05T16:49:00Z',
   'message': 'Thank you!',
   'cleaned_text': 'thank you!'}]}

In [13]:
chat_data

[{'customer_id': 'C001',
  'chat_history': [{'timestamp': '2024-07-01T10:00:00Z',
    'message': "Hi, I'm looking for a summer dress.",
    'cleaned_text': "hi, i'm looking for a summer dress."},
   {'timestamp': '2024-07-01T10:01:00Z',
    'message': 'Do you have anything in yellow?',
    'cleaned_text': 'do you have anything in yellow?'},
   {'timestamp': '2024-07-01T10:02:00Z',
    'agent_message': 'Hello! Yes, we have a few options. Would you like a casual or formal dress?',
    'cleaned_text': 'hello! yes, we have a few options. would you like a casual or formal dress?'},
   {'timestamp': '2024-07-01T10:03:00Z',
    'message': 'A casual dress, please.',
    'cleaned_text': 'a casual dress, please.'},
   {'timestamp': '2024-07-01T10:04:00Z',
    'agent_message': 'Great! Here are a few yellow casual dresses. [Link to products]',
    'cleaned_text': 'great! here are a few yellow casual dresses. [link to products]'}]},
 {'customer_id': 'C002',
  'chat_history': [{'timestamp': '2024-07