In [None]:
import pandas as pd
import json
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # type: ignore
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import tqdm as notebook_tqdm

In [None]:
# Load the chat history JSON file.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding (which is not UTF-8 everywhere).
with open('./data/chat_history.json', 'r', encoding='utf-8') as file:
    chat_data = json.load(file)

In [None]:
# Display the parsed JSON exactly as loaded, before any cleaning is applied.
# NOTE(review): this renders the entire structure; output may be large.
chat_data

In [None]:
# Clean the text data
def clean_text(text):
    """Normalize one chat message for downstream text analysis.

    Removes URLs, @mentions, #hashtags and digit runs, then lowercases
    the remainder. Whitespace is left untouched, so removed tokens can
    leave consecutive spaces behind.

    Args:
        text: The raw message string. ``None`` is treated as an empty
            message.

    Returns:
        The cleaned, lowercased string ("" when ``text`` is ``None``).
    """
    if text is None:
        # A missing message body has nothing to clean.
        return ""
    # URLs are stripped before lowercasing, so the match must ignore case
    # or links written as "HTTP://..." / "Https://..." would survive.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text.lower()  # Convert to lowercase

In [None]:
# Sentiment analysis
def get_sentiment(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    return TextBlob(text).sentiment.polarity

In [None]:
# Attach a cleaned copy of each chat entry's text to chat_data (the entries
# are updated in place under the 'cleaned_text' key).
# The loop names `customer` and `message` are kept as-is because later cells
# read them after the loop has finished.
for customer in chat_data:
    for message in customer['chat_history']:
        # 'message' takes precedence over 'agent_message' when both exist.
        if 'message' in message:
            raw_key = 'message'
        elif 'agent_message' in message:
            raw_key = 'agent_message'
        else:
            continue  # neither field present: leave the entry untouched
        message['cleaned_text'] = clean_text(message[raw_key])

In [None]:
# Model identifier passed to AutoTokenizer.from_pretrained below.
# NOTE(review): no later visible cell uses `tokenizer` — confirm it is still needed.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Model loading is currently disabled. Re-enabling the line below also needs
# `import torch`, which the visible import cell does not contain.
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)



# Playground 

In [None]:
# Topic modeling: flatten every cleaned message into a single corpus list.
# Entries that never received a 'cleaned_text' key are skipped.
all_texts = [
    entry['cleaned_text']
    for record in chat_data
    for entry in record['chat_history']
    if 'cleaned_text' in entry
]


In [None]:
# Inspect the flattened list of cleaned messages that feeds the vectorizer.
# NOTE(review): this renders the whole list; output may be large.
all_texts

In [None]:

# Count-based bag-of-words features using the built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')
# Fit the vectorizer on all_texts and transform it in one step; X is the
# input later handed to the LDA model.
X = vectorizer.fit_transform(all_texts)


In [None]:
# Show the repr of the vectorized corpus as a quick sanity check.
X

In [None]:
# LDA topic model with 5 topics; random_state is fixed so repeated runs on
# the same input give the same result.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
# Fit on the vectorized corpus. As the last expression of the cell, the
# return value of fit() is also displayed.
lda.fit(X)

In [None]:
def print_topics(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, iterated row by row with
            each row treated as one topic's per-feature weights.
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: How many words to print per topic, heaviest first.
    """
    for index, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest_first = weights.argsort()[::-1]
        top_words = [feature_names[i] for i in heaviest_first[:n_top_words]]
        print(f"Topic {index}:")
        print(" ".join(top_words))



In [None]:
# Inspect the raw per-word weights of the topic at index 3 (one of the 5
# topics fitted above).
lda.components_[3]

In [None]:
# Print the 3 highest-weighted vocabulary words for each fitted topic.
print_topics(lda, vectorizer.get_feature_names_out(), 3)

In [None]:
# NOTE(review): `message` is the loop variable left over from the cleaning
# loop, so this shows the last entry that loop visited. It raises NameError
# if that loop never ran an iteration, and KeyError if that last entry had
# neither text field.
message['cleaned_text']

In [None]:
# NOTE(review): `customer` is the loop variable left over from the cleaning
# loop, so this shows the last record that loop visited.
customer

In [None]:
# Display chat_data again; entries processed by the cleaning loop now carry
# a 'cleaned_text' key.
# NOTE(review): this renders the entire structure; output may be large.
chat_data