In [1]:
import pandas as pd
import re
import json
import numpy as np
import os

In [2]:
ds = pd.read_csv('kaggle_dataset/twcs/twcs.csv')
print(f"Number of rows in ds: {ds.shape[0]}")

Number of rows in ds: 2811774


In [3]:
def author_id_frequency_distribution(df):
    """
    Function to get the frequency distribution of author_ids in the dataframe, 
    including only those where author_id is a string and not a numeric string.
    
    Parameters:
    - df (pandas.DataFrame): The dataset of tweets.
    
    Returns:
    - pandas.Series: The frequency distribution of author_id.
    """
    # Filter out rows where author_id is a string and not numeric
    df_filtered = df[df["author_id"].apply(lambda x: isinstance(x, str) and not x.isdigit())]
    
    # Calculate the frequency distribution of author_id
    freq_distribution = df_filtered["author_id"].value_counts()
    
    return freq_distribution

In [4]:
author_id_frequency_distribution(ds)

author_id
AmazonHelp        169840
AppleSupport      106860
Uber_Support       56270
SpotifyCares       43265
Delta              42253
                   ...  
JackBox              266
OfficeSupport        218
AskDSC               210
CarlsJr              196
HotelTonightCX       152
Name: count, Length: 108, dtype: int64

In [5]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove everything starting with @ (user mentions and company mentions)
    text = re.sub(r'@\S+', '', text)  # This removes all mentions starting with @
    
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    
    # Remove punctuation (except for spaces and alphanumeric characters)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove initials at the end of messages (e.g., "CB", "HP")
    text = re.sub(r'\b[A-Z]{2}\b$', '', text)

    # Normalize spaces (replace multiple spaces with one)
    text = ' '.join(text.split())

    return text

In [6]:
def process_conversations(df, system_message, company_name=None):
    """
    Function to process conversations from a DataFrame.
    It turns messages into threads compatible with OpenAI finetuning module
    
    Parameters:
    - df (pandas.DataFrame): The dataset of tweets.
    - system_message (str): The system message for the assistant.
    - company_name (str): The name of the company to filter conversations (optional).
    
    Returns:
    - List of conversations in OpenAI format.
    """
    
    # Set to track processed tweet IDs
    processed = set()

    conversations = []

    # Iterate through dataset to construct conversations
    for index, row in df.iterrows():
        tweet_id = row["tweet_id"]
        
        # Skip if already processed
        if tweet_id in processed:
            continue
        
        # Start new conversation if it's an inbound message (user) and not a reply
        if row["inbound"] and pd.isna(row["in_response_to_tweet_id"]):
            conversation = [{"role": "system", "content": system_message}]
            queue = [tweet_id]  # Use a queue to process messages in order
            last_message_is_assistant = False
            company_assistant_messages = True

            # Process conversation thread
            while queue:
                current_id = queue.pop(0)

                # Check if current_id exists in df
                if current_id not in df['tweet_id'].values:
                    continue
                
                # Get message details
                message = df[df["tweet_id"] == current_id].iloc[0]
                role = "user" if message["inbound"] else "assistant"
                
                # If company name is provided, filter outbound messages by company author_id
                if company_name and message["inbound"] == False and message["author_id"] != company_name:
                    company_assistant_messages = False
                    break
                
                # Add message to conversation
                conversation.append({"role": role, "content": clean_text(message["text"])})
                
                # Check if this message is from the assistant
                if role == "assistant":
                    last_message_is_assistant = True
                else:
                    last_message_is_assistant = False
                
                # Mark as processed
                processed.add(current_id)
                
                # Find all responses to this message
                next_responses = df[df["in_response_to_tweet_id"] == current_id]
                
                if not next_responses.empty:
                    # Add tweet_id of all responses to the queue
                    for _, response in next_responses.iterrows():
                        queue.append(response["tweet_id"])

            if last_message_is_assistant and company_assistant_messages:
                conversations.append(conversation)

    return conversations

In [7]:
def process_in_chunks(df, chunk_size, system_message, company_name=None):
    """
    Process the DataFrame in chunks to get conversations.
    
    Parameters:
    - df (pandas.DataFrame): The dataset of tweets.
    - chunk_size (int): The size of each chunk.
    - system_message (str): The system message for the assistant.
    - company_name (str): The name of the company to filter conversations (optional).
    
    Returns:
    - List of conversations in the required format.
    """
    conversations = []

    chunks = np.array_split(df, len(df) // chunk_size + 1)

    for chunk in chunks:
        print(f"Processing chunk with {len(chunk)} records.")
        chunk_conversations = process_conversations(chunk, system_message, company_name)
        conversations.extend(chunk_conversations)
    
    return conversations

In [8]:
system_message = "You are a polite customer assistant whose goal is to provide effective help."
chunk_size = 50000  # You can adjust the chunk size based on your system's capacity
company_name = 'VirginTrains'

# Assuming `ds` is your dataframe
conversations = process_in_chunks(ds, chunk_size, system_message, company_name)

print(f"Total number of conversations: {len(conversations)}")

  return bound(*args, **kwds)


Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49330 records.
Processing chunk with 49329 records.
Processing chunk with 49329 records.
Processing chunk with 49329 records.
Processing chunk with 49329 records.
Processing chunk with 49329 records.
Processing chunk with 49329 records.
P

In [9]:
conversations[0:2]

[[{'role': 'system',
   'content': 'You are a polite customer assistant whose goal is to provide effective help.'},
  {'role': 'user',
   'content': 'so i wait almost 3 hours and then they are rude and arrogant amp unhelpful after which she is raising a technical case'},
  {'role': 'assistant',
   'content': 'If youre unhappy with your experience on this call please contact us on our website'}],
 [{'role': 'system',
   'content': 'You are a polite customer assistant whose goal is to provide effective help.'},
  {'role': 'user',
   'content': 'Gotta luv finally get 2 call cntr they tried to charge me 176 for a ticket which can b purchased for 88 online'},
  {'role': 'assistant',
   'content': 'When amending a ticket the system would show a fare that matches your previous purchase but this may be higher'}]]

In [None]:
# Save to a JSONL file
output_file = "openai_files/multi_turn_conversation_train.jsonl"

with open(output_file, "w", encoding="utf-8") as file:
    for conversation in conversations:
        file.write(json.dumps(conversation, ensure_ascii=False) + "\n")

print(f"Cleaned conversations saved to {output_file}")