# Data Loading 

In [1]:
import pandas as pd
import re
from datasets import load_dataset
from textwrap import wrap
import json
import os

os.makedirs("prepared", exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np

In [3]:
hf_datasets = {}

hf_datasets["customer_support1"] = load_dataset("Tobi-Bueck/customer-support-tickets")
hf_datasets["helpdesk_synth"] = load_dataset("Console-AI/IT-helpdesk-synthetic-tickets")
hf_datasets["it_troubleshooting"] = load_dataset("UmerSajid/IT-Troubleshooting-Dataset")
hf_datasets["techqa"] = load_dataset("nvidia/TechQA-RAG-Eval")
#new added
hf_datasets["customer_it_support"] = load_dataset("Talhat/Customer_IT_Support") 
hf_datasets["customer_support2"] = load_dataset("gorkemsevinc/customer_support_tickets")


local_paths = {
    "tech_support_dialogue": "data/Troubleshooting Dialogue/tech_support_dataset.csv",
    "routing_tickets": "data/Routing Engine/all_tickets_processed_improved_v3.csv",
    "it_support_ticket_data": "data/Routing Engine/IT Support Ticket Data.csv",
}

local_dfs = {name: pd.read_csv(path) for name, path in local_paths.items()}


In [4]:
def clean_text(t):
    if t is None:
        return ""
    t = str(t)

    # Remove emails / PII
    t = re.sub(r'\S+@\S+', '[EMAIL]', t)
    t = re.sub(r'\b\d{10,15}\b', '[PHONE]', t)

    # Replace IPs
    t = re.sub(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', '[IP]', t)

    # Preserve error codes
    t = re.sub(r'(ORA-\d+)', r' \1 ', t)
    t = re.sub(r'(0x[0-9A-Fa-f]+)', r' \1 ', t)
    t = re.sub(r'(HTTP\s?[45]\d{2})', r' \1 ', t)
    t = re.sub(r'(SQLSTATE\[\w+\])', r' \1 ', t)

    # Remove HTML
    t = re.sub(r'<[^>]+>', '', t)

    # Normalize whitespace
    t = re.sub(r'\s+', ' ', t).strip()

    return t


# Preparing Hugging Face Datasets

### 1] Tobi-Bueck/customer-support-tickets

In [5]:
df_hf_cust = hf_datasets["customer_support1"]["train"].to_pandas()
df_hf_cust["text"] = (df_hf_cust["subject"].fillna("") + " " + df_hf_cust["body"].fillna("")).apply(clean_text)
df_hf_cust["answer"] = df_hf_cust["answer"]
df_hf_cust["category"] = df_hf_cust["queue"]
df_hf_cust["priority"] = df_hf_cust["priority"]


In [6]:
len(df_hf_cust)

61765

In [7]:
df_hf_cust = df_hf_cust[df_hf_cust["language"].isin(["en"])]

In [8]:
len(df_hf_cust)

28261

In [9]:
df_hf_cust.isnull().sum()

subject      3639
body            1
answer          6
type            0
queue           0
priority        0
language        0
version     11923
tag_1           0
tag_2          16
tag_3         113
tag_4        2584
tag_5       11960
tag_6       20353
tag_7       24562
tag_8       26653
text            0
category        0
dtype: int64

In [10]:
df_hf_cust.head()

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,text,category
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...","Thank you for reaching out, <name>. We are awa...",Incident,Technical Support,high,en,51.0,Account,Disruption,Outage,IT,Tech Support,,,,"Account Disruption Dear Customer Support Team,...",Technical Support
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Thank you for your inquiry. Our products suppo...,Request,Returns and Exchanges,medium,en,51.0,Product,Feature,Tech Support,,,,,,Query About Smart Home System Integration Feat...,Returns and Exchanges
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",We appreciate you reaching out with your billi...,Request,Billing and Payments,low,en,51.0,Billing,Payment,Account,Documentation,Feedback,,,,Inquiry Regarding Invoice Details Dear Custome...,Billing and Payments
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Thank you for your inquiry. Our product suppor...,Problem,Sales and Pre-Sales,medium,en,51.0,Product,Feature,Feedback,Tech Support,,,,,Question About Marketing Agency Software Compa...,Sales and Pre-Sales
5,Feature Query,"Dear Customer Support,\n\nI hope this message ...",Thank you for your inquiry. Please specify whi...,Request,Technical Support,high,en,51.0,Feature,Product,Documentation,Feedback,,,,,"Feature Query Dear Customer Support,\n\nI hope...",Technical Support


### 2] Console-AI/IT-helpdesk-synthetic-tickets

In [11]:
df_hf_helpdesk = hf_datasets["helpdesk_synth"]["train"].to_pandas()
df_hf_helpdesk["text"] = (df_hf_helpdesk["subject"].fillna("") + " " + df_hf_helpdesk["description"].fillna("")).apply(clean_text)
df_hf_helpdesk["answer"] = None
df_hf_helpdesk["category"] = df_hf_helpdesk["category"]
df_hf_helpdesk["priority"] = df_hf_helpdesk["priority"]


In [12]:
len(df_hf_helpdesk)

500

In [13]:
df_hf_helpdesk.isnull().sum()

id                  0
subject             0
description         0
priority            0
category            0
createdAt           0
requesterEmail      0
text                0
answer            500
dtype: int64

In [14]:
df_hf_helpdesk.head()

Unnamed: 0,id,subject,description,priority,category,createdAt,requesterEmail,text,answer
0,1aiu3lrqi,Hey IT! Our network printer keeps disconnecting.,Hey IT! Our network printer keeps disconnectin...,Medium,Network,2024-10-28T18:36:55.004Z,jane.doe@acme.co,Hey IT! Our network printer keeps disconnectin...,
1,kz5mjjpox,Re: [Acme IT] Re: Ticket #98765 - Access Issue...,This is a follow-up to your previous request #...,High,Network,2024-10-28T18:36:56.156Z,user123@acme.co,Re: [Acme IT] Re: Ticket #98765 - Access Issue...,
2,86eza0fwq,Software Conflict Causing App Crashes,Hey team! :wave: We're experiencing some inter...,High,Software,2024-10-28T18:36:54.644Z,user@acme.co,Software Conflict Causing App Crashes Hey team...,
3,jtw509e3n,j.doe@acme.co Google Calendar Setup Assistance,We need to set up a new Google Calendar for te...,Medium,Software,2024-10-28T18:36:54.524Z,j.doe@acme.co,[EMAIL] Google Calendar Setup Assistance We ne...,
4,tso616mbn,Software Access: Asana Project for Jordan Smith,Jordan Smith is part of the Project Management...,Medium,Software,2024-10-28T18:36:54.816Z,jordan.smith@acme.co,Software Access: Asana Project for Jordan Smit...,


### 3] UmerSajid/IT-Troubleshooting-Dataset

In [15]:
df_hf_trouble = hf_datasets["it_troubleshooting"]["train"].to_pandas()
df_hf_trouble["text"] = (
    df_hf_trouble["Issue"].fillna("") + " " +
    df_hf_trouble["Symptoms"].fillna("") + " " +
    df_hf_trouble["Common Causes"].fillna("") + " " 
).apply(clean_text)

df_hf_trouble["answer"] = df_hf_trouble["Solution Steps"].apply(clean_text)
df_hf_trouble["category"] = df_hf_trouble["Category"]
df_hf_trouble["priority"] = df_hf_trouble["Severity"]

In [16]:
len(df_hf_trouble)

10500

In [17]:
df_hf_trouble.isnull().sum()

ID                           0
Category                     0
Issue                        0
Symptoms                     0
Solution Steps               0
Severity                     0
Estimated Resolution Time    0
Common Causes                0
Keywords                     0
Urdu Solution                0
Documentation Link           0
text                         0
answer                       0
category                     0
priority                     0
dtype: int64

In [18]:
df_hf_trouble.head()

Unnamed: 0,ID,Category,Issue,Symptoms,Solution Steps,Severity,Estimated Resolution Time,Common Causes,Keywords,Urdu Solution,Documentation Link,text,answer,category,priority
0,1,Cloud Computing,AWS instance not starting - Variant 30,Instance stuck in 'pending' state,"Check instance status in AWS console, review s...",Low,18 hours,"Typical configuration error, outdated software...","cloud computing, aws, instance stuck in 'pendi...","حل: Check instance status in AWS console, revi...",https://docs.aws.amazon.com/AWSEC2/latest/User...,AWS instance not starting - Variant 30 Instanc...,"Check instance status in AWS console, review s...",Cloud Computing,Low
1,2,Cloud Computing,AWS instance not starting - Variant 24,Instance stuck in 'pending' state,"Check instance status in AWS console, review s...",High,9 hours,"Typical configuration error, outdated software...","cloud computing, aws, instance stuck in 'pendi...","حل: Check instance status in AWS console, revi...",https://docs.aws.amazon.com/AWSEC2/latest/User...,AWS instance not starting - Variant 24 Instanc...,"Check instance status in AWS console, review s...",Cloud Computing,High
2,3,Cloud Computing,AWS instance not starting - Variant 25,Instance stuck in 'pending' state,"Check instance status in AWS console, review s...",Low,16 hours,"Typical configuration error, outdated software...","cloud computing, aws, instance stuck in 'pendi...","حل: Check instance status in AWS console, revi...",https://docs.aws.amazon.com/AWSEC2/latest/User...,AWS instance not starting - Variant 25 Instanc...,"Check instance status in AWS console, review s...",Cloud Computing,Low
3,4,Cloud Computing,AWS instance not starting - Variant 5,Instance stuck in 'pending' state,"Check instance status in AWS console, review s...",High,33 hours,"Typical configuration error, outdated software...","cloud computing, aws, instance stuck in 'pendi...","حل: Check instance status in AWS console, revi...",https://docs.aws.amazon.com/AWSEC2/latest/User...,AWS instance not starting - Variant 5 Instance...,"Check instance status in AWS console, review s...",Cloud Computing,High
4,5,Cloud Computing,AWS instance not starting - Variant 48,Instance stuck in 'pending' state,"Check instance status in AWS console, review s...",Low,7 hours,"Typical configuration error, outdated software...","cloud computing, aws, instance stuck in 'pendi...","حل: Check instance status in AWS console, revi...",https://docs.aws.amazon.com/AWSEC2/latest/User...,AWS instance not starting - Variant 48 Instanc...,"Check instance status in AWS console, review s...",Cloud Computing,Low


### 4] nvidia/TechQA-RAG-Eval

In [19]:
df_hf_techqa = hf_datasets["techqa"]["train"].to_pandas()
df_hf_techqa["text"] = df_hf_techqa["question"].apply(clean_text)
df_hf_techqa["answer"] = df_hf_techqa["answer"].apply(clean_text)
df_hf_techqa["category"] = "Technical"
df_hf_techqa["priority"] = None


In [20]:
df_hf_techqa = df_hf_techqa[~df_hf_techqa["answer"].isin(["-"])]

In [21]:
len(df_hf_techqa)

610

In [22]:
df_hf_techqa.isnull().sum()

id                 0
question           0
answer             0
is_impossible      0
contexts           0
text               0
category           0
priority         610
dtype: int64

In [23]:
df_hf_techqa.head()

Unnamed: 0,id,question,answer,is_impossible,contexts,text,category,priority
0,TRAIN_Q000,User environment variables no longer getting p...,"To work around the issue, set environment vari...",False,"[{'filename': 'swg21996508.txt', 'text': 'Titl...",User environment variables no longer getting p...,Technical,
1,TRAIN_Q001,Netcool/Impact (all versions): How is the Exit...,This is because the Exit() parser function in ...,False,"[{'filename': 'swg21675316.txt', 'text': 'Titl...",Netcool/Impact (all versions): How is the Exit...,Technical,
3,TRAIN_Q003,How to configure SSL mutual authentication in ...,The following steps help guide you through the...,False,"[{'filename': 'swg21179559.txt', 'text': 'Titl...",How to configure SSL mutual authentication in ...,Technical,
5,TRAIN_Q005,What happened to load.rules FAQ example?\n\nTh...,Netcool Technical Support Guide to rules file ...,False,"[{'filename': 'swg21903536.txt', 'text': 'Titl...",What happened to load.rules FAQ example? The l...,Technical,
6,TRAIN_Q006,Is ITNM exposed to Apache CXF vulnerability (C...,CVEID: CVE-2017-3156 [http://cve.mitre.org/cgi...,False,"[{'filename': 'swg22008493.txt', 'text': 'Titl...",Is ITNM exposed to Apache CXF vulnerability (C...,Technical,


### 5] Talhat/Customer_IT_Support

In [24]:
df_hf_talhat = hf_datasets["customer_it_support"]["train"].to_pandas()
df_hf_talhat["text"] = df_hf_talhat["body"].apply(clean_text)
df_hf_talhat["answer"] = df_hf_talhat["answer"].apply(clean_text)
df_hf_talhat["category"] = df_hf_talhat["queue"]
df_hf_talhat["priority"] = None


In [25]:
len(df_hf_talhat)

1112

In [26]:
df_hf_talhat.isnull().sum()

body           0
answer         0
type           0
queue          0
text           0
category       0
priority    1112
dtype: int64

In [27]:
df_hf_talhat.head()

Unnamed: 0,body,answer,type,queue,text,category,priority
0,"Hello Customer Support,\n\nI am writing to exp...",Subject: Re: Issues with HP DeskJet 3755 Print...,Incident,Product Support,"Hello Customer Support, I am writing to expres...",Product Support,
1,"Hello Customer Support,\n\nI hope this message...",Subject: Re: Request to Address Billing Statem...,Request,Billing and Payments,"Hello Customer Support, I hope this message fi...",Billing and Payments,
2,"Dear IT Consulting Firm Support, \n\nOur clien...","Dear Customer, Thank you for reaching out. We ...",Incident,Customer Service,"Dear IT Consulting Firm Support, Our client, ,...",Customer Service,
3,"Hello Tech Online Store Support Team,\n\nI am ...","Hi, Thank you for contacting us about the conn...",Request,Customer Service,"Hello Tech Online Store Support Team, I am rea...",Customer Service,
4,"Dear Customer Support Team,\n\nI am writing to...","Dear , Thank you for reaching out regarding th...",Incident,Technical Support,"Dear Customer Support Team, I am writing to ex...",Technical Support,


### 6] gorkemsevinc/customer_support_tickets

In [28]:
df_hf_gorkem = hf_datasets["customer_support2"]["train"].to_pandas()

df_hf_gorkem["text"] = df_hf_gorkem["Combined Text"].apply(clean_text)
df_hf_gorkem["answer"] = None
df_hf_gorkem["category"] = df_hf_gorkem["Ticket Type"]
df_hf_gorkem["priority"] = df_hf_gorkem["Ticket Priority"]

In [29]:
len(df_hf_gorkem)

8469

In [30]:
df_hf_gorkem.isnull().sum()

Customer Email          0
Product Purchased       0
Ticket Type             0
Ticket Subject          0
Combined Text           0
Ticket Priority         0
text                    0
answer               8469
category                0
priority                0
dtype: int64

In [31]:
df_hf_gorkem.head()

Unnamed: 0,Customer Email,Product Purchased,Ticket Type,Ticket Subject,Combined Text,Ticket Priority,text,answer,category,priority
0,carrollallison@example.com,gopro hero,technical issue,product setup,i'm having an issue with the gopro hero. pleas...,critical,i'm having an issue with the gopro hero. pleas...,,technical issue,critical
1,clarkeashley@example.com,lg smart tv,technical issue,peripheral compatibility,i'm having an issue with the lg smart tv. plea...,critical,i'm having an issue with the lg smart tv. plea...,,technical issue,critical
2,gonzalestracy@example.com,dell xps,technical issue,network problem,i'm facing a problem with my dell xps. the del...,low,i'm facing a problem with my dell xps. the del...,,technical issue,low
3,bradleyolson@example.org,microsoft office,billing inquiry,account access,i'm having an issue with the microsoft office....,low,i'm having an issue with the microsoft office....,,billing inquiry,low
4,bradleymark@example.com,autodesk autocad,billing inquiry,data loss,i'm having an issue with the autodesk autocad....,low,i'm having an issue with the autodesk autocad....,,billing inquiry,low


# Preparing CSV Datasets 

### 1] data/Troubleshooting Dialogue/tech_support_dataset.csv

In [32]:
df_local_trouble = local_dfs["tech_support_dialogue"]
df_local_trouble["text"] = df_local_trouble["Customer_Issue"].apply(clean_text)
df_local_trouble["answer"] = df_local_trouble["Tech_Response"].apply(clean_text)
df_local_trouble["category"] = None
df_local_trouble["priority"] = None 


In [33]:
len(df_hf_trouble)

10500

In [34]:
df_hf_trouble.isnull().sum()

ID                           0
Category                     0
Issue                        0
Symptoms                     0
Solution Steps               0
Severity                     0
Estimated Resolution Time    0
Common Causes                0
Keywords                     0
Urdu Solution                0
Documentation Link           0
text                         0
answer                       0
category                     0
priority                     0
dtype: int64

In [35]:
df_local_trouble.head()

Unnamed: 0,Conversation_ID,Customer_Issue,Tech_Response,Resolution_Time,Issue_Category,Issue_Status,text,answer,category,priority
0,CONV-0001,Cannot connect to Wi-Fi,Clear cache and remove unnecessary programs.,92 minutes,Software,Pending,Cannot connect to Wi-Fi,Clear cache and remove unnecessary programs.,,
1,CONV-0002,Software installation failure,Reinstall the printer drivers.,76 minutes,Account,Pending,Software installation failure,Reinstall the printer drivers.,,
2,CONV-0003,Cannot connect to Wi-Fi,Clear cache and remove unnecessary programs.,50 minutes,Network,Resolved,Cannot connect to Wi-Fi,Clear cache and remove unnecessary programs.,,
3,CONV-0004,Forgot password,Reset your password using the link provided.,97 minutes,Performance,Pending,Forgot password,Reset your password using the link provided.,,
4,CONV-0005,Software installation failure,Follow the software installation guide.,110 minutes,Performance,Pending,Software installation failure,Follow the software installation guide.,,


### 2] data/Routing Engine/all_tickets_processed_improved_v3.csv

In [36]:
df_local_routing = local_dfs["routing_tickets"]
df_local_routing["text"] = df_local_routing["Document"].apply(clean_text)
df_local_routing["answer"] = None
df_local_routing["category"] = df_local_routing["Topic_group"]
df_local_routing["priority"] = None


In [37]:
len(df_local_routing)

47837

In [38]:
df_local_routing.isnull().sum()

Document           0
Topic_group        0
text               0
answer         47837
category           0
priority       47837
dtype: int64

In [39]:
df_local_routing.head()

Unnamed: 0,Document,Topic_group,text,answer,category,priority
0,connection with icon icon dear please setup ic...,Hardware,connection with icon icon dear please setup ic...,,Hardware,
1,work experience user work experience user hi w...,Access,work experience user work experience user hi w...,,Access,
2,requesting for meeting requesting meeting hi p...,Hardware,requesting for meeting requesting meeting hi p...,,Hardware,
3,reset passwords for external accounts re expir...,Access,reset passwords for external accounts re expir...,,Access,
4,mail verification warning hi has got attached ...,Miscellaneous,mail verification warning hi has got attached ...,,Miscellaneous,


### 3] data/Routing Engine/IT Support Ticket Data.csv

In [40]:
df_local_dept = local_dfs["it_support_ticket_data"]
df_local_dept["text"] = df_local_dept["Body"].apply(clean_text)
df_local_dept["answer"] = None
df_local_dept["category"] = df_local_dept["Department"]
df_local_dept["priority"] = df_local_dept["Priority"]


In [41]:
len

<function len(obj, /)>

In [42]:
df_local_dept.isnull().sum()

Unnamed: 0        0
Body              1
Department        0
Priority          0
Tags              0
text              0
answer        29651
category          0
priority          0
dtype: int64

In [43]:
df_local_dept.head()

Unnamed: 0.1,Unnamed: 0,Body,Department,Priority,Tags,text,answer,category,priority
0,0,"Dear Customer Support Team,I am writing to rep...",Technical Support,high,"['Account', 'Disruption', 'Outage', 'IT', 'Tec...","Dear Customer Support Team,I am writing to rep...",,Technical Support,high
1,1,"Dear Customer Support Team,I hope this message...",Returns and Exchanges,medium,"['Product', 'Feature', 'Tech Support']","Dear Customer Support Team,I hope this message...",,Returns and Exchanges,medium
2,2,"Dear Customer Support Team,I hope this message...",Billing and Payments,low,"['Billing', 'Payment', 'Account', 'Documentati...","Dear Customer Support Team,I hope this message...",,Billing and Payments,low
3,3,"Dear Support Team,I hope this message reaches ...",Sales and Pre-Sales,medium,"['Product', 'Feature', 'Feedback', 'Tech Suppo...","Dear Support Team,I hope this message reaches ...",,Sales and Pre-Sales,medium
4,4,"Dear Customer Support,I hope this message reac...",Technical Support,high,"['Feature', 'Product', 'Documentation', 'Feedb...","Dear Customer Support,I hope this message reac...",,Technical Support,high


# Final Combined 

In [44]:
df_all = pd.concat([
    df_hf_cust,
    df_hf_helpdesk,
    df_hf_trouble,
    df_hf_techqa,
    df_hf_gorkem,
    df_hf_talhat,
    df_local_trouble,
    df_local_routing,
    df_local_dept
], ignore_index=True)

df_all = df_all[["text", "answer", "category", "priority"]]
df_all = df_all.dropna(subset=["text"])


In [45]:
len(df_all)

128836

In [46]:
df_all.isnull().sum()

text            0
answer      86463
category     1896
priority    51455
dtype: int64

In [47]:
df_all.head()

Unnamed: 0,text,answer,category,priority
0,"Account Disruption Dear Customer Support Team,...","Thank you for reaching out, <name>. We are awa...",Technical Support,high
1,Query About Smart Home System Integration Feat...,Thank you for your inquiry. Our products suppo...,Returns and Exchanges,medium
2,Inquiry Regarding Invoice Details Dear Custome...,We appreciate you reaching out with your billi...,Billing and Payments,low
3,Question About Marketing Agency Software Compa...,Thank you for your inquiry. Our product suppor...,Sales and Pre-Sales,medium
4,"Feature Query Dear Customer Support,\n\nI hope...",Thank you for your inquiry. Please specify whi...,Technical Support,high


In [48]:
output_file = "prepared/lm_corpus.txt"



with open(output_file, "w", encoding="utf8") as f:
    for _, row in df_all.iterrows():
        # Start with the Question/Input text
        full_text = row["text"].strip()
        
        # Append the Answer to the SAME line
        if isinstance(row["answer"], str) and len(row["answer"]) > 1:
            full_text += " " + row["answer"].strip()
            
        f.write(full_text + "\n")

In [49]:
df_class = df_all[df_all["category"].notna()]
df_class.to_csv("prepared/classification_dataset.csv", index=False)

In [50]:
import json
import pandas as pd

CMD_CHAT = "Respond as an IT support assistant."
CMD_CLASSIFY = "Categorize the IT support ticket."
CMD_PRIORITY = "Determine if the ticket is high, medium, or low priority."

PRIO_MAP = {
    "critical": "high",
    "urgent": "high",
    "high": "high",
    "medium": "medium",
    "moderate": "medium",
    "low": "low",
    "minimal": "low"
}

instr_rows = []

for _, row in df_all.iterrows():
    text = row.get("text", "")
    answer = row.get("answer", None)
    category = row.get("category", None)
    priority = row.get("priority", None)

    if isinstance(answer, str) and len(answer.strip()) > 1:
        instr_rows.append({
            "instruction": CMD_CHAT,
            "input": text,
            "output": answer
        })

    if pd.notna(category):
        instr_rows.append({
            "instruction": CMD_CLASSIFY,
            "input": text,
            "output": str(category)
        })

    if pd.notna(priority):

        raw_prio = str(priority).lower().strip().replace(".", "")
        final_prio = PRIO_MAP.get(raw_prio, raw_prio) 
        
        instr_rows.append({
            "instruction": CMD_PRIORITY,
            "input": text,
            "output": final_prio
        })

output_file = "prepared/instruction_dataset.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for r in instr_rows:
        f.write(json.dumps(r) + "\n")

print(f"Created standardized dataset: {output_file}")
print(f"Total Rows: {len(instr_rows)}")
print("Instructions are now FIXED (No random variations).")
print("Priorities are now NORMALIZED (high/medium/low).")

Created standardized dataset: prepared/instruction_dataset.jsonl
Total Rows: 246692
Instructions are now FIXED (No random variations).
Priorities are now NORMALIZED (high/medium/low).


In [57]:
import os
import json

OUTPUT_FILE = "prepared/kb_chunks.jsonl"
os.makedirs("prepared", exist_ok=True)

def is_valid(question, answer, category):
    if not isinstance(question, str) or not isinstance(answer, str):
        return False
    if len(question.strip()) < 10 or len(answer.strip()) < 20:
        return False
    if category is None or str(category).strip().lower() in ["none", "nan", ""]:
        return False
    return True

MAX_LEN = 1200
entries = []

for _, row in df_all.iterrows():
    q = str(row.get("text", "")).strip()
    a = str(row.get("answer", "")).strip()
    category = str(row.get("category", "")).strip()

    if not is_valid(q, a, category):
        continue

    # Build the combined example
    combined = f"User:\n{q}\n\nAssistant:\n{a}"

    # If short enough → no splitting at all
    if len(combined) <= MAX_LEN:
        entries.append({
            "text": combined,
            "ticket_type": category,
            "type": "ticket_resolution"
        })
        continue

    # If too long → split ONLY by paragraphs, preserving structure
    paragraphs = combined.split("\n\n")
    buffer = ""

    for p in paragraphs:
        if len(buffer) + len(p) < MAX_LEN:
            buffer += p + "\n\n"
        else:
            # Only add if buffer contains BOTH User & Assistant
            if "User:" in buffer and "Assistant:" in buffer:
                entries.append({
                    "text": buffer.strip(),
                    "ticket_type": category,
                    "type": "ticket_resolution"
                })
            buffer = p + "\n\n"

    # Add the remaining buffer
    if buffer.strip() and "User:" in buffer and "Assistant:" in buffer:
        entries.append({
            "text": buffer.strip(),
            "ticket_type": category,
            "type": "ticket_resolution"
        })

# Write to JSONL
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for e in entries:
        f.write(json.dumps(e) + "\n")

print("KB generated:", len(entries))


KB generated: 37525
