# **Step 1: Import necessary libraries**

In [None]:
!pip install transformers datasets
from ctypes import sizeof
from transformers import pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import login
import re
from datasets import Dataset, load_metric
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from tqdm import tqdm
from transformers import pipeline, logging
import os

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

# **Step 2: Global variables and token setup**

In [None]:
# Names shared by the cells of this notebook. At module level a `global`
# statement only declares the names; it does not bind any value to them.
global sizeOfData, token_ID, model_id

In [None]:
# Read the Hugging Face access token from the environment instead of embedding
# it in the notebook: a token saved in the notebook is exposed to every reader.
# SECURITY: the token that used to be hard-coded on this line must be treated as
# leaked and revoked at https://huggingface.co/settings/tokens.
# If HF_TOKEN is unset, token_ID is None (huggingface_hub's login() then asks
# for the token interactively).
token_ID = os.environ.get("HF_TOKEN")
model_id = "bitext/Mistral-7B-Customer-Support"  # Text-generation model used to write the tickets
sizeOfData = 1000  # Total size of the synthetic data

# **Step 3: Login and generator setup**

In [None]:
# Suppress the warning messages
logging.set_verbosity_error()

# Login to Huggingface and setup the text generation pipeline
login(token=token_ID)

# Requesting "cuda" unconditionally raises "RuntimeError: Found no NVIDIA driver
# on your system" on machines without a GPU, so choose the device at runtime.
generation_device = "cuda" if torch.cuda.is_available() else "cpu"
if generation_device == "cpu":
    print("No CUDA device detected: loading the model on CPU; generation will be very slow.")
generator = pipeline("text-generation", model=model_id, device_map=generation_device)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

# **Step 4: Define categories and helper functions**

In [None]:
# Sentiment categories ordered from most negative to most positive; each one is
# mapped to its position in this ordering, which serves as the integer label.
_sentiment_names = (
    'Strong Negative',
    'Mild Negative',
    'Neutral',
    'Mild Positive',
    'Strong Positive',
)
categories = {name: label for label, name in enumerate(_sentiment_names)}

# Function to generate ticket text
def generate_ticket(prompt, max_new_tokens=150):
    """Generate one sampled ticket for ``prompt`` with the global ``generator`` pipeline.

    Args:
        prompt: Instruction text passed to the text-generation pipeline.
        max_new_tokens: Maximum number of tokens generated after the prompt.
            The earlier ``max_length=100`` limit also counted the prompt's own
            tokens, which left too few tokens for the ~100-word ticket that
            the prompt asks for.

    Returns:
        The ``generated_text`` field of the first (and only) returned sequence.
    """
    response = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1, do_sample=True)
    return response[0]['generated_text']

# Function to clean the generated text
def clean_text(text, prompt):
    """Strip the prompt from ``text`` and trim surrounding whitespace.

    Every occurrence of ``prompt`` is removed; the match is literal (regex
    metacharacters in the prompt are escaped) and case-insensitive.

    Args:
        text: Raw text returned by the generator.
        prompt: Prompt that was sent to the generator.

    Returns:
        ``text`` without the prompt and without leading/trailing whitespace.
    """
    prompt_pattern = re.compile(re.escape(prompt), re.IGNORECASE)
    without_prompt = prompt_pattern.sub('', text)
    return without_prompt.strip()

# **Step 5: Generate synthetic data**

In [None]:
# Generate synthetic data
data = []

# Every category gets the same number of tickets. The count is derived from the
# number of categories (instead of a hard-coded 5) so the two cannot drift apart.
tickets_per_category = sizeOfData // len(categories)
total_iterations = len(categories) * tickets_per_category

# One progress bar tracks the generation across all categories
with tqdm(total=total_iterations, desc="Generating tickets", unit="ticket", leave=True) as pbar:
    for category, label in categories.items():
        for _ in range(tickets_per_category):
            prompt = f"Imagine you are a customer and in about 100 words write a ticket with a {category} sentiment. Output only a message:"
            ticket = generate_ticket(prompt)
            # The generated text is passed through clean_text to strip the prompt
            cleaned_ticket = clean_text(ticket, prompt)
            data.append({'ticket': cleaned_ticket, 'sentiment': category, 'label': label})
            pbar.update(1)

# Convert to DataFrame
df = pd.DataFrame(data)

In [None]:
# Split the data into train (70%), test (20%) and validation (10%) sets.
# Both splits are stratified on the label and seeded so that re-running the
# notebook reproduces the same partition.
SPLIT_SEED = 42
train, holdout = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=SPLIT_SEED)
# One third of the 30% hold-out becomes the validation set, the rest is the test set
test, validation = train_test_split(holdout, test_size=1/3, stratify=holdout['label'], random_state=SPLIT_SEED)

# Define the path for the data folder
data_folder = 'data'

# Create the data folder if it doesn't exist
if not os.path.exists(data_folder):
    os.makedirs(data_folder, exist_ok=True)
    print(f"'{data_folder}' folder created successfully.")
else:
    print(f"'{data_folder}' folder already exists.")

# Save the datasets to CSV files inside data_folder (the paths are built from
# the variable so that changing data_folder also changes where files are written)
train.to_csv(os.path.join(data_folder, 'train_data.csv'), index=False)
test.to_csv(os.path.join(data_folder, 'test_data.csv'), index=False)
validation.to_csv(os.path.join(data_folder, 'validation_data.csv'), index=False)
print("Synthetic data Generated")

# **Explanation of Choices Made During Data Generation**



1. **Size of the Data Set**: `sizeOfData` was set to 1000, which is divided evenly into 200 tickets for each of the five sentiment categories. This ensures that the model is trained on the same number of examples from every category.

2. **Batching**: The data was generated in batches corresponding to each sentiment category. This approach ensures that each category has an equal representation in the dataset, which is crucial for training a balanced model.

3. **Deduplication**: Deduplication was not implemented. The cleaning function `clean_text` only removes the prompt that the model echoes at the start of each generated text; it does not detect or remove duplicate tickets, so identical tickets may remain in the dataset.

4. **Avoiding Overfitting and Maximizing Generalization**:
   - The text generation model (`bitext/Mistral-7B-Customer-Support`) was used with `do_sample=True` to introduce randomness in the generated text, ensuring diversity.
   - The data was split into training, testing, and validation sets with stratified sampling to maintain the label distribution across splits. This helps in evaluating the model’s generalization capability on unseen data.

5. **Cleaning Process**: The `clean_text` function removes the echoed prompt and any leading or trailing whitespace from each generated text, resulting in cleaner data that better represents real-world customer tickets.

6. **Model Selection**: The `Mistral-7B-Customer-Support` model was selected due to its balance between performance and computational efficiency, making it capable of generating substantial amounts of text quickly while maintaining high-quality output. Additionally, this model is specifically fine-tuned for customer support scenarios, ensuring that the generated text closely aligns with real-world customer interactions. Its ability to handle the nuances of customer sentiment and provide contextually relevant responses makes it particularly well-suited for generating customer ticket data.

7. **Device Utilization**: The model was run on CUDA (GPU) to speed up the text generation process.

8. **Data Storage**: The generated data is stored in separate CSV files (`train_data.csv`, `test_data.csv`, and `validation_data.csv`) for ease of access and further processing.

By carefully managing the size of the data, using appropriate generation and cleaning techniques, and ensuring a balanced split, the goal was to create a robust dataset that the model can effectively learn from while avoiding overfitting.
"""