In [1]:
import re
import pandas as pd
import nltk
# Make sure the NLTK 'punkt' package is present locally (nltk skips the
# download when the package is already up to date).
# NOTE(review): no visible cell in this notebook uses nltk — confirm this
# download is still needed.
nltk.download('punkt')

def clean_text(text):
    """Normalize a transcript string for downstream processing.

    Keeps word characters, whitespace and the punctuation ``. , ! ?``;
    every other character is removed. Runs of whitespace are then collapsed
    to a single space and the result is trimmed.

    Args:
        text: The raw transcript. Non-string values (e.g. ``None`` or the
            ``NaN`` pandas produces for empty CSV fields) are treated as
            empty text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    # `\w` (not a bare `w`): keep all word characters, not just the letter "w".
    text = re.sub(r'[^\w\s.,!?]', '', text)
    # Collapse whitespace *after* stripping symbols so that removing a
    # free-standing symbol does not leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def preprocess_df(df):
    """Add a cleaned version of every transcript to the frame.

    Runs ``clean_text`` over each value of the ``Transcript_Text`` column and
    stores the results in a ``cleaned_df`` column. The passed-in frame is
    modified in place and is also the return value.
    """
    cleaned_transcripts = df['Transcript_Text'].apply(clean_text)
    df['cleaned_df'] = cleaned_transcripts
    return df

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/devarshimahajan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from transformers import pipeline

def load_sentiment_pipeline(model="distilbert-base-uncased-finetuned-sst-2-english"):
    """Build a Hugging Face ``sentiment-analysis`` pipeline.

    Args:
        model: Model identifier passed through to ``pipeline``. Defaults to
            the DistilBERT SST-2 checkpoint this notebook has always used, so
            calling with no arguments behaves exactly as before.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    return pipeline("sentiment-analysis", model=model)

def get_sentiment(pipe, text):
    """Score ``text`` with ``pipe`` and return ``(label, score)``.

    Only the first 512 elements of ``text`` (characters, for a string) are
    handed to ``pipe``. The first entry of the pipeline's result supplies the
    label and score.
    """
    snippet = text[:512]
    top_prediction = pipe(snippet)[0]
    return top_prediction['label'], top_prediction['score']

In [None]:
import openai
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Read the key from the environment rather than embedding it in the notebook:
# a literal key is leaked as soon as the notebook is shared or committed.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

# Shared OpenAI client used by the functions defined in later cells.
client = openai.OpenAI(api_key=api_key)

def classify_feedback(transcript):
    """Ask the chat model to categorize a support transcript.

    The transcript is embedded in a prompt that requests the feedback type,
    any product or service mentioned, and a short reason, answered as JSON
    with the keys ``type``, ``product`` and ``reason``.

    Returns the message content of the first choice exactly as the API
    returned it; no parsing or validation happens here.
    """
    prompt = f""" You are a Customer Support Analyzer.

    Transcript: "{transcript}"

    1. Classify the type of feedback: Complaint, Compliment, Suggestion or Neutral
    2. Identify Service or Product if mentioned
    3. Provide reason in 2-3 words and make it clear and simple

    Respond in JSON format with keys: type, product, reason.

    """

    # Single-turn conversation: the whole request is one user message.
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation,
        temperature=0.3,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


    

In [47]:
def summarize_call(transcript):
    """Ask the chat model for a 1-2 sentence summary of a call transcript.

    Returns the first choice's message content with leading and trailing
    whitespace removed.
    """
    prompt = f"""
    you are a helpful assistant. Summarize this call in 1-2 sentences.

    Transcript: "{transcript}"

    Summary:
    """

    # Single-turn conversation: the whole request is one user message.
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=conversation,
        temperature=0.3,
    )

    summary_text = completion.choices[0].message.content
    return summary_text.strip()

In [43]:
import pandas as pd
from tqdm import tqdm
import json 


# Values recorded when the model's reply cannot be used as a JSON object.
_FEEDBACK_PARSE_ERROR = {'type': 'unknown', 'reason': 'Parsing Error', 'product': 'N/A'}
# Per-key defaults for a valid JSON object that omits one of the expected keys.
_FEEDBACK_MISSING = {'type': 'unknown', 'reason': 'N/A', 'product': 'N/A'}


def _parse_feedback(feedback):
    """Turn the classifier's raw reply into a dict with type/reason/product."""
    try:
        parsed = json.loads(feedback)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers a reply
        # that is not str/bytes (e.g. None).
        return dict(_FEEDBACK_PARSE_ERROR)
    if not isinstance(parsed, dict):
        # Valid JSON, but not an object (e.g. a list or a bare string).
        return dict(_FEEDBACK_PARSE_ERROR)
    return {key: parsed.get(key, default) for key, default in _FEEDBACK_MISSING.items()}


def run_pipeline(input_csv, output_csv):
    """Annotate every transcript in a CSV and write the result to a new CSV.

    For each value in the ``Transcript_Text`` column this computes a
    sentiment label and score, a feedback classification (type, reason,
    product) and a summary, then stores them in the columns ``sentiment``,
    ``sentiment_score``, ``feedback_type``, ``feedback_reason``,
    ``feedback_product`` and ``summary``.

    A classification reply that is not a JSON object is recorded as
    ``unknown`` / ``Parsing Error`` / ``N/A``. A JSON object that lacks one
    of the expected keys gets a default for that key instead of aborting the
    whole run with a ``KeyError``.

    Args:
        input_csv: Path of the CSV to read; must contain ``Transcript_Text``.
        output_csv: Path the annotated CSV is written to (without the index).
    """
    df = pd.read_csv(input_csv)
    df = preprocess_df(df)
    # NOTE(review): the loop below reads the raw 'Transcript_Text' values,
    # not the 'cleaned_df' column added by preprocess_df — confirm intended.

    sentiment_pipe = load_sentiment_pipeline()

    # Output column name -> per-row values, in the order columns are added.
    annotations = {
        'sentiment': [],
        'sentiment_score': [],
        'feedback_type': [],
        'feedback_reason': [],
        'feedback_product': [],
        'summary': [],
    }

    for txt in tqdm(df['Transcript_Text']):
        sentiment_label, score = get_sentiment(sentiment_pipe, txt)
        annotations['sentiment'].append(sentiment_label)
        annotations['sentiment_score'].append(score)

        fb_json = _parse_feedback(classify_feedback(txt))
        annotations['feedback_type'].append(fb_json['type'])
        annotations['feedback_reason'].append(fb_json['reason'])
        annotations['feedback_product'].append(fb_json['product'])

        annotations['summary'].append(summarize_call(txt))

    for column, values in annotations.items():
        df[column] = values

    df.to_csv(output_csv, index=False)
    print(f"Annotated data saved to {output_csv}")

In [51]:
import torch

# Source and destination files for this run.
input_csv, output_csv = "input_transcript.csv", "output_transcript.csv"

# Annotate every transcript in the input file and write the output file.
run_pipeline(input_csv=input_csv, output_csv=output_csv)

Device set to use mps:0
100%|██████████| 25/25 [01:27<00:00,  3.50s/it]

Annotated data saved to output_transcript.csv



