In [3]:
!pip install nltk



In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re

In [8]:
# Corpora/models needed by preprocess_transcripts (stop-word list + tokenizer).
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# 'punkt'; fetching both keeps word_tokenize working across versions.
nltk.download('punkt_tab')

try:
    df = pd.read_csv('/content/calls.csv', escapechar='\\')
except pd.errors.ParserError as e:
    print(f"Error reading CSV: {e}")
    # The parser reports the location either as "... in line N, saw M" or as
    # "... starting at row N", so accept both spellings instead of assuming one.
    location = re.search(r'(?:row|line) (\d+)', str(e))
    if location:
        line_number = int(location.group(1))
        print(f"Problematic line number: {line_number}")
        # errors='replace' so that a badly encoded byte cannot make the
        # diagnostic itself fail while we are hunting for the bad line.
        with open('/content/calls.csv', 'r', encoding='utf-8', errors='replace') as f:
            for i, line in enumerate(f):
                if i == line_number - 1:
                    print(f"Problematic line: {line}")
                    break
    # Without a frame every following cell would die with a NameError on `df`;
    # re-raise so the notebook stops at the real cause.
    raise

_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words, reading the corpus only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_transcripts(text):
    """Normalise one call transcript into a space-joined string of tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, the remainder is tokenised with
    ``nltk.word_tokenize`` and English stop words are dropped.

    Args:
        text: The raw transcript. Non-string values (for example the NaN that
            pandas uses for a missing cell, or None) are treated as empty.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when the input is
        not a string or nothing survives filtering.
    """
    # A missing cell reaches .apply() as a float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Cached: rebuilding the set from the corpus for every row is pure overhead.
    stop_words = _english_stop_words()
    tokens = [word for word in nltk.word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Build the cleaned-text column used by the vectorizer and keyword cells.
if 'call_transcript' in df.columns:
    df['processed_transcript'] = df['call_transcript'].apply(preprocess_transcripts)
elif 'processed_transcript' not in df.columns:
    # Silently skipping here would only defer the failure to a later cell as a
    # bare KeyError; say what is actually missing instead.
    raise KeyError(
        "df has no 'call_transcript' column to preprocess; "
        f"available columns: {list(df.columns)}"
    )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Turn the cleaned transcripts into a TF-IDF document-term matrix. Terms found
# in more than 95% of documents or in fewer than 2 documents are ignored, and
# scikit-learn's built-in English stop-word list is applied as well.
vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(df['processed_transcript'])

# Fit a 5-topic LDA model; the fixed random_state makes the fit repeatable.
# NOTE(review): LDA is formulated for term counts and scikit-learn's own topic
# example feeds it CountVectorizer output - confirm TF-IDF weights are intended.
lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
).fit(tfidf_matrix)

def display_topics(model, feature_names, num_top_words):
    """Print each topic's index followed by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, iterated as one weight vector
            per topic.
        feature_names: Sequence mapping a column index to its term.
        num_top_words: How many terms to print for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading entries to get
        # the heaviest terms first.
        ranked = weights.argsort()[::-1][:num_top_words]
        top_terms = " ".join(feature_names[pos] for pos in ranked)
        print(f"Topic {topic_idx}:")
        print(top_terms)

# Column index -> term lookup for the fitted vectorizer, then print the ten
# heaviest terms of every LDA topic.
tfidf_feature_names = vectorizer.get_feature_names_out()
display_topics(
    model=lda_model,
    feature_names=tfidf_feature_names,
    num_top_words=10,
)


Topic 0:
standby list earlier forecast weather waitlist case chance backup really
Topic 1:
tomorrow sir pm meeting delay frustration issues la apologize delayed
Topic 2:
delay refund voucher experience delays delayed missed travel hours make
Topic 3:
wanted seat check time double london upgrade chicago thanks sure
Topic 4:
change fee work date need monday day typing wednesday okay


In [10]:
import spacy
from collections import Counter

# Load spaCy's 'en_core_web_sm' pipeline once; the object is passed to
# extract_keywords for every transcript rather than being reloaded per call.
nlp = spacy.load('en_core_web_sm')

def extract_keywords(text, nlp):
    """Return the text of every noun chunk found in ``text``.

    Args:
        text: The string to analyse.
        nlp: Callable that turns ``text`` into a document exposing
            ``noun_chunks`` (here, the loaded spaCy pipeline).

    Returns:
        A list with one string per noun chunk, in document order.
    """
    return [span.text for span in nlp(text).noun_chunks]
# Noun chunks for every cleaned transcript, stored as one list per row.
df['keywords'] = df['processed_transcript'].apply(extract_keywords, args=(nlp,))

# Tally the chunks over all rows and show the ten most frequent ones.
keyword_counter = Counter()
for row_keywords in df['keywords']:
    keyword_counter.update(row_keywords)
print(keyword_counter.most_common(10))


[('i', 306118), ('you', 170881), ('flight', 78342), ('anything', 74649), ('agent thank', 72396), ('that', 52868), ('customer', 51902), ('today customer', 40429), ('agent', 39092), ('united airlines', 38967)]


In [12]:
# Rebinds `df` to a different CSV: from here on `df` is whatever df_final.csv
# holds, and the columns added to the previous frame (processed_transcript,
# keywords) are only available if that file also carries them.
df = pd.read_csv('/content/df_final.csv')

In [13]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column: refitting a single shared encoder on the second
# column overwrites its classes_, so the agent-tone codes could no longer be
# mapped back to their labels.
agent_tone_encoder = LabelEncoder()
customer_tone_encoder = LabelEncoder()
df['agent_tone_encoded'] = agent_tone_encoder.fit_transform(df['agent_tone'])
df['customer_tone_encoded'] = customer_tone_encoder.fit_transform(df['customer_tone'])
# `le` used to end up fitted on customer_tone; keep that binding for any later
# cell that still refers to it.
le = customer_tone_encoder

# Calls with negative average sentiment whose customer-tone code exceeds 2,
# broken down by primary call reason.
# NOTE(review): LabelEncoder numbers classes in sorted order, so `> 2` keeps
# whichever tone labels happen to sort after the first three, not necessarily
# the negative ones - confirm against customer_tone_encoder.classes_.
negative_tone = df[(df['average_sentiment'] < 0) & (df['customer_tone_encoded'] > 2)]
print(negative_tone['primary_call_reason'].value_counts())


primary_call_reason
Flight Changes                  6081
Loyalty Program & Membership    1992
Seat Preferences & Upgrades     1629
Post-Flight Issues              1557
Booking and Boarding            1380
Customer Support                1235
Other                            359
Special Services                  66
Name: count, dtype: int64
