In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import re

In [3]:
# VADER needs its lexicon on disk before the analyzer can be constructed,
# so fetch it first (NLTK reports "already up-to-date" when it is cached).
nltk.download("vader_lexicon")

# Shared analyzer instance used by the sentiment-scoring cell below.
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/chehakarora/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Load the training conversations; the file is resolved relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='trainSA.csv')

In [5]:
# Quick sanity check: heading followed by the first five rows of the frame.
print("Dataset Overview:", data.head(), sep="\n")

Dataset Overview:
                                                text      category
0                     I am still waiting on my card?  card_arrival
1  What can I do if my card still hasn't arrived ...  card_arrival
2  I have been waiting over a week. Is the card s...  card_arrival
3  Can I track my card while it is in the process...  card_arrival
4  How do I know if I will get my card, or if it ...  card_arrival


In [6]:
# Preprocess text data - standardize text: lowercase, drop punctuation such as ? and !,
# and get rid of extra spaces.
def preprocess_text(text):
    """Return a normalized copy of ``text`` suitable for vectorization.

    Steps, in order:
      1. lowercase the input;
      2. delete every character that is not an ASCII letter, digit or whitespace;
      3. collapse each run of whitespace to a single space and trim both ends.

    Whitespace is normalized *after* the character removal: deleting punctuation
    can itself leave stray spaces behind (``"card ?"`` -> ``"card "``), so
    stripping first would not actually remove them.

    Parameters
    ----------
    text : object
        The raw cell value. Anything that is not a ``str`` (for example the
        float NaN pandas uses for missing cells) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())  # Remove special characters
    return re.sub(r'\s+', ' ', text).strip()  # Collapse and trim leftover whitespace

# Add a cleaned copy of every message alongside the raw text column.
data['clean_text'] = [preprocess_text(raw_message) for raw_message in data['text']]

In [None]:
# VADER sentiment analysis.
#
# Each preprocessed message in `clean_text` is scored with the shared analyzer;
# only the 'compound' entry of the returned score dict (the overall sentiment
# score) is kept.
data['sentiment_score'] = data['clean_text'].map(
    lambda message: sia.polarity_scores(message)['compound']
)


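MultiLabelBinarizer converts the labels into a binary indicator matrix in which each unique category gets its own 0/1 column.
The text is converted to numbers through TF-IDF vectorization, which weights each word by its importance: words that are frequent in a message but rare across the dataset receive higher weights.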

In [26]:
# Convert intents into multi-label format.
#
# MultiLabelBinarizer iterates over every sample to collect its labels, so each
# sample must be a collection of labels. The `category` column loaded from the
# CSV holds one plain string per row; passing it as-is would split every string
# into characters and produce one column per distinct character instead of one
# per category. Wrap scalar labels in a one-element list first; values that are
# already collections are passed through unchanged so the cell is safe to re-run.
mlb = MultiLabelBinarizer()
category_labels = data['category'].apply(
    lambda label: list(label) if isinstance(label, (list, tuple, set)) else [label]
)
category_features = mlb.fit_transform(category_labels)

In [30]:
# Vectorize the cleaned messages with TF-IDF, capped at 500 vocabulary terms,
# and densify the result so it can be combined with the category matrix.
tfidf = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf.fit_transform(data['clean_text'])
text_features = tfidf_matrix.toarray()

# Place the text columns and the category columns side by side: `features`
# holds both the text-based and the category-based numerical representations.
features = np.hstack([text_features, category_features])
print(features)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Isolation Forest is an unsupervised anomaly detection algorithm. It isolates observations by randomly partitioning the dataset and assigns each one an anomaly score; points that can be isolated with fewer partitions are more likely to be anomalies.

In [34]:
# Isolation Forest: 100 trees, 5% of rows expected to be anomalous, and a
# fixed seed so repeated runs flag the same rows.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42,
)

# Fit on the combined feature matrix; fit_predict labels each row
# -1 (anomaly) or 1 (normal).
data['anomaly'] = iso_forest.fit_predict(features)

# Human-readable version of the numeric label.
data['anomaly_flag'] = np.where(data['anomaly'] == -1, 'Anomalous', 'Normal')

# Report only the rows that were flagged.
is_anomalous = data['anomaly_flag'] == 'Anomalous'
flagged_rows = data[is_anomalous]
print("\nAnomalous Conversations:")
print(flagged_rows[['text', 'category', 'anomaly_flag']])



Anomalous Conversations:
                                                   text  \
19    It's been a week since you issued me a card an...   
159         Is there any way to see my card in the app?   
170   I found my lost card in my jacket pocket this ...   
173   I would like to re-activate my card, it was pr...   
195    I found my lost card. Am I still able to use it?   
...                                                 ...   
9486  I used my card the other day to get cash from ...   
9598  I want to get a new card before my current one...   
9709  I have received my American express in apple p...   
9756  i have been trying to add money with apple pay...   
9996  I live in the US and I would like to get a car...   

                       category anomaly_flag  
19               [card_arrival]    Anomalous  
159              [card_linking]    Anomalous  
170              [card_linking]    Anomalous  
173              [card_linking]    Anomalous  
195              [card_linking

In [36]:
# Persist the full frame, including every column added by earlier cells,
# without writing the row index as an extra column.
data.to_csv(path_or_buf='flagged_conversations.csv', index=False)




Anomaly detection completed! Results saved to 'flagged_conversations.csv'
