In [8]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import re
from sklearn.metrics import silhouette_score

In [10]:
# Ensure NLTK resources are available
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/chehakarora/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [12]:
data_train = pd.read_csv('trainSA.csv')
data_test = pd.read_csv('testSA.csv')

In [14]:
# Display basic info
print("Dataset Overview:")
print(data_train.head())

Dataset Overview:
                                                text      category
0                     I am still waiting on my card?  card_arrival
1  What can I do if my card still hasn't arrived ...  card_arrival
2  I have been waiting over a week. Is the card s...  card_arrival
3  Can I track my card while it is in the process...  card_arrival
4  How do I know if I will get my card, or if it ...  card_arrival


In [16]:
# Preprocess text data - standardize text data gets rid of extra space, ?, !
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower().strip()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
        return text
    return ""

data_train['clean_text'] = data_train['text'].apply(preprocess_text)

In [21]:
#VADER sentiment analysis to get sentiment score 

# Sentiment Analysis - clean text is preprocessed text, VADER to analyze the sentiment. .compound will extract the overall sentiment score. 
sia = SentimentIntensityAnalyzer()
data_train['sentiment_score'] = data_train['clean_text'].apply(lambda x: sia.polarity_scores(x)['compound'])


Multilabel Binarizer converts labels into a binary matrix, each unique category will get a new feature matrix. 
text is converted to numbers through TF-IDF vectorization based on importance of the word.

In [24]:
# Convert intents into multi-label format
mlb = MultiLabelBinarizer()
category_features_train = mlb.fit_transform(data_train['category'])

In [26]:
tfidf = TfidfVectorizer(max_features=500)
text_features_train = tfidf.fit_transform(data_train['clean_text']).toarray()
# Stack text and category features horizontally
features_train = np.hstack((text_features_train, category_features_train)) #features contain both text based and category based numerical representations.
print(features_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


Isolation forest is an unsupervised anomaly detection algorithm, it separates the data by randomly partitioning the dataset. Assigns an anomaly score. 

In [43]:
# Initialize Isolation Forest
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
iso_forest.fit(features_train)

# Fit the model and predict anomalies
data_train['anomaly'] = iso_forest.predict(features_train)

# Mark anomalies (-1 means anomaly, 1 means normal)
data_train['anomaly_flag'] = data_train['anomaly'].apply(lambda x: 'Anomalous' if x == -1 else 'Normal')

# Show flagged anomalies
print("\nAnomalous Conversations in Training data:")
print(data_train[data_train['anomaly_flag'] == 'Anomalous'][['text', 'category', 'anomaly_flag']])



Anomalous Conversations in Training data:
                                                   text  \
58    I know I'm getting a new card but would like k...   
127   Am I able to track a card that has already bee...   
137   I was issued a new card a week ago but still h...   
143   I was supposed to receive my new card by now, ...   
154   I was able to find my card. How to I go about ...   
...                                                 ...   
9407  I want to get some cash from the ATM using my ...   
9419  I just took some cash out of a cash machine in...   
9464  It seems I've suddenly been charged for my rec...   
9525  Does it cost much or take long to get a new ca...   
9565  If my card is about to expire how long would i...   

                    category anomaly_flag  
58              card_arrival    Anomalous  
127             card_arrival    Anomalous  
137             card_arrival    Anomalous  
143             card_arrival    Anomalous  
154             card_linking

In [33]:
data_test['clean_text'] = data_test['text'].apply(preprocess_text)
data_test['sentiment_score'] = data_test['clean_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [35]:
category_features_test = mlb.transform(data_test['category'])
text_features_test = tfidf.transform(data_test['clean_text']).toarray()
features_test = np.hstack((text_features_test, category_features_test))

In [37]:
data_test['anomaly'] = iso_forest.predict(features_test)
data_test['anomaly_flag'] = data_test['anomaly'].apply(lambda x: 'Anomalous' if x == -1 else 'Normal')

In [39]:
print("\nAnomalous Conversations in Test Data:")
print(data_test[data_test['anomaly_flag'] == 'Anomalous'][['text', 'category', 'anomaly_flag']])

# Save the test results
data_test.to_csv('flagged_conversations_test.csv', index=False)
print("\nAnomaly detection on test data completed! Results saved to 'flagged_conversations_test.csv'")




Anomalous Conversations in Test Data:
                                                   text  \
192   Can you help me with a weird charge?  It's a p...   
198   I'm quite confused as to what is going on. The...   
203   Hey I tried to get some money out earlier but ...   
207   I got some cash of an ATM earlier but this sho...   
210   Hi! I was wondering if you can help me. I used...   
...                                                 ...   
2745  I need to cancel my card that got stolen a lit...   
2753  I didn't take out money from an ATM but my app...   
2754  Someone has stolen my card. Even though I have...   
2884  Why are fees charged on cash withdrawals? I we...   
2972  I received my American Express card, but I am ...   

                            category anomaly_flag  
192        extra_charge_on_statement    Anomalous  
198        extra_charge_on_statement    Anomalous  
203          pending_cash_withdrawal    Anomalous  
207          pending_cash_withdrawal    Anom

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Assuming you have the true labels and anomaly scores for the test data

# If you are using the Isolation Forest model, you can get anomaly scores like this:
# For anomaly detection, use `decision_function` or `score_samples` to get the anomaly score (lower score = more anomalous).
anomaly_scores = iso_forest.decision_function(features_test)  # Get anomaly scores for the test data

# For ROC, you need a score or probability, so use the decision function output.
# Higher score means more "normal", lower score means more "anomalous"
# The decision function output for Isolation Forest is typically negative for anomalies and positive for normal points.

# Now, let's compute the ROC curve and AUC.
fpr, tpr, thresholds = roc_curve(data_test['true_label'], anomaly_scores)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

print(f"AUC (Area Under the Curve): {roc_auc:.2f}")
