In [2]:
import pandas as pd
from collections import Counter

In [3]:
# Load the feedback dataset into `df`; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("text_classification_data.csv")

In [4]:
# Preview the first rows; the cells below rely on the `issue_category` and
# `clean_feedback_text` columns.
df.head()

Unnamed: 0,platform,region,feedback_text,clean_feedback_text,issue_category,star_rating,review_source,urgency,word_count
0,revolut,unknown,getting fed up of the random over reach by sca...,getting fed up of the random over reach by sca...,kyc,2,playstore,Medium,30
1,revolut,unknown,when I urgently needed to send money immediate...,when i urgently needed to send money immediate...,transaction_issues,1,playstore,High,46
2,revolut,unknown,I cannot set to be asked for my permission on ...,i cannot set to be asked for my permission on ...,account_access_&_security_issues,1,playstore,High,25
3,revolut,unknown,8 duplicate transactions from 3 merchants in 1...,8 duplicate transactions from 3 merchants in 1...,cards_&_payment_instruments,1,playstore,High,90
4,revolut,unknown,3 times now someone has tried to fraudulently ...,3 times now someone has tried to fraudulently ...,account_access_&_security_issues,1,playstore,High,53


In [5]:
# Prototype the word-frequency count on a single category.
category = "transaction_issues"

# Rows of that category that have text, with the text lower-cased and stripped
# of every character that is neither a word character nor whitespace.
subset = (
    df.loc[df["issue_category"].eq(category)]
    .dropna(subset=["clean_feedback_text"])
    .assign(
        clean_feedback_text=lambda frame: (
            frame["clean_feedback_text"]
            .str.lower()
            .str.replace(r"[^\w\s]", "", regex=True)
        )
    )
)

# One row per whitespace-separated token; keep tokens longer than two characters.
words = subset["clean_feedback_text"].str.split().explode()
words = words[words.str.len().gt(2)]

# Frequency table, most frequent word first.
unigram_counts = (
    words
    .value_counts()
    .reset_index()
    .set_axis(["word", "count"], axis=1)
)

unigram_counts.head(20)

Unnamed: 0,word,count
0,the,86
1,and,58
2,money,43
3,you,38
4,this,36
5,app,34
6,they,31
7,for,31
8,but,27
9,with,27


In [6]:
# Generic words that say nothing about a specific issue category.
stopwords = {
    "the", "and", "for", "wish", "this", "that", "you", "your",
    "have", "not", "are", "was", "but", "from", "they",
}

# Drop the stopword rows from the frequency table built in the previous cell.
# The original row labels are kept, so the index shows gaps where words were removed.
unigram_counts = unigram_counts.loc[~unigram_counts["word"].isin(stopwords)]

unigram_counts.head(20)

Unnamed: 0,word,count
2,money,43
5,app,34
9,with,27
10,transfer,27
12,account,25
13,bank,23
16,days,17
17,service,17
19,its,15
20,customer,15


In [7]:
# Names of the four most frequent issue categories, most frequent first.
top_categories = df["issue_category"].value_counts().index[:4].tolist()

top_categories

['product_&_feature_feedback',
 'transaction_issues',
 'account_access_&_security_issues',
 'kyc']

In [9]:
def get_unigrams_by_category(
    df: pd.DataFrame,
    category: str,
    stopwords,
    top_n: int = 20,
    min_word_length: int = 3,
) -> pd.DataFrame:
    """Return the most frequent words in one issue category's feedback text.

    Rows of ``df`` whose ``issue_category`` equals ``category`` are selected and
    rows with a missing ``clean_feedback_text`` are dropped. The text is then
    lower-cased, stripped of every character that is neither a word character
    nor whitespace, and split on whitespace into tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``issue_category`` and ``clean_feedback_text``.
    category : str
        Value compared for equality against ``issue_category``.
    stopwords : iterable of str
        Words to exclude from the result. Each entry is normalised exactly like
        the text (lower-cased, punctuation removed) before matching, so entries
        such as ``"The"`` or ``"don't"`` filter the tokens ``the`` and ``dont``.
    top_n : int, default 20
        Maximum number of rows to return.
    min_word_length : int, default 3
        Tokens shorter than this many characters are discarded before counting.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``count``, ordered by descending count, with at
        most ``top_n`` rows. The index is the word's position in the ranking
        *before* stopwords were removed, so it may contain gaps.
    """
    punctuation = r"[^\w\s]"

    # Select the text column directly; nothing here mutates ``df``, so no
    # defensive copy of the subset is needed.
    texts = df.loc[df["issue_category"] == category, "clean_feedback_text"].dropna()

    words = (
        texts
        .str.lower()
        .str.replace(punctuation, "", regex=True)
        .str.split()
        .explode()
        .dropna()  # texts that split into no tokens explode to a missing value
    )
    words = words[words.str.len() >= min_word_length]

    unigram_counts = words.value_counts().reset_index()
    # Assign names positionally: the labels produced by reset_index() on a
    # value_counts() result differ between pandas versions.
    unigram_counts.columns = ["word", "count"]

    # Tokens are already lower-cased and punctuation-free; give the stopwords
    # the same treatment so that every entry can actually match a token.
    normalized_stopwords = (
        pd.Series(list(stopwords), dtype="object")
        .str.lower()
        .str.replace(punctuation, "", regex=True)
    )
    unigram_counts = unigram_counts[
        ~unigram_counts["word"].isin(normalized_stopwords)
    ]

    return unigram_counts.head(top_n)


In [10]:
# Rebinds the global `stopwords`. Unlike the set defined in the earlier
# single-category cell, this one does not contain "wish".
stopwords = {
    "the", "and", "for", "this", "that", "you", "your",
    "have", "not", "are", "was", "but", "from", "they"
}

# Show one frequency table per top category. The loop variable rebinds the
# global `category` that the single-category cell set to "transaction_issues".
for category in top_categories:
    print(f"\nTop words for: {category}")
    # `display` is not imported in the visible cells; presumably the IPython
    # built-in that notebook sessions provide.
    display(get_unigrams_by_category(df, category, stopwords))



Top words for: product_&_feature_feedback


Unnamed: 0,word,count
3,app,63
11,account,37
12,with,34
13,its,34
14,money,30
16,when,27
17,just,26
21,now,24
22,monzo,24
23,more,23



Top words for: transaction_issues


Unnamed: 0,word,count
2,money,43
5,app,34
9,with,27
10,transfer,27
12,account,25
13,bank,23
16,days,17
17,service,17
19,its,15
20,customer,15



Top words for: account_access_&_security_issues


Unnamed: 0,word,count
2,account,54
9,app,26
13,money,22
15,when,17
16,now,16
17,after,16
18,bank,16
19,card,16
21,has,15
22,with,15



Top words for: kyc


Unnamed: 0,word,count
3,account,18
6,just,13
7,app,11
10,after,9
11,all,9
14,with,9
15,even,8
18,identity,7
19,verification,7
20,dont,7


In [11]:
# Map each top category to its word-frequency table.
category_unigrams = {}

for category in top_categories:
    category_unigrams[category] = get_unigrams_by_category(
        df=df, category=category, stopwords=stopwords
    )


In [14]:
# Stack the per-category frequency tables into one long frame.
all_unigrams = []

for category in top_categories:
    # Label every row with its category so it stays attributable after stacking.
    temp = get_unigrams_by_category(df, category, stopwords).assign(
        issue_category=category
    )
    all_unigrams.append(temp)

# ignore_index=True replaces the per-table row labels with one fresh 0..n-1 range.
unigram_df = pd.concat(all_unigrams, ignore_index=True)


unigram_df

Unnamed: 0,word,count,issue_category
0,app,63,product_&_feature_feedback
1,account,37,product_&_feature_feedback
2,with,34,product_&_feature_feedback
3,its,34,product_&_feature_feedback
4,money,30,product_&_feature_feedback
...,...,...,...
75,number,6,kyc
76,again,6,kyc
77,had,6,kyc
78,information,6,kyc
