In [1]:
# List the contents of the notebook's current working directory.
ls

 Volume in drive C is OS
 Volume Serial Number is 1A61-1288

 Directory of C:\Users\birad\OneDrive\Desktop\cdev3000_work

29/07/2025  02:43 PM    <DIR>          .
27/07/2025  01:55 PM    <DIR>          ..
29/07/2025  02:40 PM    <DIR>          .ipynb_checkpoints
16/04/2025  10:30 AM               302 ass2.txt
22/07/2025  07:07 PM    <DIR>          Ezy-AI
29/07/2025  02:39 PM         2,556,706 scraped_trust_pilot.json
29/07/2025  02:43 PM               617 Untitled.ipynb
               3 File(s)      2,557,625 bytes
               4 Dir(s)  272,555,335,680 bytes free


In [9]:
import json
import re
from collections import Counter

import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# 1. Load data
# The loop below treats the JSON as an iterable of entries, each with a
# 'company' key and an optional 'reviews' list of {'text', 'rating'} dicts.
with open('scraped_trust_pilot.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. Flatten into a DataFrame
# One row per review that has both non-empty text and a rating.
records = []
for entry in data:
    company = entry['company']
    # `or []` / `or ''` also cover keys that are present but null in the JSON;
    # dict.get's default only applies when the key is missing entirely.
    for rev in entry.get('reviews') or []:
        text = (rev.get('text') or '').strip()
        rating = rev.get('rating')
        if text and rating is not None:
            # Ratings of 4 and above are labelled positive, everything else negative.
            sentiment = 'positive' if rating >= 4 else 'negative'
            records.append({
                'company': company,
                'text': text,
                'rating': rating,
                'sentiment': sentiment
            })

# Naming the columns keeps the frame's schema intact even when no review
# qualified, so the groupby calls further down do not fail on missing columns.
df = pd.DataFrame(records, columns=['company', 'text', 'rating', 'sentiment'])

# 3. Segmentation: how many comments each app received
by_company = df.groupby('company')
seg_counts = by_company.size().rename('num_comments')
print("\n=== Comments per App ===")
print(seg_counts.to_frame())

# 4. Sentiment split: positive vs negative comment counts for each app
# (apps missing one of the two labels get a 0 in that column)
sentiment_sizes = df.groupby(['company', 'sentiment']).size()
class_counts = sentiment_sizes.unstack(fill_value=0)
print("\n=== Positive vs Negative per App ===")
print(class_counts)

# 5. Ten most frequent non-stop-word tokens for every (app, sentiment) pair
top_words = []
for (app, label), reviews in df.groupby(['company', 'sentiment']):
    # Pool the group's reviews into one lower-cased string and tokenise it.
    corpus = " ".join(reviews['text'].str.lower())
    counts = Counter(
        tok
        for tok in re.findall(r'\b\w+\b', corpus)
        # drop single-character tokens and English stop words
        if len(tok) > 1 and tok not in ENGLISH_STOP_WORDS
    )
    top_words.extend(
        {'company': app, 'sentiment': label, 'word': term, 'count': n}
        for term, n in counts.most_common(10)
    )

# Order rows by app, then sentiment, then descending frequency.
top_df = pd.DataFrame(top_words).sort_values(
    by=['company', 'sentiment', 'count'],
    ascending=[True, True, False],
)

# 6. Lift pandas' display limits so printed frames are not truncated
display_settings = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

print("\n=== Top 10 Words by App & Sentiment ===")
print(top_df.to_string(index=False))



=== Comments per App ===
                      num_comments
company                           
ezyremit.com                    61
moneygram.com                  898
ofx.com                        958
orbitremit.com                 806
remitly.com                    847
riamoneytransfer.com           749
westernunion.com               847
wise.com                       884
worldremit.com                 896
xe.com                         351

=== Positive vs Negative per App ===
sentiment             negative  positive
company                                 
ezyremit.com                 8        53
moneygram.com              196       702
ofx.com                    160       798
orbitremit.com              30       776
remitly.com                103       744
riamoneytransfer.com       135       614
westernunion.com           212       635
wise.com                   144       740
worldremit.com             362       534
xe.com                      47       304

=== Top 10 Words by App

In [15]:
import json
import re
from collections import Counter

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# 1. Load & flatten
# The loop below treats the JSON as an iterable of entries, each with a
# 'company' key and an optional 'reviews' list of {'text', 'rating'} dicts.
with open('scraped_trust_pilot.json','r',encoding='utf-8') as f:
    data = json.load(f)

# One row per review that has both non-empty text and a rating.
records = []
for entry in data:
    comp = entry['company']
    # `or []` / `or ''` also cover keys that are present but null in the JSON;
    # dict.get's default only applies when the key is missing entirely.
    for rev in entry.get('reviews') or []:
        txt = (rev.get('text') or '').strip()
        rating = rev.get('rating')
        if txt and rating is not None:
            # Ratings of 4 and above are labelled positive, everything else negative.
            sentiment = 'positive' if rating >= 4 else 'negative'
            records.append({'company': comp, 'text': txt, 'sentiment': sentiment})

# Naming the columns keeps the frame's schema intact even when no review
# qualified, so later column lookups do not fail on an empty result.
df = pd.DataFrame(records, columns=['company', 'text', 'sentiment'])

# 2. Helper: get top n‑grams for a list of texts
def top_ngrams(texts, ngram_range=(2,2), top_n=10):
    """Return the most frequent n-grams across `texts`.

    Parameters:
        texts: iterable of document strings to count n-grams in.
        ngram_range: (min_n, max_n) passed straight to CountVectorizer;
            the default (2, 2) counts bigrams only.
        top_n: maximum number of entries to return.

    Returns:
        A list of (ngram, count) pairs, highest count first. Entries with
        equal counts keep the order of `get_feature_names_out()`. The list
        is empty when nothing could be counted (no documents, or no n-gram
        left after stop-word removal).
    """
    vect = CountVectorizer(
        ngram_range=ngram_range,
        stop_words='english',
        token_pattern=r'\b\w+\b'
    )
    try:
        X = vect.fit_transform(texts)
    except ValueError as exc:
        # scikit-learn signals "nothing to count" with an 'empty vocabulary'
        # ValueError; treat only that case as an empty result so one sparse
        # group does not abort a loop over many groups. Anything else is a
        # genuine error and is re-raised unchanged.
        if 'empty vocabulary' not in str(exc):
            raise
        return []
    # Column sums of the document-term matrix = corpus-wide count per n-gram.
    sums = X.sum(axis=0).A1
    vocab = vect.get_feature_names_out()
    freq = sorted(zip(vocab, sums), key=lambda x: x[1], reverse=True)
    return freq[:top_n]

# 3. Print top bigrams by (company, sentiment)
# One constant drives both the request and the header: previously the header
# said "top 10" while 20 bigrams were requested and printed.
TOP_N_BIGRAMS = 20
for (company, sentiment), group in df.groupby(['company','sentiment']):
    texts = group['text'].str.lower().tolist()
    bigrams = top_ngrams(texts, ngram_range=(2,2), top_n=TOP_N_BIGRAMS)
    print(f"\n--- {company} ({sentiment}) top {TOP_N_BIGRAMS} BIGRAMS ---")
    for phrase, cnt in bigrams:
        # left-align the phrase in a 30-character column so counts line up
        print(f"{phrase:30s} {cnt}")

# 4. (Optional) Show concordance for a target word
def show_concordance(texts, word, width=40, max_examples=5):
    """Print short context snippets for whole-word occurrences of `word`.

    Parameters:
        texts: iterable of strings; they are joined with spaces and
            lower-cased before searching.
        word: the term to look up. Matching is case-insensitive and the
            term is treated as literal text, not as a regular expression.
        width: number of characters of context kept on each side of the match.
        max_examples: maximum number of sentences to show.

    Output:
        Prints a header line, then one snippet per matching sentence, each
        wrapped in "…" markers. Only the first match in a sentence is shown.
    """
    joined = " ".join(texts).lower()
    # The corpus is lower-cased above, so the search term must be too;
    # otherwise a capitalised query could never match.
    needle = word.lower()
    # re.escape keeps any regex metacharacters in the term literal.
    pattern = re.compile(rf'\b{re.escape(needle)}\b')
    # split into sentences
    sents = re.split(r'(?<=[.!?])\s+', joined)
    hits = [m for m in map(pattern.search, sents) if m]
    print(f"\nConcordance for “{word}” (first {max_examples}):")
    for m in hits[:max_examples]:
        s = m.string
        # Centre the window on the whole-word match itself. A plain substring
        # search could land on an earlier, longer word that merely contains
        # the term (e.g. "moneygram" when looking for "money").
        start = max(0, m.start() - width)
        end = min(len(s), m.end() + width)
        print("…"+ s[start:end].strip() + "…")

# Example: look at how “money” is used in ezyremit.com's negative reviews.
is_ezyremit = df['company'] == 'ezyremit.com'
is_negative = df['sentiment'] == 'negative'
neg_texts = df.loc[is_ezyremit & is_negative, 'text']
show_concordance(neg_texts, 'money')



--- ezyremit.com (negative) top 10 BIGRAMS ---
12 2024                        3
24 hours                       3
bank transfer                  3
received money                 3
western union                  3
12 24                          2
2 weeks                        2
23 12                          2
châu hoàng                     2
extremely slow                 2
holding money                  2
hoàng western                  2
times told                     2
11 12                          1
2 secs                         1
2024 bank                      1
2024 ezyremit                  1
2024 money                     1
24 h                           1
3 days                         1

--- ezyremit.com (positive) top 10 BIGRAMS ---
customer service               11
highly recommend               8
money transfer                 8
exchange rate                  7
good rate                      5
transfer money                 5
fast reliable                  4
good custome

In [17]:
import json
import re
from collections import Counter

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# 1. Load & flatten
# The loop below treats the JSON as an iterable of entries, each with a
# 'company' key and an optional 'reviews' list of {'text', 'rating'} dicts.
with open('scraped_trust_pilot.json','r',encoding='utf-8') as f:
    data = json.load(f)

# One row per review that has both non-empty text and a rating.
records = []
for entry in data:
    comp = entry['company']
    # `or []` / `or ''` also cover keys that are present but null in the JSON;
    # dict.get's default only applies when the key is missing entirely.
    for rev in entry.get('reviews') or []:
        txt = (rev.get('text') or '').strip()
        rating = rev.get('rating')
        if txt and rating is not None:
            # Ratings of 4 and above are labelled positive, everything else negative.
            sentiment = 'positive' if rating >= 4 else 'negative'
            records.append({'company': comp, 'text': txt, 'sentiment': sentiment})

# Naming the columns keeps the frame's schema intact even when no review
# qualified, so the sentiment filter below does not fail on an empty result.
df = pd.DataFrame(records, columns=['company', 'text', 'sentiment'])

# 2. Dump every negative comment, grouped under its company
neg_df = df.loc[df['sentiment'].eq('negative')]
for company, reviews in neg_df.groupby('company')['text']:
    print(f"\n=== {company} — Negative Comments ({len(reviews)}) ===")
    # number the comments from 1 within each company
    numbered = (f"{rank}. {body}" for rank, body in enumerate(reviews, start=1))
    print("\n".join(numbered))

# (Optionally continue with your n‑gram / concordance analysis…)



=== ezyremit.com — Negative Comments (8) ===
1. It has been more than 8 days, and I still haven't received my money, despite emailing and messaging for support. All EzyRemit does is tell me to wait. I will never use this company's services again. Don't trust their advertisements; be careful—they might be a scam. They are waiting for the exchange rate to rise before transferring your money.
2. Worst service. Holding my money for 7 days and the recipient still has not received the money. I will never use their service again. My friends and my family have the same problem.
3. I am disguted to see 4 star ratings here.It makes me wonder Trust Pilot is trust worthy?Does it help people with up to date information?Ezyremit takes your money and keeps it forever.My current situation as at 23.12.2024, I bank transfer  money from Australia to Viet Nam on 11.12.2024. Ezyremit confirms payment few hours later.Until today, 23.12.2024, the money is still not delivered.Ezyremit website promises 12-24 