In [3]:
import csv
import re
import os
import pandas as pd
import numpy as np  

# Đọc file data_cleaned.csv

In [4]:
# Load the cleaned dataset from CSV and preview its first two rows.
df = pd.read_csv(
    filepath_or_buffer="../data_csv/data_cleaned.csv",
    encoding="utf-8",
)
df.head(n=2)

Unnamed: 0,id,name,address,lat,lon,poi_type,avg_stars,total_reviews,crowd,offerings,atmosphere,highlights,dining_options,children,accessibility,popular_for,opening_hours
0,0f9d2009-9436-46a4-b354-b0261898a39e,The Pub Coffee - Beer & Cocktail,"18A17 Tăng Nhơn Phú, Phước Long B, Quận 9, Thà...",10.829481,106.773785,"Cafe,Bar",4.9,181,Groups,"Alcohol, Beer, Cocktails, Coffee, Hard liquor","Casual, Cozy","Great beer selection, Great coffee, Live music...",Table service,,,,"[{'day': 'Monday', 'hours': [{'start': '00:00'..."
1,02887955-963a-43ac-b0f7-355d7d7cfacf,Julieta,"C. Sta. Lucía, 9, Distrito Centro, 29008 Málag...",36.722011,-4.42178,Cafe,4.3,2053,"College students, Groups, Tourists","Alcohol, Beer, Coffee, Healthy options, Organi...","Casual, Cozy, Trendy","Great coffee, Great dessert, Great tea selection","Breakfast, Brunch, Lunch, Dessert, Seating, Ta...",Good for kids,"Wheelchair accessible entrance, Wheelchair acc...","Breakfast, Good for working on laptop","[{'day': 'Monday', 'hours': [{'start': '08:00'..."


In [5]:
# Give every row the same default stay_time value
# (unit is not stated in this notebook — presumably minutes; TODO confirm).
df = df.assign(stay_time=30)

# Bước 1: Tìm giá trị min-max của avg_stars

In [6]:
# Observed bounds of the raw star ratings; the min-max scaling step reads these two names.
avg_stars_col = df["avg_stars"]
min_avg, max_avg = avg_stars_col.min(), avg_stars_col.max()
print(f"avg_stars: min={min_avg}, max={max_avg}")

avg_stars: min=3.0, max=5.0


# Bước 2: Tìm giá trị min-max của total_reviews

In [7]:
# Observed bounds of the raw review counts; the log-normalization step reads max_total_reviews.
total_reviews_col = df["total_reviews"]
min_total_reviews, max_total_reviews = total_reviews_col.min(), total_reviews_col.max()
print(f"total_reviews: min={min_total_reviews}, max={max_total_reviews}")

total_reviews: min=1, max=76447


# Bước 3: Chuẩn hóa avg_stars bằng Min-Max Scaling

Công thức: `(x - min) / (max - min)`

Kết quả: giá trị trong khoảng [0, 1]

In [8]:
# Min-max scale avg_stars: (x - min) / (max - min), kept to three decimals.
star_range = max_avg - min_avg
df["avg_stars_norm"] = df["avg_stars"].sub(min_avg).div(star_range).round(3)

# Show the raw and scaled values side by side for the first ten rows.
print("Mẫu avg_stars_norm:")
preview_cols = ["avg_stars", "avg_stars_norm"]
print(df[preview_cols].head(10))

Mẫu avg_stars_norm:
   avg_stars  avg_stars_norm
0        4.9            0.95
1        4.3            0.65
2        4.5            0.75
3        4.7            0.85
4        4.8            0.90
5        4.9            0.95
6        4.7            0.85
7        5.0            1.00
8        4.3            0.65
9        4.0            0.50


# Bước 4: Chuẩn hóa total_reviews bằng Log Transform

Công thức: `log(x + 1) / log(max + 1)`

Lý do dùng log: total_reviews có phạm vi rất rộng (1 đến hàng chục nghìn), log transform giúp giảm độ chênh lệch và phân phối đều hơn.

In [9]:
# Log-scale the review counts: log(x + 1) / log(max + 1), kept to three decimals.
log_reviews = np.log(df["total_reviews"].add(1))
log_ceiling = np.log(max_total_reviews + 1)
df["total_reviews_norm"] = log_reviews.div(log_ceiling).round(3)

# Show the raw and scaled values side by side for the first ten rows.
print("Mẫu total_reviews_norm:")
preview_cols = ["total_reviews", "total_reviews_norm"]
print(df[preview_cols].head(10))

Mẫu total_reviews_norm:
   total_reviews  total_reviews_norm
0            181               0.463
1           2053               0.678
2            285               0.503
3            329               0.516
4            182               0.463
5            165               0.455
6            216               0.478
7            275               0.500
8           1322               0.639
9           1321               0.639


# Bước 5: Tính điểm tổng hợp normalize_stars_reviews

Công thức: `0.6 * avg_stars_norm + 0.4 * total_reviews_norm`

Kết quả: Trung bình có trọng số (60% điểm sao, 40% số lượt đánh giá) của 2 chỉ số đã chuẩn hóa, cho điểm đánh giá tổng hợp trong khoảng [0, 1]

In [10]:
# Combined score: weighted sum of the two normalized columns
# (0.6 for stars, 0.4 for reviews), kept to three decimals.
star_part = df["avg_stars_norm"].mul(0.6)
review_part = df["total_reviews_norm"].mul(0.4)
df["normalize_stars_reviews"] = star_part.add(review_part).round(3)

# Preview the inputs, the intermediate columns and the final score together.
print("Mẫu kết quả cuối cùng:")
result_cols = [
    "name",
    "avg_stars",
    "total_reviews",
    "avg_stars_norm",
    "total_reviews_norm",
    "normalize_stars_reviews",
]
print(df[result_cols].head(10))

Mẫu kết quả cuối cùng:
                                                name  avg_stars  \
0                   The Pub Coffee - Beer & Cocktail        4.9   
1                                            Julieta        4.3   
2                 BAIETA Restaurant Saigon Thao Dien        4.5   
3                          Parroquia San Felipe Neri        4.7   
4                                             BÀ BAR        4.8   
5                                   Art Bar Viet Nam        4.9   
6  Ngỡ Cafe | Cà phê view đẹp Thủ Đức | Coffee ng...        4.7   
7                             TEEMAY COFFEE ROASTERS        5.0   
8                       La Mafia se sienta a la mesa        4.3   
9                                         Bar tocata        4.0   

   total_reviews  avg_stars_norm  total_reviews_norm  normalize_stars_reviews  
0            181            0.95               0.463                    0.755  
1           2053            0.65               0.678                    0.661  

# Bước 6: Kiểm tra kết quả

In [11]:
# Spot-check the weighted formula on the first rows, then print the value range
# of each normalized column.
print("Kiểm tra công thức với 3 hàng đầu tiên:\n")
for i in range(min(3, len(df))):  # stay in bounds if the frame has fewer than three rows
    row = df.iloc[i]  # fetch the row once instead of once per field
    avg_norm = row["avg_stars_norm"]
    rev_norm = row["total_reviews_norm"]
    result = row["normalize_stars_reviews"]
    calculated = round((avg_norm * 0.6 + rev_norm * 0.4), 3)
    # Compare floats with a tolerance far below the 3-decimal resolution
    # instead of relying on exact equality.
    matches = np.isclose(result, calculated, rtol=0.0, atol=1e-9)

    print(f"Hàng {i+1}: {row['name'][:50]}")
    print(f"  avg_stars_norm = {avg_norm}")
    print(f"  total_reviews_norm = {rev_norm}")
    print(f"  normalize_stars_reviews = {result}")
    print(f"  Tính lại: ({avg_norm} * 0.6 + {rev_norm} * 0.4)  = {calculated}")
    print("  ✅ Đúng!" if matches else "  ❌ SAI!")
    print()

print("\nThống kê:")
print(f"  avg_stars_norm: min={df['avg_stars_norm'].min()}, max={df['avg_stars_norm'].max()}")
print(f"  total_reviews_norm: min={df['total_reviews_norm'].min()}, max={df['total_reviews_norm'].max()}")
print(f"  normalize_stars_reviews: min={df['normalize_stars_reviews'].min()}, max={df['normalize_stars_reviews'].max()}")

Kiểm tra công thức với 3 hàng đầu tiên:

Hàng 1: The Pub Coffee - Beer & Cocktail
  avg_stars_norm = 0.95
  total_reviews_norm = 0.463
  normalize_stars_reviews = 0.755
  Tính lại: (0.95 * 0.6 + 0.463 * 0.4)  = 0.755
  ✅ Đúng!

Hàng 2: Julieta
  avg_stars_norm = 0.65
  total_reviews_norm = 0.678
  normalize_stars_reviews = 0.661
  Tính lại: (0.65 * 0.6 + 0.678 * 0.4)  = 0.661
  ✅ Đúng!

Hàng 3: BAIETA Restaurant Saigon Thao Dien
  avg_stars_norm = 0.75
  total_reviews_norm = 0.503
  normalize_stars_reviews = 0.651
  Tính lại: (0.75 * 0.6 + 0.503 * 0.4)  = 0.651
  ✅ Đúng!


Thống kê:
  avg_stars_norm: min=0.0, max=1.0
  total_reviews_norm: min=0.062, max=1.0
  normalize_stars_reviews: min=0.025, max=0.939


# Phân phối của avg_stars, total_reviews, normalize_stars_reviews

In [12]:
# Text report on how the raw avg_stars ratings are distributed.
banner = "=" * 80
print(banner, "PHÂN PHỐI AVG_STARS", banner, sep="\n")
print("\nThống kê mô tả:")
print(df["avg_stars"].describe())

n_rows = len(df)

# One line per distinct rating; each bar character stands for one whole percent.
print("\nPhân phối theo giá trị:")
value_counts = df["avg_stars"].value_counts().sort_index()
for value, count in value_counts.items():
    percentage = count / n_rows * 100
    print(f"{value:.1f} sao: {count:4d} ({percentage:5.2f}%) {'█' * int(percentage)}")

# Grouped view; the first bin is the closed interval [0, 3.0] (include_lowest=True),
# and each bar character here stands for two percent.
print("\nPhân nhóm:")
bins = [0, 3.0, 3.5, 4.0, 4.5, 5.0]
labels = ["= 3.0", "3.0-3.5", "3.5-4.0", "4.0-4.5", "4.5-5.0"]
df_temp = df.assign(
    avg_stars_group=pd.cut(df["avg_stars"], bins=bins, labels=labels, include_lowest=True)
)
group_counts = df_temp["avg_stars_group"].value_counts().sort_index()
for group, count in group_counts.items():
    percentage = count / n_rows * 100
    bar = "█" * int(percentage / 2)
    print(f"{group:10s}: {count:4d} ({percentage:5.2f}%) {bar}")

PHÂN PHỐI AVG_STARS

Thống kê mô tả:
count    1454.000000
mean        4.537414
std         0.389367
min         3.000000
25%         4.300000
50%         4.600000
75%         4.800000
max         5.000000
Name: avg_stars, dtype: float64

Phân phối theo giá trị:
3.0 sao:   11 ( 0.76%) 
3.1 sao:    1 ( 0.07%) 
3.2 sao:    2 ( 0.14%) 
3.3 sao:    2 ( 0.14%) 
3.4 sao:    3 ( 0.21%) 
3.5 sao:    6 ( 0.41%) 
3.6 sao:   10 ( 0.69%) 
3.7 sao:   19 ( 1.31%) █
3.8 sao:   25 ( 1.72%) █
3.9 sao:   22 ( 1.51%) █
4.0 sao:   64 ( 4.40%) ████
4.1 sao:   62 ( 4.26%) ████
4.2 sao:   90 ( 6.19%) ██████
4.3 sao:  104 ( 7.15%) ███████
4.4 sao:  104 ( 7.15%) ███████
4.5 sao:  138 ( 9.49%) █████████
4.6 sao:  128 ( 8.80%) ████████
4.7 sao:  147 (10.11%) ██████████
4.8 sao:  162 (11.14%) ███████████
4.9 sao:  113 ( 7.77%) ███████
5.0 sao:  241 (16.57%) ████████████████

Phân nhóm:
= 3.0     :   11 ( 0.76%) 
3.0-3.5   :   14 ( 0.96%) 
3.5-4.0   :  140 ( 9.63%) ████
4.0-4.5   :  498 (34.25%) █████████████████
4

In [13]:
# Text report on how the raw total_reviews counts are distributed.
print("=" * 80)
print("PHÂN PHỐI TOTAL_REVIEWS")
print("=" * 80)
print("\nThống kê mô tả:")
print(df["total_reviews"].describe())

print("\nPhân nhóm (logarithmic scale):")
# The last edge is open-ended: with a finite cap (previously 100000) any larger
# count would fall outside every bin, become NaN in pd.cut and silently drop
# out of the "5K+" bucket.
bins = [0, 10, 50, 100, 500, 1000, 5000, np.inf]
labels = ["1-10", "11-50", "51-100", "101-500", "501-1K", "1K-5K", "5K+"]
df_temp = df.copy()
df_temp["total_reviews_group"] = pd.cut(df_temp["total_reviews"], bins=bins, labels=labels, include_lowest=True)
group_counts = df_temp["total_reviews_group"].value_counts().sort_index()
for group, count in group_counts.items():
    percentage = (count / len(df)) * 100
    bar = "█" * int(percentage / 2)  # one bar character per two percent
    print(f"{group:10s}: {count:4d} ({percentage:5.2f}%) {bar}")

print("\nTop 10 địa điểm có nhiều reviews nhất:")
top_reviews = df.nlargest(10, "total_reviews")[["name", "total_reviews", "avg_stars"]]
# itertuples keeps each column's own scalar type and avoids an unused index variable.
for row in top_reviews.itertuples(index=False):
    print(f"  {row.total_reviews:6d} reviews - {row.avg_stars:.1f}★ - {row.name[:60]}")

PHÂN PHỐI TOTAL_REVIEWS

Thống kê mô tả:
count     1454.000000
mean      1133.281293
std       4364.118310
min          1.000000
25%         18.000000
50%        127.500000
75%        616.500000
max      76447.000000
Name: total_reviews, dtype: float64

Phân nhóm (logarithmic scale):
1-10      :  294 (20.22%) ██████████
11-50     :  238 (16.37%) ████████
51-100    :  140 ( 9.63%) ████
101-500   :  367 (25.24%) ████████████
501-1K    :  156 (10.73%) █████
1K-5K     :  198 (13.62%) ██████
5K+       :   61 ( 4.20%) ██

Top 10 địa điểm có nhiều reviews nhất:
   76447 reviews - 4.0★ - Ben Thanh Market
   44650 reviews - 4.6★ - Mercado de Atarazanas
   43705 reviews - 4.5★ - Independence Palace
   42770 reviews - 4.5★ - Plaza Mayor
   39877 reviews - 4.5★ - War Remnants Museum
   38490 reviews - 4.5★ - Muelle Uno
   37426 reviews - 4.6★ - Alcazaba
   36338 reviews - 4.7★ - Picasso Bar Tapas
   33012 reviews - 4.3★ - Centro Comercial Larios Centro
   32909 reviews - 4.6★ - Santa Iglesia Cated

In [14]:
# Text report on how the combined normalize_stars_reviews score is distributed.
banner = "=" * 80
print(banner, "PHÂN PHỐI NORMALIZE_STARS_REVIEWS", banner, sep="\n")
print("\nThống kê mô tả:")
print(df["normalize_stars_reviews"].describe())

# Five equal-width buckets over [0, 1]; each bar character stands for two percent.
print("\nPhân nhóm:")
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels = ["0.0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1.0"]
df_temp = df.assign(
    normalize_group=pd.cut(df["normalize_stars_reviews"], bins=bins, labels=labels, include_lowest=True)
)
group_counts = df_temp["normalize_group"].value_counts().sort_index()
n_rows = len(df)
for group, count in group_counts.items():
    percentage = count / n_rows * 100
    bar = "█" * int(percentage / 2)
    print(f"{group:10s}: {count:4d} ({percentage:5.2f}%) {bar}")

# The ten best and ten worst scores share one listing format.
ranking_cols = ["name", "avg_stars", "total_reviews", "normalize_stars_reviews"]
top_normalized = df.nlargest(10, "normalize_stars_reviews")[ranking_cols]
bottom_normalized = df.nsmallest(10, "normalize_stars_reviews")[ranking_cols]
listings = (
    ("\nTop 10 địa điểm có điểm chuẩn hóa cao nhất:", top_normalized),
    ("\nBottom 10 địa điểm có điểm chuẩn hóa thấp nhất:", bottom_normalized),
)
for heading, ranked in listings:
    print(heading)
    for idx, row in ranked.iterrows():
        print(f"  {row['normalize_stars_reviews']:.3f} - {row['avg_stars']:.1f}★ ({row['total_reviews']:5d} reviews) - {row['name'][:50]}")

PHÂN PHỐI NORMALIZE_STARS_REVIEWS

Thống kê mô tả:
count    1454.000000
mean        0.629309
std         0.129097
min         0.025000
25%         0.576000
50%         0.649000
75%         0.711750
max         0.939000
Name: normalize_stars_reviews, dtype: float64

Phân nhóm:
0.0-0.2   :   14 ( 0.96%) 
0.2-0.4   :   69 ( 4.75%) ██
0.4-0.6   :  352 (24.21%) ████████████
0.6-0.8   :  957 (65.82%) ████████████████████████████████
0.8-1.0   :   62 ( 4.26%) ██

Top 10 địa điểm có điểm chuẩn hóa cao nhất:
  0.939 - 5.0★ (13744 reviews) - Pizza 4P's Saigon Pearl
  0.917 - 5.0★ ( 7354 reviews) - Viet Canvas - Tranh Trang Trí, Tranh Treo Tường
  0.890 - 4.8★ (18501 reviews) - Pizza 4P’s Lê Thánh Tôn
  0.884 - 4.7★ (36338 reviews) - Picasso Bar Tapas
  0.883 - 4.9★ ( 6693 reviews) - Pizza 4P's The Emporium
  0.871 - 5.0★ ( 2011 reviews) - Skywalk Climbing
  0.862 - 4.9★ ( 3723 reviews) - Chênh Vênh Rooftop
  0.861 - 4.6★ (44650 reviews) - Mercado de Atarazanas
  0.854 - 4.6★ (37426 reviews) - Al

In [15]:
# Side-by-side summary of the two raw metrics and the combined score.
banner = "=" * 80
print(banner, "SO SÁNH TỔNG QUAN", banner, sep="\n")

# Per column: (format spec for min/max and quantiles, format spec for mean/median).
column_specs = {
    "avg_stars": (".2f", ".2f"),
    "total_reviews": (".0f", ".2f"),
    "normalize_stars_reviews": (".3f", ".3f"),
}
rule = "-" * 80

print(f"\n{'Chỉ số':<30} {'Min':<12} {'Max':<12} {'Mean':<12} {'Median':<12}")
print(rule)
for column, (bound_spec, center_spec) in column_specs.items():
    series = df[column]
    cells = [
        format(series.min(), f"<12{bound_spec}"),
        format(series.max(), f"<12{bound_spec}"),
        format(series.mean(), f"<12{center_spec}"),
        format(series.median(), f"<12{center_spec}"),
    ]
    print(f"{column:<30} " + " ".join(cells))

print(f"\n{'Phân vị':<30} {'25%':<12} {'50%':<12} {'75%':<12}")
print(rule)
for column, (bound_spec, _center_spec) in column_specs.items():
    series = df[column]
    cells = [format(series.quantile(q), f"<12{bound_spec}") for q in (0.25, 0.5, 0.75)]
    print(f"{column:<30} " + " ".join(cells))

SO SÁNH TỔNG QUAN

Chỉ số                         Min          Max          Mean         Median      
--------------------------------------------------------------------------------
avg_stars                      3.00         5.00         4.54         4.60        
total_reviews                  1            76447        1133.28      127.50      
normalize_stars_reviews        0.025        0.939        0.629        0.649       

Phân vị                        25%          50%          75%         
--------------------------------------------------------------------------------
avg_stars                      4.30         4.60         4.80        
total_reviews                  18           128          616         
normalize_stars_reviews        0.576        0.649        0.712       


# Bước 7: Xóa các cột trung gian (giữ lại dữ liệu gốc và kết quả cuối)

In [16]:
# Drop the two intermediate normalized columns, keeping the raw inputs and the final score.
# errors="ignore" makes the cell safe to re-run: once the columns are gone,
# a second run is a no-op instead of raising KeyError.
df = df.drop(columns=["avg_stars_norm", "total_reviews_norm"], errors="ignore")
print("Cột còn lại trong DataFrame:")
print(df.columns.tolist())
print("\nMẫu dữ liệu cuối cùng:")
df.head()

Cột còn lại trong DataFrame:
['id', 'name', 'address', 'lat', 'lon', 'poi_type', 'avg_stars', 'total_reviews', 'crowd', 'offerings', 'atmosphere', 'highlights', 'dining_options', 'children', 'accessibility', 'popular_for', 'opening_hours', 'stay_time', 'normalize_stars_reviews']

Mẫu dữ liệu cuối cùng:


Unnamed: 0,id,name,address,lat,lon,poi_type,avg_stars,total_reviews,crowd,offerings,atmosphere,highlights,dining_options,children,accessibility,popular_for,opening_hours,stay_time,normalize_stars_reviews
0,0f9d2009-9436-46a4-b354-b0261898a39e,The Pub Coffee - Beer & Cocktail,"18A17 Tăng Nhơn Phú, Phước Long B, Quận 9, Thà...",10.829481,106.773785,"Cafe,Bar",4.9,181,Groups,"Alcohol, Beer, Cocktails, Coffee, Hard liquor","Casual, Cozy","Great beer selection, Great coffee, Live music...",Table service,,,,"[{'day': 'Monday', 'hours': [{'start': '00:00'...",30,0.755
1,02887955-963a-43ac-b0f7-355d7d7cfacf,Julieta,"C. Sta. Lucía, 9, Distrito Centro, 29008 Málag...",36.722011,-4.42178,Cafe,4.3,2053,"College students, Groups, Tourists","Alcohol, Beer, Coffee, Healthy options, Organi...","Casual, Cozy, Trendy","Great coffee, Great dessert, Great tea selection","Breakfast, Brunch, Lunch, Dessert, Seating, Ta...",Good for kids,"Wheelchair accessible entrance, Wheelchair acc...","Breakfast, Good for working on laptop","[{'day': 'Monday', 'hours': [{'start': '08:00'...",30,0.661
2,622c7643-30e8-4402-9b6c-b8407ff063e2,BAIETA Restaurant Saigon Thao Dien,"Nguyễn Văn Hưởng, Thảo Điền, 16/8, Thành phố H...",10.802978,106.727268,"Restaurant,Bar,Cocktail bar,Coffee shop,Family...",4.5,285,"Family-friendly, Groups, LGBTQ+ friendly, Tour...","Alcohol, Beer, Cocktails, Coffee, Happy hour d...","Casual, Cozy, Romantic","Great beer selection, Great cocktails, Great c...","Brunch, Lunch, Dinner, Catering, Counter servi...",Good for kids,Wheelchair accessible seating,"Lunch, Dinner, Solo dining","[{'day': 'Tuesday', 'hours': [{'start': '11:00...",30,0.651
3,4f06908d-e9fa-4f6a-b1ae-c7d8882e2edf,Parroquia San Felipe Neri,"C. Guerrero, 6, Distrito Centro, 29012 Málaga,...",36.725554,-4.421321,"Catholic church,Tourist attraction",4.7,329,,,,,,,Wheelchair accessible entrance,,"[{'day': 'Tuesday', 'hours': [{'start': '17:30...",30,0.716
4,279dfce3-c227-4b58-b4ed-09197327a32a,BÀ BAR,"15 Đ. Nguyễn Cừ, Thảo Điền, Quận 2, Thành phố ...",10.80319,106.728381,"Cocktail bar,Cafe",4.8,182,"Groups, LGBTQ+ friendly, Transgender safespace","Alcohol, Beer, Cocktails, Food, Happy hour dri...",Casual,"Live music, Live performances","Seating, Table service",,,,"[{'day': 'Monday', 'hours': [{'start': '10:00'...",30,0.725


In [17]:
# Display the highest combined score as a quick sanity check.
df.loc[:, "normalize_stars_reviews"].max()

np.float64(0.939)

# Bước 8: Xuất dữ liệu ra file CSV

In [18]:
# Write the final table to CSV without the index column.
# The path is defined once so the success message reports the location that was
# actually written (the old message named a different directory).
output_path = "../data_csv/data_clean_normalize.csv"
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"✅ Đã xuất dữ liệu thành công vào: {output_path}")
print(f"Tổng số dòng: {len(df)}")
print(f"Tổng số cột: {len(df.columns)}")

✅ Đã xuất dữ liệu thành công vào: data/data_clean_normalize.csv
Tổng số dòng: 1454
Tổng số cột: 19
