In [1]:
import pandas as pd
from io import StringIO
import chardet


In [55]:
import ast
import logging
import re
from io import StringIO

import chardet
import numpy as np
import pandas as pd

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Detect the file encoding from the raw bytes so the CSV can be decoded as text.
file_path = 'hotel.csv'
try:
    with open(file_path, 'rb') as file:
        result = chardet.detect(file.read())
    detected = result['encoding']
    logging.info(f"File encoding detected as:{detected}")
    # chardet reports None when it cannot identify the encoding (e.g. an empty
    # file); fall back to UTF-8, the same default used on the error path below.
    encoding = detected or 'utf-8'
    # GB18030 is a superset of GBK and GB2312. Decoding with the wider codec
    # gives identical text for valid GB2312 input and avoids replacement
    # characters when the file actually contains GBK-only characters.
    if encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
except Exception as e:
    logging.error(f"Error detecting file encoding: {e}")
    encoding = 'utf-8'

# Read the CSV as text using the detected encoding (undecodable bytes are
# replaced rather than raising), then let pandas parse the in-memory text.
try:
    with open(file_path, 'r', encoding=encoding, errors='replace') as file:
        lines = file.readlines()
    csv_buffer = StringIO(''.join(lines))
    data = pd.read_csv(csv_buffer)
except Exception as e:
    # Nothing downstream can run without the frame, so log and re-raise.
    logging.error(f"Error loading file: {e}")
    raise
else:
    logging.info("File loaded successfully")

# Text normalization helper
def normalize_text(text):
    """Return ``text`` as a stripped, lower-cased string.

    Missing values (anything ``pd.isna`` reports as NA) are returned
    unchanged; every other value is converted with ``str`` first.
    """
    return text if pd.isna(text) else str(text).strip().lower()

# Normalise every string cell (strip + lowercase); non-string cells pass through.
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1),
# so use map when the installed pandas has it and applymap otherwise.
if hasattr(pd.DataFrame, 'map'):
    data = data.map(lambda x: normalize_text(x) if isinstance(x, str) else x)
else:
    data = data.applymap(lambda x: normalize_text(x) if isinstance(x, str) else x)


# Drop rows that are entirely empty
data.dropna(how='all', inplace=True)


# Price: keep only digits and dots, then parse. Missing prices become the text
# 'null', which the regex reduces to '' and to_numeric coerces back to NaN.
data['price'] = data['price'].fillna('null').astype(str).str.replace(r'[^\d.]', '', regex=True)

# Convert price to numeric and round to the nearest integer (nullable Int64 keeps NaN).
data['price'] = pd.to_numeric(data['price'], errors='coerce')
data['price'] = data['price'].round(0).astype('Int64')


# Comment count: strip the '条点评' suffix and thousands separators.
# astype(str) guards against the column having been parsed as numeric, in which
# case the .str accessor would raise; unparsable text is coerced to NaN below.
data['comment'] = (
    data['comment']
    .astype(str)
    .str.replace('条点评', '', regex=False)
    .str.replace(',', '', regex=False)
)
data['comment'] = pd.to_numeric(data['comment'], errors='coerce')
data['comment'] = data['comment'].round(0).astype('Int64')


# Score: parse as a number and keep one decimal place.
data['score'] = pd.to_numeric(data['score'], errors='coerce')
data['score'] = data['score'].round(1)


# Cleaning the tags column
def clean_tags_simple(tag_str):
    """Turn a stringified Python list of tags into one '，'-joined string.

    Args:
        tag_str: Cell value, expected to look like ``"['a', 'b']"``.

    Returns:
        The non-empty, stripped tags joined with a full-width comma. An empty
        string is returned when the cell is not a string (e.g. NaN), cannot be
        parsed as a Python literal, or does not evaluate to a string or a
        list/tuple/set.
    """
    if not isinstance(tag_str, str):
        # Missing cells (NaN/None) are not parse errors; skip them quietly.
        return ''
    try:
        parsed = ast.literal_eval(tag_str)
    except (ValueError, SyntaxError, TypeError) as e:
        # TypeError covers literals such as a dict with an unhashable key.
        logging.warning(f"Error parsing tags: {e}, returning empty string")
        return ''

    if isinstance(parsed, str):
        # A bare string literal is a single tag, not a sequence of characters.
        parsed = [parsed]
    elif not isinstance(parsed, (list, tuple, set)):
        logging.warning(f"Unexpected tags value of type {type(parsed).__name__}, returning empty string")
        return ''

    # Elements may be non-strings (e.g. numbers); stringify before stripping.
    cleaned = (str(tag).strip() for tag in parsed if tag is not None)
    return '，'.join(tag for tag in cleaned if tag)

# Normalise the tags column to one '，'-joined string per row.
data['tags'] = data['tags'].apply(clean_tags_simple)


# Address: fill missing values with a placeholder, then cut at the first '|'
# into an area part and a distance part. The column key is 'address'; the
# previous key contained stray spaces and raised KeyError.
address = data['address'].fillna('未知区域 | 未知距离')

# partition always yields three columns (before, separator, after), so the
# assignment below works even when no row contains a '|'. Rows without a
# separator get an empty distance string.
address_parts = address.str.partition('|')

# Strip surrounding spaces; the new columns are appended at the end of the frame.
data['area'] = address_parts[0].str.strip()
data['distance'] = address_parts[2].str.strip()

# The original address column is no longer needed.
data = data.drop(columns=['address'])

# Extracting numeric distance and converting to kilometers
def extract_distance(distance):
    """Parse a free-text distance and return it in kilometres, rounded to 0.1.

    Args:
        distance: Text such as ``"500米"``, ``"3.2公里"`` or ``"2km"``; may be NaN/None.

    Returns:
        float: The first number followed by a recognised unit, converted to
        kilometres ('公里', '千米' and 'km' are kilometres, '米' is metres).
        If no number carries a unit, the first number is read as metres.
        ``np.nan`` when the value is missing or contains no number.
    """
    if pd.isna(distance):
        return np.nan
    text = str(distance).lower().strip()
    # Drop thousands separators so "1,200米" reads as 1200, not as 1 and 200.
    text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', text)

    first_number = None
    # '千米' is listed before '米' so kilometres are never mistaken for metres.
    for match in re.finditer(r'(\d+\.?\d*|\.\d+)\s*(公里|千米|km|米)?', text):
        value = float(match.group(1))
        unit = match.group(2)
        if unit in ('公里', '千米', 'km'):
            return round(value, 1)
        if unit == '米':
            return round(value / 1000, 1)
        if first_number is None:
            # Remember the first unit-less number as a fallback.
            first_number = value

    if first_number is not None:
        # No unit anywhere in the text: treat the number as metres.
        return round(first_number / 1000, 1)
    return np.nan

# Convert the free-text distance column to kilometres.
data['distance'] = data['distance'].apply(extract_distance)



# Handling missing values
data['name'] = data['name'].fillna('Unknown Hotel')

# price is a nullable Int64 column: filling it with a non-integer median
# (e.g. 250.5 for an even number of rows) raises TypeError, so round first.
# When every price is missing the median is NA and there is nothing to fill.
price_median = data['price'].median()
if pd.notna(price_median):
    data['price'] = data['price'].fillna(int(round(price_median)))
data['score'] = data['score'].fillna(data['score'].median())
data['distance'] = data['distance'].fillna(data['distance'].median())
# NOTE(review): the tags column is produced by clean_tags_simple, which yields
# '' rather than NaN for unparsable cells, so this fill likely never fires.
# Confirm whether empty strings should become 'Unknown Tags' instead.
data['tags'] = data['tags'].fillna('Unknown Tags')
data['comment'] = data['comment'].fillna(0)


# Delete the condition column
data.drop(columns=['condition'], inplace=True)

# overview
logging.info(f"success！")

# Saving the cleaned data
output_path = 'hotel_cleaned.csv'
try:
    data.to_csv(output_path, index=False, encoding='utf-8')
    logging.info(f"Data cleaning complete! The data has been saved to '{output_path}'")
except Exception as e:
    # A failed save is logged but not re-raised, so the cell still completes.
    logging.error(f"Error saving file:{e}")


2024-12-10 16:52:29,468 - INFO - 文件编码为：GB2312
2024-12-10 16:52:29,476 - INFO - 文件加载成功
  data = data.applymap(lambda x: normalize_text(x) if isinstance(x, str) else x)
2024-12-10 16:52:29,505 - INFO - 数据预处理完成！
2024-12-10 16:52:29,509 - INFO - 清洗完成！数据已保存为 'hotel_cleaned.csv'


In [57]:
# Count the missing values in each column
null_mask = data.isna()
missing_values = null_mask.sum()

# Print the per-column missing counts
print("每列缺失值的数量：")
print(missing_values)

# Express the per-column missing counts as a percentage of all rows
row_count = len(data)
missing_percentage = (null_mask.sum() / row_count) * 100
print("每列缺失值的比例：")
print(missing_percentage)


每列缺失值的数量：
name        0
price       0
score       0
comment     0
tags        0
area        0
distance    0
dtype: int64
每列缺失值的比例：
name        0.0
price       0.0
score       0.0
comment     0.0
tags        0.0
area        0.0
distance    0.0
dtype: float64
