In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd

In [2]:
# Locations of the raw input CSV and the processed output CSV.
# Each path can be overridden with an environment variable (HOUSE_INPUT_FILE /
# HOUSE_OUTPUT_FILE) so the notebook can run on another machine without editing
# this cell. When the variables are unset, the original paths are used unchanged.
input_file = os.environ.get(
    "HOUSE_INPUT_FILE", r"C:\Users\ghc_l\Desktop\43final\housedata.csv"
)
output_file = os.environ.get(
    "HOUSE_OUTPUT_FILE", r"C:\Users\ghc_l\Desktop\43final\datause.csv"
)

In [3]:
# Read the raw data without treating any row as a header, then split the text
# in the first column on commas into separate columns.
# NOTE(review): this presumably relies on each CSV line being parsed as a single
# field (e.g. fully quoted rows) — confirm against the input file.
data = (
    pd.read_csv(input_file, header=None, encoding='utf-8')[0]
    .str.split(",", expand=True)
)

In [4]:
# Give the eight split columns their names, then discard the first row.
# NOTE(review): the first row is presumably the file's own header line, kept as
# data because the file was read with header=None — confirm.
data = data.set_axis(
    [
        "Community_Name",
        "price",
        "area",
        "layout",
        "site",
        "total_height",
        "toward",
        "built_year",
    ],
    axis=1,
).iloc[1:]

In [5]:
# Lookup tables that turn the categorical text columns into numeric codes.

# Floor-position label -> code (4 down to 0).
site_map = {
    "顶": 4,
    "高": 3,
    "中": 2,
    "低": 1,
    "底": 0,
}

# Orientation label -> code, numbered 1..10 in the order listed.
toward_map = {
    label: code
    for code, label in enumerate(
        ["东", "南", "西", "北", "东南", "西南", "东北", "西北", "南北", "东西"],
        start=1,
    )
}

# Layout label "<rooms>室<halls>厅" -> rooms * 100 + halls, for 0..10 of each.
layout_map = {
    f"{rooms}室{halls}厅": 100 * rooms + halls
    for rooms in range(11)
    for halls in range(11)
}

In [6]:
# Replace the text in the three categorical columns with their numeric codes.
# Series.map yields NaN for any value that has no entry in the lookup table.
data = data.assign(
    layout=data["layout"].map(layout_map),
    site=data["site"].map(site_map),
    toward=data["toward"].map(toward_map),
)

In [7]:
# Convert the numeric columns from text to numbers; values that cannot be
# parsed become NaN (errors='coerce').
data = data.assign(
    **{
        column: pd.to_numeric(data[column], errors='coerce')
        for column in ("price", "area", "total_height")
    }
)

In [8]:
# Derived columns. The keyword arguments to assign are evaluated in order, so
# "age" sees the already-converted "built_year".
current_year = datetime.now().year  # taken from the system clock at run time
data = data.assign(
    # Parse the build year as a date and keep only the year; unparseable -> NaN.
    built_year=lambda frame: pd.to_datetime(frame["built_year"], errors='coerce').dt.year,
    # Difference between the current year and the build year.
    age=lambda frame: current_year - frame["built_year"],
    # Price divided by area.
    price_per_sqm=lambda frame: frame["price"] / frame["area"],
)

In [9]:
# Write the processed table to the output CSV, omitting the index column.
# The file is encoded as 'utf-8-sig'.
data.to_csv(
    output_file,
    encoding='utf-8-sig',
    index=False,
)

In [10]:
# Show a preview and a structural summary of the processed data.
print("\n处理后的数据预览：")
print(data.head())
print("\n数据基本信息：")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()


处理后的数据预览：
  Community_Name  price   area  layout  site  total_height  toward  \
1         中海寰宇天下  279.0   86.0     302     2          22.0       2   
2         华发四季峰景  230.0   89.0     302     3          32.0       9   
3         恒荣城市溪谷  298.0  118.0     402     1          37.0       9   
4         正方南湾首府  235.0   99.0     402     2          32.0       9   
5           中安广场   97.0   61.0     102     2          26.0       2   

   built_year  age  price_per_sqm  
1        2020    4       3.244186  
2        2020    4       2.584270  
3        2019    5       2.525424  
4        2013   11       2.373737  
5        2019    5       1.590164  

数据基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2541 entries, 1 to 2541
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Community_Name  2541 non-null   object 
 1   price           2541 non-null   float64
 2   area            2541 non-null   float64
 3   layout