# 北京房源二手房房价预测

## 项目背景
提高成单量  
促成售卖，促成租赁

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [2]:
house = pd.read_csv('./二手房数据/house.csv')
community = pd.read_csv('./二手房数据/community_describe.csv')

## 数据融合

In [3]:
data = house.merge(community, on='community', how='left')

## 数据提取

In [4]:
#当前层
data['当前层'] = data.years.str.extract('(\w*?)\(')
#总楼层
data['总楼层'] = data.years.str.extract('共(\d+)层')
#建成年份
data['建成年份'] = data.years.str.extract('\)(\d+)年')
#建筑结构
data['建筑结构'] = data.years.str.extract('建(\w+)')

del data['years']
del data['floor']

### 在housetype中删除车位和别墅的数据。提取客厅数，房间数，卫生间数

In [5]:
data = data[~data.housetype.str.contains('别|车')]

#卧室
data['卧室'] = data.housetype.str.extract('(\d+)室|房')
#客厅
data['客厅'] = data.housetype.str.extract('(\d+)厅')
#卫生间
data['卫生间'] = data.housetype.str.extract('(\d+)卫')

del data['housetype']

### 去掉square中的平米

In [6]:
data['square'] = data.square.str.replace('平米', '')

### 在taxtype中提取地铁距离，房本类型

In [7]:
data['地铁距离'] = data.taxtype.str.extract('站(\d+)')
data['房本类型'] = data.taxtype.str.extract('满(\w+)年')

del data['taxtype']

### 在tagList中提取地铁站

In [8]:
data['地铁站'] = data.tagList.str.extract('线(\w+)')

del data['tagList']

### 删除没用的列

In [9]:
data = data.drop(['index_x', 'title', 'totalPrice', 'followInfo', 'index_y', 'id', 'onsale'], axis=1)

### 将数值类型字符串转为数值类型

In [10]:
data['square'] = data['square'].astype('float64')
data['总楼层'] = data['总楼层'].astype('float64')
data['建成年份'] = data['建成年份'].astype('float64')
data['卧室'] = data['卧室'].astype('float64')
data['客厅'] = data['客厅'].astype('float64')
data['卫生间'] = data['卫生间'].astype('float64')
data['地铁距离'] = data['地铁距离'].astype('float64')

### 处理异常值

In [11]:
data = data[data.unitPrice >= 30000]

data = data[data['总楼层'] < 40]

data = data[data['卧室'] < 5]

data = data[data['客厅'] <= 2]

del data['卫生间']

In [12]:
data = data[data['当前层'] != '地下室']

data = data[data['建筑结构'] != '平房']

### 缺失值处理

In [13]:
#district缺失值删掉

In [14]:
data = data[~data.district.isna()]

In [15]:
#地铁距离的缺失值用最大值填充
data['地铁距离'][data.地铁距离.isna()] = data.地铁距离.max()

In [16]:
# 房本类型缺失值，缺少认为不满2年用1填充
data['房本类型'][data.房本类型.isna()] = 1

In [17]:
# 地铁站填充无
data['地铁站'][data.地铁站.isna()] = '无'

In [18]:
#建成年份缺失值，使用同一个小区的众数年份进行填充，填充失败的直接删除
def getyears(item):
    val = data.建成年份[data.community == item].mode()
    if val.size > 0:
        return val[0]
    else:
        return np.nan


data.建成年份[data.建成年份.isna()] = data.community[data.建成年份.isna()].apply(getyears)
#填充失败直接删掉
data.dropna(subset=['建成年份'], inplace=True)

In [19]:
#建筑结构缺失值，使用同一个小区的众数年份进行填充，填充失败的直接删除
def getyears(item):
    val = data.建筑结构[data.community == item].mode()
    if val.size > 0:
        return val[0]
    else:
        return np.nan


data.建筑结构[data.建筑结构.isna()] = data.community[data.建筑结构.isna()].apply(getyears)
#填充失败直接删掉
data.dropna(subset=['建筑结构'], inplace=True)

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14158 entries, 0 to 16115
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   community  14158 non-null  object 
 1   square     14158 non-null  float64
 2   unitPrice  14158 non-null  int64  
 3   bizcircle  14158 non-null  object 
 4   当前层        14158 non-null  object 
 5   总楼层        14158 non-null  float64
 6   建成年份       14158 non-null  float64
 7   建筑结构       14158 non-null  object 
 8   卧室         14158 non-null  float64
 9   客厅         14158 non-null  float64
 10  地铁距离       14158 non-null  float64
 11  房本类型       14158 non-null  object 
 12  地铁站        14158 non-null  object 
 13  东城         14158 non-null  uint8  
 14  朝阳         14158 non-null  uint8  
 15  海淀         14158 non-null  uint8  
 16  西城         14158 non-null  uint8  
dtypes: float64(6), int64(1), object(6), uint8(4)
memory usage: 1.6+ MB


### 编码

In [24]:
dis_onehot = pd.get_dummies(data.district)
dis_onehot

Unnamed: 0,东城,朝阳,海淀,西城
0,0,1,0,0
2,0,0,1,0
3,0,1,0,0
5,0,0,0,1
9,0,0,1,0
...,...,...,...,...
16109,0,1,0,0
16110,1,0,0,0
16112,0,1,0,0
16114,0,0,1,0


In [25]:
data = pd.concat((data, dis_onehot), axis=1)

In [27]:
del data['district']

In [31]:
# 当前层
data.当前层.unique()

array(['底层', '低楼层', '中楼层', '顶层', '高楼层'], dtype=object)

In [32]:
dic_floor = {'底层': 0, '低楼层': 1, '中楼层': 2, '顶层': 3, '高楼层': 4}

In [33]:
data['当前层'] = data.当前层.map(dic_floor)

In [None]:
#建筑结构

In [34]:
data.建筑结构.unique()

array(['板塔结合', '塔楼', '板楼'], dtype=object)

In [35]:
dic_build = {'板塔结合': 0, '塔楼': 1, '板楼': 2}

In [36]:
data['建筑结构'] = data.建筑结构.map(dic_build)

In [37]:
data.房本类型.unique()

array(['五', '两', 1], dtype=object)

In [38]:
dic_housebook = {'五': 0, '两': 1, '1': 2}

In [39]:
data['房本类型'] = data.房本类型.map(dic_housebook)

In [46]:
#街道
dic_biz = dict(data.bizcircle.value_counts())

In [47]:
data['bizcircle'] = data.bizcircle.map(dic_biz)

In [50]:
#地铁站
data.地铁站.nunique()

158

In [52]:
dic_sub = dict(data.groupby('地铁站')['unitPrice'].mean())

In [53]:
data['地铁站'] = data.地铁站.map(dic_sub)

In [54]:
dic_com = dict(data.groupby('community')['unitPrice'].mean())

In [55]:
data['community'] = data.community.map(dic_com)

In [56]:
data

Unnamed: 0,community,square,unitPrice,bizcircle,当前层,总楼层,建成年份,建筑结构,卧室,客厅,地铁距离,房本类型,地铁站,东城,朝阳,海淀,西城
0,82448.333333,298.79,86951,845,0,22.0,2010.0,0,4.0,1.0,680.0,0.0,83488.901639,0,1,0,0
2,69571.800000,177.36,67659,157,1,24.0,1999.0,1,3.0,2.0,383.0,0.0,92364.456250,0,0,1,0
3,98762.580645,245.52,67205,158,2,28.0,2007.0,1,4.0,2.0,1153.0,0.0,82526.081146,0,1,0,0
5,82289.500000,130.03,89211,23,1,22.0,2002.0,1,3.0,1.0,397.0,0.0,88249.428571,0,0,0,1
9,86705.250000,256.11,76140,99,2,14.0,2000.0,0,4.0,2.0,887.0,0.0,99153.531915,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16109,78664.000000,98.92,79863,221,1,18.0,2004.0,1,2.0,2.0,1199.0,0.0,74791.343572,0,1,0,0
16110,68721.666667,82.24,70526,77,2,22.0,1999.0,1,2.0,1.0,749.0,0.0,74791.343572,1,0,0,0
16112,64601.828571,153.28,55455,845,4,22.0,2000.0,1,3.0,2.0,842.0,0.0,69263.933628,0,1,0,0
16114,85661.800000,63.10,95088,140,2,16.0,1992.0,1,2.0,2.0,880.0,0.0,108395.860248,0,0,1,0


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14158 entries, 0 to 16115
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   community  14158 non-null  float64
 1   square     14158 non-null  float64
 2   unitPrice  14158 non-null  int64  
 3   bizcircle  14158 non-null  int64  
 4   当前层        14158 non-null  int64  
 5   总楼层        14158 non-null  float64
 6   建成年份       14158 non-null  float64
 7   建筑结构       14158 non-null  int64  
 8   卧室         14158 non-null  float64
 9   客厅         14158 non-null  float64
 10  地铁距离       14158 non-null  float64
 11  房本类型       11872 non-null  float64
 12  地铁站        14158 non-null  float64
 13  东城         14158 non-null  uint8  
 14  朝阳         14158 non-null  uint8  
 15  海淀         14158 non-null  uint8  
 16  西城         14158 non-null  uint8  
dtypes: float64(9), int64(4), uint8(4)
memory usage: 1.6 MB


In [61]:
#调整数据顺序
data['y'] = data['unitPrice']
del data['unitPrice']

In [62]:
data

Unnamed: 0,community,square,bizcircle,当前层,总楼层,建成年份,建筑结构,卧室,客厅,地铁距离,房本类型,地铁站,东城,朝阳,海淀,西城,y
0,82448.333333,298.79,845,0,22.0,2010.0,0,4.0,1.0,680.0,0.0,83488.901639,0,1,0,0,86951
2,69571.800000,177.36,157,1,24.0,1999.0,1,3.0,2.0,383.0,0.0,92364.456250,0,0,1,0,67659
3,98762.580645,245.52,158,2,28.0,2007.0,1,4.0,2.0,1153.0,0.0,82526.081146,0,1,0,0,67205
5,82289.500000,130.03,23,1,22.0,2002.0,1,3.0,1.0,397.0,0.0,88249.428571,0,0,0,1,89211
9,86705.250000,256.11,99,2,14.0,2000.0,0,4.0,2.0,887.0,0.0,99153.531915,0,0,1,0,76140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16109,78664.000000,98.92,221,1,18.0,2004.0,1,2.0,2.0,1199.0,0.0,74791.343572,0,1,0,0,79863
16110,68721.666667,82.24,77,2,22.0,1999.0,1,2.0,1.0,749.0,0.0,74791.343572,1,0,0,0,70526
16112,64601.828571,153.28,845,4,22.0,2000.0,1,3.0,2.0,842.0,0.0,69263.933628,0,1,0,0,55455
16114,85661.800000,63.10,140,2,16.0,1992.0,1,2.0,2.0,880.0,0.0,108395.860248,0,0,1,0,95088


In [None]:
## 建模

In [63]:
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [65]:
import sklearn.model_selection as ms

In [66]:
train_x, test_x, train_y, test_y = ms.train_test_split(x, y, test_size=0.1, random_state=7)

In [20]:
# 回归模型，多元线性回归试试

In [20]:
def select_model(name, model):
    pass


dic_model = {'name': 模型}

for name, obj in dic_model.item():
    select_model(name, obj)