In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
sns.set_theme(style='whitegrid',palette='deep', font='Taipei Sans TC Beta', font_scale=1, rc={'figure.dpi':120})
%matplotlib inline
data = pd.read_csv('台北房價107-112原始.csv')

In [None]:
# Colab 進行matplotlib繪圖時顯示繁體中文
# 下載台北思源黑體並命名taipei_sans_tc_beta.ttf，移至指定路徑
!wget -O TaipeiSansTCBeta-Regular.ttf https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager
fontManager.addfont('TaipeiSansTCBeta-Regular.ttf')
mpl.rc('font', family='Taipei Sans TC Beta')

In [None]:
import matplotlib.pyplot as plt
# Sans-serif font list used by matplotlib (a CJK-capable face).
plt.rcParams.update({'font.sans-serif': ['WenQuanYi Micro Hei']})

In [None]:
# Longitude / latitude are not used as features.
data = data.drop(columns=['經度', '緯度'])

In [None]:
# Columns kept for modelling, in the order used downstream (price last).
model_columns = [
    '屋齡', '面積', '建物類型', '用途', '樓別/樓高', '佈局',
    '電梯', '管理員', '車位數量', '行政區', '交易年份', '價格',
]
data = data[model_columns]

In [None]:
# Large numbers in the file carry thousands separators; strip the commas
# before converting price and area to float.
for numeric_col in ('價格', '面積'):
    data[numeric_col] = data[numeric_col].replace(',', '', regex=True).astype(float)

In [None]:
# Keep the first occurrence of each fully identical row.
data = data[~data.duplicated()]

In [None]:
# Fill missing building type / usage with fixed default categories and drop
# rows that have no layout string.
# Written as one non-inplace chain: `data[col].fillna(..., inplace=True)` works
# on a column selection, which triggers SettingWithCopyWarning (visible in this
# notebook's recorded output) and does not update `data` under Copy-on-Write.
data = (
    data
    .fillna({'建物類型': '住宅大樓(11層含以上有電梯)', '用途': '住家用'})
    .dropna(subset=['佈局'])
)

In [None]:
# Tukey fence on price: keep rows strictly below Q3 + 1.5 * IQR.
price_stats = data['價格'].describe()
q1, q3 = price_stats['25%'], price_stats['75%']
IQR = q3 - q1
upper_lim = q3 + IQR*1.5
lower_lim = q1 - IQR*1.5  # computed for reference; only the upper fence is applied
data = data.loc[data['價格'].lt(upper_lim)]

In [None]:
# Tukey fence on area: keep rows strictly below Q3 + 1.5 * IQR.
area_stats = data['面積'].describe()
q1, q3 = area_stats['25%'], area_stats['75%']
IQR = q3 - q1
upper_lim = q3 + IQR*1.5
lower_lim = q1 - IQR*1.5  # computed for reference; only the upper fence is applied
data = data.loc[data['面積'].lt(upper_lim)]

In [None]:
# Histogram of price, rescaled from the raw unit to units of 10,000 (萬).
fig, ax = plt.subplots()
ax.hist(data['價格'] / 10000, bins=20, color='steelblue', edgecolor='black')
ax.set_title('台北市八大行政區房價分佈(交易時間107/12/01-112/12/01)')
ax.set_xlabel('價格(萬)')
ax.set_ylabel('數量')

In [None]:
# Line plot of price (in 萬) per transaction year, one line per district.
# `df` is a sorted, rescaled copy; `data` itself is left untouched.
df = (
    data
    .sort_values(by='交易年份', ascending=True)
    .assign(**{'價格': lambda frame: frame['價格'] / 10000})
)
sns.relplot(data=df, x='交易年份', y='價格', hue='行政區', kind='line', ci=None)
plt.xlabel('年度')
plt.ylabel('價格(萬)')

In [None]:
# Collapse rare building types into '其他'.
data.loc[data['建物類型'].isin(['工廠','廠辦','農舍','倉庫']), '建物類型'] = '其他'
# Keep only the text before the first '(' (drops the parenthesised detail).
# Raw string: "\(" in a normal literal is an invalid escape sequence, which
# newer Python versions warn about; the pattern value itself is unchanged.
data['建物類型'] = data['建物類型'].str.split(r"\(", expand=True)[0]
# This label uses a full-width '（', so the split above leaves it intact.
data.loc[data['建物類型'].isin(['店面（店舖)']), '建物類型'] = '店面'

In [None]:
# Count of rows per (district, building type) pair, drawn as an annotated heatmap.
cross_table = pd.crosstab(index=data['行政區'], columns=data['建物類型'])
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cross_table, cmap='viridis', annot=True, fmt='d', ax=ax)

In [None]:
# Repair the residential label that carries a stray closing parenthesis.
# isin() compares whole values literally (it is not a regex), so the original
# '住家用\)' only matched text containing an actual backslash; the plain
# '住家用)' form is matched as well, otherwise it would fall into '其他' below.
data.loc[data['用途'].isin(['住家用)', '住家用\\)']), '用途'] = '住家用'
# Anything outside the five kept usage categories becomes '其他'.
data.loc[~data['用途'].isin(['住家用','住商用','商業用','辦公用','其他']), '用途'] = '其他'

In [None]:
# '樓別/樓高' holds "<floor>/<building height>"; split it once and keep both parts.
floor_parts = data['樓別/樓高'].str.split('/', expand=True)
data['樓別'] = floor_parts[0]
data['樓高'] = floor_parts[1]
data.drop(columns=['樓別/樓高'], inplace=True)

In [None]:
# Some rows list several floors separated by commas; keep only the first one
# and report the number of distinct values (NaN included) before and after.
print('處理前一共有', data['樓別'].nunique(dropna=False), '不同的資料')
first_floor = data['樓別'].str.split(',', expand=True)[0]
data['樓別'] = first_floor
print('處理後一共有', data['樓別'].nunique(dropna=False), '不同的資料')

In [None]:
_LOW_FLOORS = ('一層', '二層', '三層', '四層', '五層')
_MID_FLOORS = ('六層', '七層', '八層', '九層', '十層',
               '十一層', '十二層', '十三層', '十四層', '十五層')


def _floor_band(label):
    """Map one floor label to 地下 / 其他 / 全層 / 低層 / 中層 / 高層.

    Checks run in this order; any label that matches none of them is 高層.
    """
    if '地下' in label:
        return '地下'
    if label in ('其他', 'NA'):
        return '其他'
    if '全' in label:
        return '全層'
    if label in _LOW_FLOORS:
        return '低層'
    if label in _MID_FLOORS:
        return '中層'
    return '高層'


# Non-floor labels (including the bare '地下' and the empty string) become '其他'.
non_floor_labels = ['見其他登記事項','屋頂突出物','夾層','地下','','騎樓']
data.loc[data['樓別'].isin(non_floor_labels), '樓別'] = '其他'
# Missing values get the 'NA' sentinel so the banding function only sees strings.
data['樓別'] = data['樓別'].fillna('NA').apply(_floor_band)

###可以跑

In [None]:
# Display how many rows fall into each floor band.
data['樓別'].value_counts()

In [None]:
def _height_band(label):
    """Map a building-height label to 低層 (1-5), 中層 (6-15) or 高層 (anything else)."""
    if label in ('一層', '二層', '三層', '四層', '五層'):
        return '低層'
    if label in ('六層', '七層', '八層', '九層', '十層',
                 '十一層', '十二層', '十三層', '十四層', '十五層'):
        return '中層'
    return '高層'


# The '(空白)' placeholder is replaced by '十五層' (imputed with the mode).
data.loc[data['樓高'] == '(空白)', '樓高'] = '十五層'
data['樓高'] = data['樓高'].apply(_height_band)

In [None]:
# Break the layout string apart at the 房 / 廳 / 衛 markers, splitting each
# level once and reusing the pieces.
by_room = data['佈局'].str.split('房', expand=True)
by_hall = by_room[1].str.split('廳', expand=True)
data['房'] = by_room[0]
data['廳'] = by_hall[0]
data['衛'] = by_hall[1].str.split('衛', expand=True)[0]
data.drop(columns='佈局', inplace=True)

In [None]:
# Pull the first run of digits out of each layout piece and store it as float
# (pieces without digits become NaN).
# Raw string: '\d' in a normal literal is an invalid escape sequence, which
# newer Python versions warn about; the regex itself is unchanged.
for layout_col in ('房', '廳', '衛'):
    digits = data[layout_col].str.extract(r'(\d+)', expand=False)
    data[layout_col] = pd.to_numeric(digits, errors='coerce').astype(float)

In [None]:
# Replace NaN room / hall / bathroom counts with 0.
# Assigned back instead of `data[col].fillna(0, inplace=True)`: the in-place
# form mutates a column selection, which warns in current pandas and leaves
# `data` unchanged under Copy-on-Write.
layout_cols = ['房', '廳', '衛']
data[layout_cols] = data[layout_cols].fillna(0)

In [None]:
# Print column dtypes and non-null counts as a check before encoding.
data.info()

In [None]:
# One-hot encode the nominal columns.
nominal_cols = ['建物類型', '用途', '行政區']
data_encoded = pd.get_dummies(data, columns=nominal_cols)

# Integer codes for the remaining categorical columns; values missing from a
# mapping become NaN.
yes_no = {'無': 0, '有': 1}
integer_codes = {
    '樓別': {'地下': 0, '其他': 1, '低層': 2, '中層': 3, '高層': 4, '全層': 5},
    '樓高': {'低層': 0, '中層': 1, '高層': 2},
    '電梯': yes_no,
    '管理員': yes_no,
}
for coded_col, mapping in integer_codes.items():
    data_encoded[coded_col] = data_encoded[coded_col].map(mapping)

In [None]:
# Rows whose elevator flag did not map to 0/1 are treated as "no elevator".
data_encoded = data_encoded.fillna({'電梯': 0})

In [None]:
# Remove the '其他' dummy of building type and of usage.
data_encoded = data_encoded.drop(columns=['建物類型_其他', '用途_其他'])

In [None]:
# Separate the rows with a missing building age from the rows that have one.
# .copy() makes both frames independent of data_encoded, so the in-place edits
# applied to Age_Na further down no longer raise SettingWithCopyWarning
# (those warnings are visible in this notebook's recorded output).
age_missing = data_encoded["屋齡"].isnull()
Age_Na = data_encoded[age_missing].copy()
Age_not_Na = data_encoded[~age_missing].copy()

In [None]:
# Target: building age. Features: every other encoded column, which at this
# point still includes 價格.
age_target = '屋齡'
X = Age_not_Na.drop(columns=age_target)
y = Age_not_Na[age_target]

In [None]:
# Any NaN left in the age-model features is replaced by 0.
X = X.fillna(0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# 80/20 hold-out split with a fixed seed, then fit the age-imputation forest.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
rfModel_age = RandomForestRegressor(n_estimators=100, random_state=42)
rfModel_age.fit(X=X_train, y=y_train)

In [None]:
# 預測測試集
from sklearn.metrics import r2_score
# R^2 of the age model on the held-out 20%.
y_pred = rfModel_age.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R方得分:', r2)

R方得分: 0.783846878336012


In [None]:
# Zero-fill the rows whose age is missing (the all-NaN 屋齡 column included).
# Rebinding to the returned frame avoids the SettingWithCopyWarning that the
# in-place call raises when Age_Na is a slice of data_encoded.
Age_Na = Age_Na.fillna(0)

In [None]:
# Predict the missing building ages.
# drop() returns a fresh frame, so adding the prediction column below does not
# write into a slice of data_encoded (the in-place version raised
# SettingWithCopyWarning, visible in this notebook's recorded output).
Age_Na = Age_Na.drop(columns='屋齡')
if not Age_Na.empty:  # predict() rejects an empty frame when no age is missing
    Age_Na_value = rfModel_age.predict(Age_Na)
    Age_Na['屋齡'] = Age_Na_value
# Merge both parts again and restore the original row order.
# NOTE(review): the rows in Age_Na were zero-filled in an earlier cell while
# Age_not_Na was not (only its derived X was) - confirm that this asymmetry in
# the remaining NaN handling is intended.
data = pd.concat([Age_not_Na, Age_Na], axis=0).sort_index()

In [None]:
# Replace NaN room / hall / bathroom counts with 0 in the merged frame.
# Assigned back instead of `data[col].fillna(0, inplace=True)`: the in-place
# form mutates a column selection, which warns in current pandas and leaves
# `data` unchanged under Copy-on-Write.
layout_cols = ['房', '廳', '衛']
data[layout_cols] = data[layout_cols].fillna(0)

In [None]:
# Price model: target is 價格, features are all remaining columns.
price_target = '價格'
X = data.drop(columns=price_target)
y = data[price_target]

# 80/20 hold-out split and forest, both with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
rfModel = RandomForestRegressor(n_estimators=100, random_state=42)
rfModel.fit(X=X_train, y=y_train)

# R^2 on the held-out 20%.
y_pred = rfModel.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R方得分:', r2)

R方得分: 0.8053031070492032


In [None]:
from sklearn.metrics import mean_squared_error
# Hold-out RMSE, printed rounded to whole units of 10,000 (萬).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse_in_wan = round(rmse/10000)
print("RMSE: ", rmse_in_wan, '萬')

RMSE:  420 萬


In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R^2 over the full feature table, reported as
# mean (+/- two standard deviations).
rfModel_cv = RandomForestRegressor(n_estimators=100, random_state=42)
scores = cross_val_score(estimator=rfModel_cv, X=X, y=y, scoring='r2', cv=5)
cv_mean, cv_spread = scores.mean(), scores.std() * 2
print("隨機森林迴歸 R2 score: %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

In [None]:
# Display the feature columns (and their order) that the price model uses.
X.columns

In [None]:
# NOTE(review): `test` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh Restart & Run All. Presumably it was a
# hand-built one-row feature frame from a removed cell - confirm and restore it.
price_pred = rfModel.predict(test)
# The %-formatting below requires price_pred to hold exactly one prediction.
print("預估價格為: %0.2f萬" % (price_pred/10000))
# Range shown is the prediction minus / plus `rmse` (set in the RMSE cell), in 萬.
print("誤差範圍介於: %0.2f 萬 到 %0.2f 萬元之間" % (price_pred/10000-rmse/10000,price_pred/10000+rmse/10000))

###測試

In [None]:
# Load the listings to score with the price model.
# NOTE(review): presumably already encoded with the same columns as X - verify.
test_data = pd.read_csv('591_data new -test.csv')

In [None]:
# Score every listing and save the table with the prediction appended.
# Only the training feature columns (X, in training order) are passed to the
# model, so the cell can be re-run after 'Predicted_Price' has been added and
# extra columns in the CSV do not break predict().
price_pred = rfModel.predict(test_data[X.columns])
test_data['Predicted_Price'] = price_pred
test_data.to_csv('output_with_predictions107-112.csv', index=False)

###全部

In [None]:
# Colab 進行matplotlib繪圖時顯示繁體中文
# 下載台北思源黑體並命名taipei_sans_tc_beta.ttf，移至指定路徑
!wget -O TaipeiSansTCBeta-Regular.ttf https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager
fontManager.addfont('TaipeiSansTCBeta-Regular.ttf')
mpl.rc('font', family='Taipei Sans TC Beta')

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
sns.set_theme(style='whitegrid',palette='deep', font='Taipei Sans TC Beta', font_scale=1, rc={'figure.dpi':120})
%matplotlib inline
data = pd.read_csv('台北房價107-112原始.csv')
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['WenQuanYi Micro Hei']
# Drop the coordinates and keep the modelling columns.
data = data.drop(['經度', '緯度'], axis=1)
data = data[['屋齡', '面積', '建物類型', '用途', '樓別/樓高', '佈局', '電梯', '管理員',
'車位數量', '行政區', '交易年份','價格']]
# Large numbers carry thousands separators; strip the commas before casting.
data['價格'] = data['價格'].replace(',', '', regex=True).astype(float)
data['面積'] = data['面積'].replace(',', '', regex=True).astype(float)
data = data.drop_duplicates()
# Fill missing building type / usage with fixed defaults and drop rows without
# a layout. One non-inplace chain replaces `data[col].fillna(..., inplace=True)`
# and `data.dropna(..., inplace=True)`, which raised the SettingWithCopyWarning
# recorded in this cell's output and do not update `data` under Copy-on-Write.
data = (
    data
    .fillna({'建物類型': '住宅大樓(11層含以上有電梯)', '用途': '住家用'})
    .dropna(subset=['佈局'])
)
# Tukey upper fence on price (lower_lim is computed but not applied).
s = data['價格'].describe()
IQR = s['75%'] - s['25%']
upper_lim = s['75%'] + IQR*1.5
lower_lim = s['25%'] - IQR*1.5
data = data[data['價格'] < upper_lim]
# Same upper fence on area.
s = data['面積'].describe()
IQR = s['75%'] - s['25%']
upper_lim = s['75%'] + IQR*1.5
lower_lim = s['25%'] - IQR*1.5
data = data[data['面積'] < upper_lim]
# Collapse rare building types into '其他'.
data.loc[data['建物類型'].isin(['工廠','廠辦','農舍','倉庫']), '建物類型'] = '其他'
# Keep only the text before the first '('. Raw string: "\(" in a normal literal
# is an invalid escape sequence that newer Python versions warn about.
data['建物類型'] = data['建物類型'].str.split(r"\(", expand=True)[0]
# This label uses a full-width '（', so the split above leaves it intact.
data.loc[data['建物類型'].isin(['店面（店舖)']), '建物類型'] = '店面'
# isin() compares whole values literally (it is not a regex), so the original
# '住家用\)' only matched text containing an actual backslash; the plain
# '住家用)' form is matched as well, otherwise it would fall into '其他' below.
data.loc[data['用途'].isin(['住家用)', '住家用\\)']), '用途'] = '住家用'
# Anything outside the five kept usage categories becomes '其他'.
data.loc[~data['用途'].isin(['住家用','住商用','商業用','辦公用','其他']), '用途'] = '其他'
_LOW_FLOORS = ('一層', '二層', '三層', '四層', '五層')
_MID_FLOORS = ('六層', '七層', '八層', '九層', '十層',
               '十一層', '十二層', '十三層', '十四層', '十五層')


def _floor_band(label):
    """Map one floor label to 地下 / 其他 / 全層 / 低層 / 中層 / 高層 (checked in that order)."""
    if '地下' in label:
        return '地下'
    if label in ('其他', 'NA'):
        return '其他'
    if '全' in label:
        return '全層'
    if label in _LOW_FLOORS:
        return '低層'
    if label in _MID_FLOORS:
        return '中層'
    return '高層'


def _height_band(label):
    """Map a building-height label to 低層, 中層 or 高層 (anything unlisted)."""
    if label in _LOW_FLOORS:
        return '低層'
    if label in _MID_FLOORS:
        return '中層'
    return '高層'


# '樓別/樓高' holds "<floor>/<building height>"; split once and keep both parts.
floor_parts = data['樓別/樓高'].str.split('/', expand=True)
data['樓別'] = floor_parts[0]
data['樓高'] = floor_parts[1]
data.drop(columns=['樓別/樓高'], inplace=True)
print('處理前一共有', data['樓別'].nunique(dropna=False), '不同的資料')
# Multi-floor rows: keep the first comma-separated entry.
data['樓別'] = data['樓別'].str.split(',', expand=True)[0]
# Non-floor labels (including the bare '地下' and the empty string) become '其他'.
non_floor_labels = ['見其他登記事項','屋頂突出物','夾層','地下','','騎樓']
data.loc[data['樓別'].isin(non_floor_labels), '樓別'] = '其他'
# 'NA' sentinel for missing values so the banding function only sees strings.
data['樓別'] = data['樓別'].fillna('NA').apply(_floor_band)
# The '(空白)' placeholder is replaced by '十五層' (imputed with the mode).
data.loc[data['樓高'] == '(空白)', '樓高'] = '十五層'
data['樓高'] = data['樓高'].apply(_height_band)
# Break the layout string apart at the 房 / 廳 / 衛 markers.
by_room = data['佈局'].str.split('房', expand=True)
by_hall = by_room[1].str.split('廳', expand=True)
data['房'] = by_room[0]
data['廳'] = by_hall[0]
data['衛'] = by_hall[1].str.split('衛', expand=True)[0]
data = data.drop(columns='佈局')

layout_cols = ['房', '廳', '衛']
# First run of digits in each piece, as float. Raw string: '\d' in a normal
# literal is an invalid escape sequence that newer Python versions warn about.
for layout_col in layout_cols:
    digits = data[layout_col].str.extract(r'(\d+)', expand=False)
    data[layout_col] = pd.to_numeric(digits, errors='coerce').astype(float)
# Pieces without digits count as 0. Assigned back rather than filled in place
# on a column selection, which warns in current pandas and leaves `data`
# unchanged under Copy-on-Write.
data[layout_cols] = data[layout_cols].fillna(0)

# One-hot encode the nominal columns.
nominal_cols = ['建物類型', '用途', '行政區']
data_encoded = pd.get_dummies(data, columns=nominal_cols)

# Integer codes for the remaining categorical columns; values missing from a
# mapping become NaN.
yes_no = {'無': 0, '有': 1}
integer_codes = {
    '樓別': {'地下': 0, '其他': 1, '低層': 2, '中層': 3, '高層': 4, '全層': 5},
    '樓高': {'低層': 0, '中層': 1, '高層': 2},
    '電梯': yes_no,
    '管理員': yes_no,
}
for coded_col, mapping in integer_codes.items():
    data_encoded[coded_col] = data_encoded[coded_col].map(mapping)

# Unmapped elevator flags count as "no elevator"; then drop the '其他' dummies.
data_encoded = data_encoded.fillna({'電梯': 0})
data_encoded = data_encoded.drop(columns=['建物類型_其他', '用途_其他'])
# Separate the rows with a missing building age from the rows that have one.
# .copy() detaches both frames from data_encoded; without it the edits to
# Age_Na below raised the SettingWithCopyWarning recorded in this cell's output.
Age_Na = data_encoded[data_encoded["屋齡"].isnull()].copy()
Age_not_Na = data_encoded[data_encoded["屋齡"].notnull()].copy()
# Target is the age; features are every other encoded column (價格 included),
# with remaining NaN set to 0.
y = Age_not_Na['屋齡']
X = Age_not_Na.drop('屋齡', axis=1).fillna(0)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
rfModel_age = RandomForestRegressor(n_estimators=100,random_state=42)
rfModel_age.fit(X_train, y_train)
# Score the age model on the held-out 20%.
from sklearn.metrics import r2_score
y_pred = rfModel_age.predict(X_test)
r2 = r2_score(y_test, y_pred)
print('R方得分:', r2)
# Predict the missing ages from zero-filled features. New frames are built
# instead of mutating Age_Na in place.
Age_Na = Age_Na.drop(columns='屋齡').fillna(0)
if not Age_Na.empty:  # predict() rejects an empty frame when no age is missing
    Age_Na_value = rfModel_age.predict(Age_Na)
    Age_Na['屋齡'] = Age_Na_value
# Merge both parts again and restore the original row order.
# NOTE(review): Age_Na rows are zero-filled while Age_not_Na rows keep any
# remaining NaN - confirm that this asymmetry is intended.
data = pd.concat([Age_not_Na, Age_Na], axis=0).sort_index()
# Replace NaN room / hall / bathroom counts with 0 in the merged frame.
# Assigned back instead of `data[col].fillna(0, inplace=True)`: the in-place
# form mutates a column selection, which warns in current pandas and leaves
# `data` unchanged under Copy-on-Write.
layout_cols = ['房', '廳', '衛']
data[layout_cols] = data[layout_cols].fillna(0)
# Price model: target is 價格, features are all remaining columns.
price_target = '價格'
X = data.drop(columns=price_target)
y = data[price_target]

# 80/20 hold-out split and forest, both with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
rfModel = RandomForestRegressor(n_estimators=100, random_state=42)
rfModel.fit(X=X_train, y=y_train)

# R^2 on the held-out 20%.
y_pred = rfModel.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R方得分:', r2)

# Hold-out RMSE, printed rounded to whole units of 10,000 (萬).
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse_in_wan = round(rmse/10000)
print("RMSE: ", rmse_in_wan, '萬')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['建物類型'].fillna('住宅大樓(11層含以上有電梯)',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['用途'].fillna('住家用',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['佈局'],inplace=True)


處理前一共有 314 不同的資料
R方得分: 0.7783366528478761


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Age_Na.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Age_Na.drop('屋齡',axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Age_Na['屋齡'] = Age_Na_value


R方得分: 0.8171055241154878
RMSE:  446 萬


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Build the target / feature split and the 80/20 hold-out set.
y = data['價格']
X = data.drop('價格', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator for the search. The seed is fixed so the selected parameters
# and the reported score are reproducible between runs.
# Note: GridSearchCV fits clones, so `rfModel` itself stays unfitted after this cell.
rfModel = RandomForestRegressor(random_state=42)

# Hyper-parameter grid: 4 * 4 * 3 * 3 = 144 combinations.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search scored by R^2; n_jobs=-1 spreads the
# 720 fits over all available cores without changing the results.
grid_search = GridSearchCV(estimator=rfModel, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best parameter combination.
print("最佳參數組合:", grid_search.best_params_)

# best_estimator_ is the model refit on the whole training split with those parameters.
best_rfModel = grid_search.best_estimator_

# Evaluate the tuned model on the hold-out set.
y_pred = best_rfModel.predict(X_test)

# R^2 of the tuned model.
r2 = r2_score(y_test, y_pred)
print('最佳模型的R方得分:', r2)


最佳參數組合: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
最佳模型的R方得分: 0.8190383506970614
