In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_excel('./Data/datapro1.xlsx')

In [4]:
# 1. 选择符合条件的数据点
df1 = df[df['专利价值'] < 5000]
df2=df[df['专利价值'] > 5000]
# 2. 从符合条件的数据点中随机选择，这里选择 10 个数据点作为示例
df3 = df1.sample(n=10000, random_state=random.seed(42))
df = pd.concat([df2, df3], axis=0, ignore_index=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19065 entries, 0 to 19064
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   标题          19065 non-null  object 
 1   产业链位置       19065 non-null  object 
 2   一级技术分支      19065 non-null  object 
 3   专利类型        19065 non-null  object 
 4   公开国别        19065 non-null  object 
 5   公开(公告)日     19065 non-null  int64  
 6   权利要求数量      19065 non-null  int64  
 7   文献页数        19065 non-null  int64  
 8   申请人省市代码     19065 non-null  object 
 9   3年内被引用次数    19065 non-null  int64  
 10  5年内被引用次数    19065 non-null  int64  
 11  引用专利数量      19065 non-null  int64  
 12  被引用专利数量     19065 non-null  int64  
 13  专利有效性       19065 non-null  object 
 14  诉讼案件数       19065 non-null  int64  
 15  专利价值        19065 non-null  int64  
 16  专利价值（对数变换）  19065 non-null  float64
dtypes: float64(1), int64(9), object(7)
memory usage: 2.5+ MB


In [6]:
cat_columns = [
 '产业链位置',
 '一级技术分支',
 '专利类型',
 '申请人省市代码',
 '专利有效性',
"公开(公告)日"
]

In [7]:
ohEncoder = OneHotEncoder(drop='first')
cat_features = ohEncoder.fit_transform(df[cat_columns]).toarray()
cat_features

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 1., 0., 0.]])

In [8]:
num_columns = [ 
 '3年内被引用次数',
 '5年内被引用次数',
 '引用专利数量',
 '被引用专利数量',
 '权利要求数量',
 '文献页数',
 '诉讼案件数'
]

In [9]:
stdScaler = StandardScaler()
num_features = stdScaler.fit_transform(df[num_columns])
num_features

array([[-0.30455925, -0.30010572,  0.96333486, ...,  0.41077646,
         0.39072592, -0.02921079],
       [-0.30455925, -0.30010572,  3.27756395, ..., -1.7237992 ,
         0.17595283, -0.02921079],
       [-0.30455925, -0.30010572,  0.44906173, ...,  1.02065522,
         0.605499  , -0.02921079],
       ...,
       [-0.30455925, -0.30010572, -0.57948453, ...,  0.71571584,
         1.46459133, -0.02921079],
       [-0.30455925, -0.30010572, -0.83662109, ...,  1.02065522,
        -0.03882025, -0.02921079],
       [-0.30455925, -0.30010572, -0.83662109, ..., -0.50404168,
        -0.46836642, -0.02921079]])

In [10]:
X = np.hstack([cat_features, num_features])
X.shape

(19065, 71)

In [11]:
y = df['专利价值'].to_numpy()
y.shape

(19065,)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3)


In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13345, 71), (5720, 71), (13345,), (5720,))

In [14]:
from sklearn.linear_model import LinearRegression
LR =  LinearRegression()
LR.fit(X_train, y_train)

In [15]:
y_pred_lr = LR.predict(X_test)

In [16]:
from sklearn.linear_model import SGDRegressor
SGDR =  SGDRegressor()
SGDR.fit(X_train, y_train)

In [17]:
y_pred_sgdr = SGDR.predict(X_test)
y_pred_sgdr

array([-24143.43519055,  10632.18817862,  90029.2429933 , ...,
        80541.77428264,  -4400.16684551,  29241.33440162])

In [18]:
from sklearn.linear_model import ElasticNet
EN = ElasticNet()
EN.fit(X_train, y_train)

In [19]:
y_pred_en = EN.predict(X_test)
y_pred_en

array([13641.45084182, 59381.95118794, 91178.46482643, ...,
       65821.11595452, 35765.83375779, 31822.90729049])

In [20]:
from sklearn.linear_model import BayesianRidge
BR = BayesianRidge()
BR.fit(X_train, y_train)

In [21]:
y_pred_br = BR.predict(X_test)
y_pred_br

array([-22209.66493918,  15357.40469739,  86652.60623017, ...,
        80870.4191716 ,  -1779.91675652,  30227.33299187])

In [22]:
from lightgbm import LGBMRegressor
LGBMR = LGBMRegressor(force_row_wise=True)
LGBMR.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 271
[LightGBM] [Info] Number of data points in the train set: 13345, number of used features: 64
[LightGBM] [Info] Start training from score 67070.760584


In [23]:
y_pred_lgbmr = LGBMR.predict(X_test)
y_pred_lgbmr

array([ 2868.35947926,  5138.80451399, 66157.25792159, ...,
       91516.81944637,  4642.15180738,  2501.23130174])

In [24]:
from xgboost.sklearn import XGBRegressor
XGBR=XGBRegressor()
XGBR.fit(X_train, y_train)

In [25]:
y_pred_xgbr = XGBR.predict(X_test)
y_pred_xgbr

array([ 4738.144 ,  3154.084 , 60340.223 , ..., 96617.586 ,  3203.3738,
       -3168.8682], dtype=float32)

In [26]:
from catboost import CatBoostRegressor
CBR = CatBoostRegressor()
CBR.fit(X_train, y_train)

Learning rate set to 0.061657
0:	learn: 193830.3054407	total: 140ms	remaining: 2m 19s
1:	learn: 186523.4223629	total: 142ms	remaining: 1m 10s
2:	learn: 179718.7550132	total: 144ms	remaining: 47.9s
3:	learn: 173948.5920632	total: 146ms	remaining: 36.3s
4:	learn: 168358.8068144	total: 148ms	remaining: 29.4s
5:	learn: 163616.8930949	total: 150ms	remaining: 24.8s
6:	learn: 159254.1742081	total: 152ms	remaining: 21.5s
7:	learn: 154843.1860820	total: 154ms	remaining: 19s
8:	learn: 150785.3711684	total: 155ms	remaining: 17.1s
9:	learn: 147177.7641317	total: 157ms	remaining: 15.6s
10:	learn: 143946.3329430	total: 159ms	remaining: 14.3s
11:	learn: 141137.3454941	total: 161ms	remaining: 13.3s
12:	learn: 138421.5046631	total: 164ms	remaining: 12.4s
13:	learn: 135845.8991576	total: 166ms	remaining: 11.7s
14:	learn: 133578.5683067	total: 168ms	remaining: 11s
15:	learn: 131226.6999273	total: 170ms	remaining: 10.4s
16:	learn: 129460.6405508	total: 171ms	remaining: 9.91s
17:	learn: 127470.5200686	tota

<catboost.core.CatBoostRegressor at 0x18cf7987f10>

In [27]:
y_pred_cbr = CBR.predict(X_test)
y_pred_cbr

array([ 1.38557229e+03,  4.24669376e+03,  7.77559580e+04, ...,
        9.48295978e+04, -4.09249956e+01, -2.32592875e+02])

In [28]:
# from sklearn.kernel_ridge import KernelRidge
# KR = KernelRidge()
# KR.fit(X_train, y_train)

In [29]:
# y_pred_kr = KR.predict(X_test)

In [30]:
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(n_estimators=500,random_state=42, n_jobs=-1)
RFR.fit(X_train, y_train)

In [31]:
RFR.score(X_test,y_test)

0.8200956482153396

In [32]:
from sklearn.model_selection import cross_val_score
# 使用交叉验证评估模型性能
cv_scores = cross_val_score(RFR, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# 计算均方根误差（Root Mean Squared Error，RMSE）的平均值
rmse_scores = (-cv_scores)**0.5
average_rmse = rmse_scores.mean()

In [33]:
average_rmse

97344.43265094126

In [34]:
y_pred_rfr = RFR.predict(X_test)
y_pred_rfr

array([  3380.9       ,   3314.33333333,  51368.8       , ...,
       113874.        ,   3318.69333333,   3356.06666667])

In [35]:
from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor()
GBR.fit(X_train, y_train)

In [36]:
y_pred_gbr = GBR.predict(X_test)
y_pred_gbr

array([-3478.31198124,  3221.33992248, 88820.23886405, ...,
       97131.21567288,  1365.02796078, 14170.19250307])

In [37]:
from sklearn.svm import SVR
SVR = SVR()
SVR.fit(X_train, y_train)

In [38]:
y_pred_svr = SVR.predict(X_test)
y_pred_svr

array([3417.51858717, 3604.11403456, 5221.09960067, ..., 4981.37487702,
       3513.31014482, 3607.63425305])

In [39]:
result_df = pd.DataFrame({
    "y_test":y_test,
    "differ_lr":abs(y_pred_lr-y_test),
    "differ_sgdr":abs(y_pred_sgdr-y_test),
    "differ_en":abs(y_pred_en-y_test),
    "differ_br":abs(y_pred_br-y_test),
    "differ_lgbmr":abs(y_pred_lgbmr-y_test),
    "differ_xgbr":abs(y_pred_xgbr-y_test),
    "differ_cbr":abs(y_pred_cbr-y_test),
#     "differ_kr":abs(y_pred_kr-y_test),
    "differ_rfr":abs(y_pred_rfr-y_test),
    "differ_gbr":abs(y_pred_gbr-y_test),
    "differ_svr":abs(y_pred_svr-y_test),
 })

In [40]:
r_df = result_df.describe(percentiles=[0.5,0.6,0.7,0.8,0.9]).transpose()
rcolumns_l = ['mean', 'std', '50%', '60%', '70%', '80%', '90%']

In [41]:
r_df[rcolumns_l].round(decimals=2)

Unnamed: 0,mean,std,50%,60%,70%,80%,90%
y_test,62903.58,180097.3,3500.0,32000.0,62000.0,99000.0,160000.0
differ_lr,52827170000000.0,3253221000000000.0,30336.0,36928.0,46865.2,60241.6,85924.4
differ_sgdr,54902.89,131911.4,30916.52,37581.27,46937.86,60086.65,85617.12
differ_en,61935.71,161271.0,39044.68,45889.8,55646.2,68396.86,91710.38
differ_br,54919.66,131845.1,30448.09,37147.71,47075.28,60882.17,85963.11
differ_lgbmr,32491.99,82022.84,7009.94,18930.11,33310.41,50661.86,82627.7
differ_xgbr,27741.04,76436.16,7333.17,16078.14,28481.81,44743.53,69184.23
differ_cbr,30978.38,81070.24,9237.87,18685.31,32879.52,49047.05,75184.68
differ_rfr,22688.55,72940.67,1733.1,8062.24,17930.08,32334.08,58654.36
differ_gbr,35504.6,92404.51,10466.27,22985.67,39052.6,57290.95,80911.9


In [43]:
result_df.to_excel('./Data/result2.xlsx',index=False)