In [4]:
import numpy as np
import pandas as pd
import datetime
import time

import lightgbm as lgb


from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score

import itertools

import warnings
import json

warnings.filterwarnings('ignore')


df = pd.read_csv(r'./train_data.csv')
df.head(5)

Unnamed: 0,企业编号,企业总评分,软著数量,作品著作数量,项目数量,纳税A级年份_2014,纳税A级年份_2015,纳税A级年份_2016,纳税A级年份_2017,纳税A级年份增长1,...,应收账款周转天数(天)_mean,应收账款周转天数(天)_max,应收账款周转天数(天)_min,应收账款周转天数(天)_std,应收账款周转天数(天)滚动增长_mean,存货周转天数(天)_mean,存货周转天数(天)_max,存货周转天数(天)_min,存货周转天数(天)_std,存货周转天数(天)滚动增长_mean
0,1001,75.374276,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,107.58927,191.707773,63.791689,44.495607,0.151392,414.778035,1089.655763,176.283983,325.371499,1.562757
1,1002,79.830122,2.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0,...,46.903333,56.59,39.83,6.234116,0.023916,6.506667,7.04,5.01,0.702335,0.04533
2,1003,78.318264,2.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,84.275556,139.91,56.02,33.143654,-0.040224,54.918889,75.54,38.01,11.089465,0.031792
3,1004,83.253376,0.0,6.0,1.0,0.0,0.0,2.0,1.0,0.0,...,26.72,35.36,17.29,6.024438,0.081857,6.954444,7.9,6.24,0.618448,0.021711
4,1005,83.291493,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,94.05,110.26,77.85,9.652235,0.012921,108.584444,357.19,44.16,101.728838,0.344086


# Train data

In [3]:
y = df[['企业编号', '企业总评分']]
x = df.drop(['企业总评分'], axis=1)

xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=31)
ytrain_id = ytrain['企业编号']
ytrain = ytrain['企业总评分']
ytest_id = ytest['企业编号']
ytest = ytest['企业总评分']
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

id_train = xtrain['企业编号']
id_test = xtest['企业编号']
xtrain.drop(['企业编号'], axis=1, inplace=True)
xtest.drop(['企业编号'], axis=1, inplace=True)
feature_name=xtrain.columns.values

(2364, 335) (592, 335) (2364,) (592,)


In [4]:
estimator = lgb.LGBMRegressor(objective='regression',
                              num_leaves=20,
                              learning_rate=0.005,
                              n_estimators=1500,
                              max_depth=7,
                              boosting='gbdt',
                              metric='rmse',
                              max_bin=35,
                              n_jobs=-1,
                              min_child_samples=100,
                              bagging_seed=11,
                              bagging_fraction=0.7,
                              bagging_freq=1,
                              min_data_in_leaf=20
                             )

# RFE

In [0]:
for i in range(334,120, -5):
    selector = RFE(estimator, i, step=1)
    selector = selector.fit(xtrain, ytrain)
    pre=selector.predict(xtest)
    print(i, np.sqrt(mean_squared_error(pre,ytest)))
# The best is 124

In [5]:
selector = RFE(estimator, 124, step=1)
selector = selector.fit(xtrain, ytrain)
pre=selector.predict(xtest)
print(np.sqrt(mean_squared_error(pre,ytest)), r2_score(pre,ytest))

3.0834744809212484 0.4290226823986045


# Test data

In [5]:
test_df = pd.read_csv(r'./test_data.csv')
X_test = test_df.drop(['企业编号'], axis=1)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test = X_test.fillna(0)
test_id = test_df['企业编号']

In [8]:
test_df.head(5)

Unnamed: 0,企业编号,软著数量,作品著作数量,项目数量,纳税A级年份_2014,纳税A级年份_2015,纳税A级年份_2016,纳税A级年份_2017,纳税A级年份增长1,纳税A级年份增长2,...,应收账款周转天数(天)_mean,应收账款周转天数(天)_max,应收账款周转天数(天)_min,应收账款周转天数(天)_std,应收账款周转天数(天)滚动增长_mean,存货周转天数(天)_mean,存货周转天数(天)_max,存货周转天数(天)_min,存货周转天数(天)_std,存货周转天数(天)滚动增长_mean
0,4001,188.0,12.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0,...,-0.053552,853.253333,1078.28,647.43,132.634622,0.012903,0.0,0.0,0.0,0.0
1,4002,42.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,-0.053116,100.236667,160.28,69.5,31.849951,-0.045968,0.0,0.0,0.0,0.0
2,4003,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,...,0.023584,45.646111,63.87,25.49,11.654292,0.031598,0.0,0.0,0.0,0.0
3,4004,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.012479,1.691111,2.17,1.45,0.212394,0.048454,0.0,0.0,0.0,0.0
4,4005,19.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,-1.0,...,-0.097558,616.92,904.24,329.91,191.017424,-0.054086,0.0,0.0,0.0,0.0


In [2]:
pre=selector.predict(X_test)

NameError: name 'selector' is not defined

In [8]:
result = pd.DataFrame({
    '企业编号': test_id,
    'pre_rating': pd.Series(pre)
})

In [9]:
result.to_excel(r'./赛题1结果_Trainee.xlsx', index=False,header=False)