# 对TF-IDF降维处理

## 导包与读入数据

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import jieba.posseg as pseg

import jieba
import re
from sklearn.model_selection import StratifiedKFold,KFold
import random

import fasttext
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn import linear_model

In [3]:
# 读取训练和预测数据
data_path = 'data/train_second.csv'
df = pd.read_csv(data_path,header = 0)

df2 = pd.read_csv('data/train_first.csv', header = 0)

df = pd.concat([df], ignore_index=True)

test_data_path = 'data/predict_second.csv'
test_df = pd.read_csv(test_data_path,header = 0)

In [33]:
df=df[:5000]
test_df=test_df[:300]

In [81]:
df["Score"].value_counts()

5    2931
4    1412
3     573
2      51
1      30
Name: Score, dtype: int64

# 数据清洗

In [34]:
# 训练集去重
df.drop_duplicates(subset='Discuss', keep='last',inplace=True)
len(df)

4997

In [35]:
# 加载停用词
stop_word = []
# stop_words_path = 'dict/stopWordList.txt'

# with open(stop_words_path,encoding='utf8') as f:
#     for line in f.readlines():
#         stop_word.append(line.strip())
stop_word.append(' ')
# 加载情感词
# dict_path = 'dict/dict.dat'
# jieba.load_userdict(dict_path)
# 字符串清洗，去除停用词
def clean_str(stri):    
    stri = re.sub(u'[\s]+|[^\u4e00-\u9fa5A-Za-z]+|<br />','',stri)
#     stri = re.sub(r'<br />|[\s+\.\!\/_\-,$%^*(+\"\']+|[+—【】！，。？、～~@#￥%……&*（）]|[0-9]+', ' ', stri)  # 正则替换

    cut_str = jieba.cut(stri.strip())
    list_str = [word for word in cut_str if word not in stop_word]
    stri = ' '.join(list_str)
    return stri

df['Discuss'] = df['Discuss'].map(lambda x : clean_str(x))
test_df['Discuss'] = test_df['Discuss'].map(lambda x : clean_str(x))

In [36]:
# 空白的处理方式
def fillnull(x):
    if x == '':
        return '_na_'
    else:
        return x


df['Discuss'] = df['Discuss'].map(lambda x: fillnull(x))
test_df['Discuss'] = test_df['Discuss'].map(lambda x: fillnull(x))


# 辅助函数，评测函数


In [38]:
# 构造fasttext使用的文本
def fasttext_data(data,label):
    fasttext_data = []
    for i in range(len(label)):
        sent = data[i]+"\t__label__"+str(int(label[i]))
        fasttext_data.append(sent)
    with open('train.txt','w') as f:
        for data in fasttext_data:
            f.write(data)
            f.write('\n')
    return 'train.txt'

# 得到预测值
def get_predict(pred):
    score = np.array([1,2,3,4,5])
    pred2 = []
    for p in pred:
        pr = np.sum(p * score)
        pred2.append(pr)
    return np.array(pred2)

# 评测函数
def rmsel(true_label,pred):
    true_label = np.array(true_label)
    pred = np.array(pred)
    n = len(true_label)
    a = true_label - pred
    rmse = np.sqrt(np.sum(a * a)/n)
    b = 1/(1+rmse)
    return b

# 交叉验证函数

In [39]:
# 朴素贝叶斯、逻辑回归模型融合
def lrnb_cv(model1, model2, model3, df, test_df, train_merge):
    df = df.sample(frac=1)  # 对行做shuffle
    df = df.reset_index(drop=True)
    #     # tf-idf向量,目前min_df=1效果最好
    #     vec = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.8,use_idf=1, smooth_idf=1, sublinear_tf=1)
    #     trn_term_doc = vec.fit_transform(df['Discuss'])
    #     test_term_doc = vec.transform(test_df['Discuss'])
    #     # tf-idf降维
    #     tsvd = TruncatedSVD(n_components = 180)
    #     trn_term_doc = tsvd.fit_transform(trn_term_doc)
    #     test_term_doc = tvsd.transform(test_term_doc)

    # 取出模型，lr_model和nb_model
    nb_model = model1
    lr_model = model2
    ri_model = model3
    X = trn_term_doc_scale
    y = df['Score'].values
    lr_pred, nb_pred, ri_pred = [], [], []
    folds = list(KFold(n_splits=5, shuffle=True, random_state=2018).split(X, y))

    for train_index, test_index in folds:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # 朴素贝叶斯训练
        nb_model.fit(X_train, y_train)
        pred_i = nb_model.predict_proba(X_test)
        pred_i = get_predict(pred_i)
        print('nb cv:', rmsel(y_test, pred_i))
        train_merge.loc[test_index, 'nb'] = pred_i  # 将验证集nb预测值进行存储
        train_merge.loc[test_index, 'score1'] = y_test  # 将验证集实际结果进行存储
        # 逻辑回归训练
        lr_model.fit(X_train, y_train)
        pred_i = lr_model.predict_proba(X_test)
        pred_i = get_predict(pred_i)
        print('lr cv:', rmsel(y_test, pred_i))
        train_merge.loc[test_index, 'lr'] = pred_i  # 将验证集lr预测值进行存储
        train_merge.loc[test_index, 'score2'] = y_test  # 将验证集实际结果进行存储
        # 岭回归训练
        ri_model.fit(X_train, y_train)
        pred_i = ri_model.predict(X_test)
        print('ri cv:', rmsel(y_test, pred_i))
        train_merge.loc[test_index, 'ri'] = pred_i  # 将验证集ridge预测值进行存储
        train_merge.loc[test_index, 'score4'] = y_test  # 将验证集实际结果进行存储

        # 朴素贝叶斯预测
        nb_predi = nb_model.predict_proba(test_term_doc_scale)
        nb_predi = get_predict(nb_predi)
        nb_pred.append(nb_predi)
        # 逻辑回归预测
        lr_predi = lr_model.predict_proba(test_term_doc_scale)
        lr_predi = get_predict(lr_predi)
        lr_pred.append(lr_predi)
        # 岭回归预测
        ri_predi = ri_model.predict(test_term_doc_scale)
        ri_pred.append(ri_predi)
    nb_pred = np.array(nb_pred)
    nb_pred = np.mean(nb_pred, axis=0)
    lr_pred = np.array(lr_pred)
    lr_pred = np.mean(lr_pred, axis=0)
    ri_pred = np.array(ri_pred)
    ri_pred = np.mean(ri_pred, axis=0)
    return nb_pred, lr_pred, ri_pred  # 返回三个模型预测结果

In [40]:
# fasttext模型
def fast_cv(df, test_df, train_merge):
    #     df = df.sample(frac=1,random_state=2018)  # 对行做shuffle
    #     df = df.reset_index(drop=True)
    fast_pred = []
    folds = list(KFold(n_splits=5, shuffle=True, random_state=2018).split(X, y))
    rmsels = []
    for train_index, test_index in folds:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_file = fasttext_data(X_train, y_train)
        # fasttext训练
        classifier = fasttext.supervised(train_file, 'model.model', lr=0.08, dim=256, word_ngrams=3, bucket=200000,
                                         loss='hs', label_prefix="__label__")
        print(test_index)
        print(df.head())
        # print(df.loc[test_index,'Discuss'].tolist())

        result = classifier.predict_proba(df.loc[test_index, 'Discuss'].astype("str"), k=5)
        pred = [[int(sco) * proba for sco, proba in result_i] for result_i in result]
        pred = [sum(pred_i) for pred_i in pred]
        print('fast cv:', rmsel(y_test, pred))
        train_merge.loc[test_index, 'fast'] = pred  # 将验证集fasttext预测值进行存储
        train_merge.loc[test_index, 'score3'] = y_test  # 将验证集实际结果进行存储
        # fasttext预测
        test_result = classifier.predict_proba(test_df['Discuss'].astype("str"), k=5)
        fast_predi = [[int(sco) * proba for sco, proba in result_i] for result_i in test_result]
        fast_predi = [sum(pred_i) for pred_i in fast_predi]
        fast_pred.append(fast_predi)

    fast_pred = np.array(fast_pred)
    fast_pred = np.mean(fast_pred, axis=0)
    return fast_pred  # 返回fasttext模型预测结果

# TF-IDF

In [41]:
from sklearn.decomposition import TruncatedSVD
vec = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.8,use_idf=1, smooth_idf=1, sublinear_tf=1)
trn_term_doc = vec.fit_transform(df['Discuss'])
test_term_doc = vec.transform(test_df['Discuss'])
# print(type(test_term_doc))
# tf-idf降维
tsvd = TruncatedSVD(n_components = 180)
tsvd.fit(trn_term_doc)
trn_term_doc = tsvd.transform(trn_term_doc)
test_term_doc = tsvd.transform(test_term_doc)

In [47]:
trn_term_doc.shape  , test_term_doc.shape

((4997, 180), (300, 180))

# 融模型

In [42]:
nb_model = MultinomialNB()  # 朴素贝叶斯回归
lr_model = LogisticRegression(C=10, class_weight='balanced')  # 逻辑回归模型
ri_model = linear_model.Ridge() # 岭回归模型


In [43]:
from sklearn import preprocessing
min_max = preprocessing.MinMaxScaler(feature_range=(0, 1))
trn_term_doc_scale = min_max.fit_transform(trn_term_doc)
test_term_doc_scale = min_max.transform(test_term_doc)

In [48]:
data = np.zeros((len(df), 8))
train_merge = pd.DataFrame(data)
train_merge.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
train_merge.columns = ['nb','lr','fast','ri','score1','score2','score3','score4']
nb_pred, lr_pred ,ri_pred= lrnb_cv(nb_model, lr_model, ri_model, df, test_df, train_merge)

nb cv: 0.566480254116
lr cv: 0.473736244226
ri cv: 0.564683902382
nb cv: 0.565698996004
lr cv: 0.481258709892
ri cv: 0.560415370501
nb cv: 0.559393781702
lr cv: 0.479936337103
ri cv: 0.553115318739
nb cv: 0.555822924312
lr cv: 0.476589852318
ri cv: 0.550552207623
nb cv: 0.561864620057
lr cv: 0.488451397484
ri cv: 0.559530351953


In [54]:
nb_pred[:2],len(nb_pred)

(array([ 4.43519898,  4.43459246]), 300)

In [50]:
train_merge.head()

Unnamed: 0,nb,lr,fast,ri,score1,score2,score3,score4
0,4.439559,4.107133,0.0,4.445634,3.0,3.0,0.0,3.0
1,4.437535,4.405524,0.0,4.635245,3.0,3.0,0.0,3.0
2,4.439322,3.822257,0.0,4.488997,5.0,5.0,0.0,5.0
3,4.425786,3.405538,0.0,4.396246,5.0,5.0,0.0,5.0
4,4.433588,2.887204,0.0,4.270663,5.0,5.0,0.0,5.0


In [55]:
# fasttext模型
X = df['Discuss'].values
y = df['Score'].values
fast_pred = fast_cv(df, test_df, train_merge)

[   3    5    9   11   15   20   31   34   37   38   49   54   62   63   64
   66   71   73   83   89   91  116  119  120  124  130  132  133  139  140
  143  144  149  150  151  163  164  165  176  181  186  188  191  194  196
  197  198  202  203  206  213  221  230  234  236  237  245  248  254  267
  272  276  282  285  288  291  310  314  319  320  326  339  345  349  354
  357  361  398  403  405  416  418  422  424  436  439  445  446  450  457
  461  463  465  474  481  484  486  499  500  503  504  508  513  524  535
  539  544  546  553  563  567  568  575  581  584  585  594  603  606  608
  609  615  618  619  620  621  624  628  637  640  644  651  655  656  660
  664  665  673  680  682  684  693  694  703  705  706  709  712  727  731
  733  745  748  750  757  759  760  762  764  773  778  779  780  785  795
  798  815  818  821  833  838  843  846  854  856  864  865  867  871  872
  873  875  877  885  887  888  891  892  896  904  908  911  921  922  928
  929  930  

In [56]:
train_merge.head()

Unnamed: 0,nb,lr,fast,ri,score1,score2,score3,score4
0,4.439559,4.107133,4.190654,4.445634,3.0,3.0,3.0,3.0
1,4.437535,4.405524,4.337659,4.635245,3.0,3.0,4.0,3.0
2,4.439322,3.822257,4.332663,4.488997,5.0,5.0,4.0,5.0
3,4.425786,3.405538,4.273045,4.396246,5.0,5.0,3.0,5.0
4,4.433588,2.887204,4.988243,4.270663,5.0,5.0,5.0,5.0


In [57]:
# 创建测试集
data = np.zeros((len(test_df), 4))
test = pd.DataFrame(data)
feature_columns=['nb','lr', 'fast','ri']
test.columns = ['nb','lr', 'fast','ri']
test['nb'], test['lr'], test['fast'], test['ri'] =  nb_pred, lr_pred, fast_pred, ri_pred
test.describe()

Unnamed: 0,nb,lr,fast,ri
count,300.0,300.0,300.0,300.0
mean,4.435454,3.924515,4.323929,4.446685
std,0.00247,0.377445,0.217749,0.145129
min,4.424206,2.678181,4.080437,4.001293
25%,4.434294,3.674135,4.173414,4.377254
50%,4.435955,3.967249,4.245322,4.437001
75%,4.436714,4.190336,4.416258,4.517938
max,4.442868,4.703269,4.98824,4.931377


In [58]:
test.head()

Unnamed: 0,nb,lr,fast,ri
0,4.435199,4.521176,4.172694,4.753014
1,4.434592,3.905061,4.135437,4.310829
2,4.437468,4.246022,4.187285,4.52886
3,4.436125,3.941207,4.110252,4.420289
4,4.431297,3.865846,4.135437,4.191055


# xgb调参

In [59]:
from sklearn.ensemble import GradientBoostingRegressor  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

In [60]:
# 数据准备
feature_columns=['nb','lr', 'fast', 'ri']
X = train_merge[feature_columns].values
y = train_merge['score1'].values


In [69]:
def modelfit(alg, X, y, useTrainCV=True, early_stopping_rounds=100, cv_folds=5, printFeatureImportance=True):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X, y)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='rmse', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        #         print(cvresult)
        print(cvresult.shape[0])




In [70]:
# 得到学习熟虑为0.1时的理想决策树目
import xgboost as xgb
xgb1 = xgb.XGBRegressor(learning_rate=0.1,n_estimators=1000, max_depth=5, min_child_weight=1,
                        gamma=0, subsample=0.8,
                      objective='reg:linear',scale_pos_weight=1, seed=2018)
modelfit(xgb1, X, y)

51


In [71]:
from sklearn.metrics import fbeta_score, make_scorer
score = make_scorer(rmsel)
params_test1 = {'max_depth': list(range(3,8,2)), 'min_child_weight': list(range(1,6,2))}
xgb2 = xgb.XGBRegressor(learning_rate=0.1,n_estimators=110, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
                      objective='reg:linear', scale_pos_weight=1, seed=2018)
gsearch1 = GridSearchCV(estimator=xgb2, param_grid=params_test1, scoring=score, cv=5)
gsearch1.fit(X, y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.55868, std: 0.00668, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.55881, std: 0.00674, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.55866, std: 0.00668, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.55659, std: 0.00671, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.55683, std: 0.00618, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.55674, std: 0.00665, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.55294, std: 0.00578, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.55308, std: 0.00505, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.55322, std: 0.00567, params: {'max_depth': 7, 'min_child_weight': 5}],
 {'max_depth': 3, 'min_child_weight': 3},
 0.5588070625945294)

In [72]:
import xgboost as xgb

bst = xgb.XGBRegressor(learning_rate=0.01, n_estimators=1100, max_depth=5, min_child_weight=5, gamma=0, subsample=0.8,
                       scale_pos_weight=1, seed=2018)
xgb_pred = []
folds = list(KFold(n_splits=5, shuffle=True, random_state=2018).split(X, y))
es = []
for tr_index, te_index in folds:
    X_train, X_test = X[tr_index], X[te_index]
    y_train, y_test = y[tr_index], y[te_index]
    bst.fit(X_train, y_train)
    y_pred = bst.predict(X_test)
    e = rmsel(y_test, y_pred)
    print(e)

    test_pred = bst.predict(test[feature_columns].values)
    xgb_pred.append(test_pred)
    es.append(e)
print(np.mean(es, axis=0))

xgb_pred = np.array(xgb_pred)
xgb_pred = np.mean(xgb_pred, axis=0)

0.562370130555
0.563693772737
0.55465280922
0.551119782619
0.557752493859
0.557917797798


# GDBT调参

In [73]:
gdbt_model = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1500, min_samples_split=1000,
                                       min_samples_leaf=30, max_depth=5, max_features='auto', subsample=0.8,
                                       random_state=2018)

gdbt_pred = []
folds = list(KFold(n_splits=5, shuffle=True, random_state=2018).split(X, y))
es = []
for tr_index, te_index in folds:
    X_train, X_test = X[tr_index], X[te_index]
    y_train, y_test = y[tr_index], y[te_index]
    gdbt_model.fit(X_train, y_train)
    y_pred = gdbt_model.predict(X_test)
    e = rmsel(y_test, y_pred)
    print(e)

    test_pred = gdbt_model.predict(test[feature_columns].values)
    gdbt_pred.append(test_pred)
    es.append(e)
print(np.mean(es, axis=0))
gdbt_pred = np.array(gdbt_pred)
gdbt_pred = np.mean(gdbt_pred, axis=0)


0.562043308224
0.56366935788
0.555325104238
0.552826753101
0.558333125108
0.55843952971


# 输出结果

In [77]:
min(xgb_pred),  max(xgb_pred)

(4.0689497, 4.6754389)

In [74]:
np.percentile(xgb_pred, 0.01),
np.percentile(xgb_pred, 0.015),
np.percentile(xgb_pred, 5),
np.percentile(xgb_pred, 20),
np.percentile(xgb_pred, 30),
np.percentile(xgb_pred, 62),
np.percentile(xgb_pred, 70)

(4.0697497123718263,
 4.0701497188568112,
 4.2598052501678474,
 4.3610227584838865,
 4.3971321582794189,
 4.4640867614746096,
 4.4806831359863279)

In [32]:
xgb_pred2 = xgb_pred
# xgb_pred1 = np.where(xgb_pred1<1.72, 1,xgb_pred1)
xgb_pred2 = np.where((xgb_pred2>1.305)&(xgb_pred2<1.28), 2, xgb_pred2)
xgb_pred2 = np.where((xgb_pred2<3.3)&(xgb_pred2>1.99), 3, xgb_pred2)
xgb_pred2 = np.where((xgb_pred2<4.1)&(xgb_pred2>3.9), 4, xgb_pred2)
xgb_pred2 = np.where(xgb_pred2>4.61, 5, xgb_pred2)

In [None]:
test['Id'] = test_df['Id']
test[['Id', 'xgb_merge2']].to_csv('result/0326-3.csv',index=None,header =None)