In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import jieba
import jieba.analyse
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import scipy
from sklearn.model_selection import KFold,StratifiedKFold
from scipy.sparse import csr_matrix, hstack
import re
import numpy as np

In [28]:
# Stop-word list used by clean_str: one entry per line of the UTF-8 text file,
# with surrounding whitespace stripped from every line.
stop_words_path = '../input/stop_word.txt'
stop_word = []
with open(stop_words_path, encoding='utf-8') as f:
    stop_word.extend(line.strip() for line in f)
# A bare space is treated as a stop word too, so it is filtered out of the
# token stream like any other entry.
stop_word.append(' ')

def clean_str(stri):
    """Normalise one text string into a space-separated token string.

    Steps, in order:
      1. remove every run of ASCII letters/digits,
      2. strip leading/trailing whitespace and segment the rest with jieba.cut,
      3. drop tokens that appear in the module-level ``stop_word`` list,
      4. join the surviving tokens with single spaces.

    Parameters
    ----------
    stri : str
        Raw text to clean.

    Returns
    -------
    str
        The kept tokens joined by ``' '``.
    """
    without_ascii = re.sub(r'[a-zA-Z0-9]+', '', stri)
    tokens = jieba.cut(without_ascii.strip())
    return ' '.join(token for token in tokens if token not in stop_word)

def rmsel(true_label,pred):
    """Score predictions as ``1 / (1 + RMSE)``.

    Parameters
    ----------
    true_label : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, passed to ``mean_squared_error`` alongside
        ``true_label``.

    Returns
    -------
    float
        ``1 / (1 + sqrt(MSE))``; equals 1 when the MSE is 0 and shrinks
        towards 0 as the error grows.
    """
    mse = mean_squared_error(true_label, pred)
    return 1 / (1 + np.sqrt(mse))

In [29]:
def get_data(train_path='../input/train_first.csv', test_path='../input/predict_first.csv'):
    """Load the training and prediction CSVs and stack them into one frame.

    Parameters
    ----------
    train_path : str, optional
        CSV holding the training rows; must contain a 'Score' column.
    test_path : str, optional
        CSV holding the rows to predict; must contain an 'Id' column.

    Returns
    -------
    data : pandas.DataFrame
        Training rows followed by test rows, with a fresh 0..n-1 index.
    n_train : int
        Number of training rows, i.e. where the test rows start in ``data``.
    y : pandas.Series
        The 'Score' column of the training file.
    test_id : pandas.Series
        The 'Id' column of the prediction file.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # ignore_index=True: a plain concat keeps each frame's own 0..n-1 labels,
    # leaving duplicate index labels that make later label-based lookups and
    # alignment ambiguous.
    data = pd.concat([train, test], ignore_index=True)
    print('train %s test %s'%(train.shape,test.shape))
    print('train columns',train.columns)
    return data,train.shape[0],train['Score'],test['Id']

In [30]:
def split_discuss(data):
    """Add a length feature and tokenise the 'Discuss' column.

    The frame is modified in place and also returned:
      * 'length' receives ``len()`` of each original 'Discuss' value,
      * 'Discuss' is overwritten with ``clean_str`` applied to each value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Discuss' column.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, updated.
    """
    raw_text = data['Discuss']
    # Length is measured on the raw text, before any cleaning.
    data['length'] = raw_text.apply(len)
    data['Discuss'] = raw_text.apply(clean_str)
    return data

In [31]:
def pre_process():
    """Build the sparse train/test feature matrices from the 'Discuss' text.

    Loads the data with ``get_data``, cleans it with ``split_discuss``, then
    fits two vectorizers on the combined train+test text (uni- and bi-grams):
    a CountVectorizer and a TfidfVectorizer (``max_df=10000``). Their outputs
    are stacked side by side and split back at the train/test boundary.

    Returns
    -------
    train_part : CSR matrix
        Feature rows belonging to the training set.
    test_part : CSR matrix
        Feature rows belonging to the prediction set.
    y
        Training targets as returned by ``get_data``.
    test_id
        Test identifiers as returned by ``get_data``.
    """
    frame, n_train, y, test_id = get_data()
    frame = split_discuss(frame)

    count_vectorizer = CountVectorizer(ngram_range=(1, 2))
    count_features = count_vectorizer.fit_transform(frame['Discuss'])

    tfidf_vectorizer = TfidfVectorizer(max_df=10000, ngram_range=(1, 2))
    tfidf_features = tfidf_vectorizer.fit_transform(frame['Discuss'])

    # Counts first, tf-idf second; CSR so the row slices below work.
    features = hstack((count_features, tfidf_features)).tocsr()
    train_part = features[:n_train]
    test_part = features[n_train:]
    return train_part, test_part, y, test_id

In [32]:
# Build the features once: X / test are the CSR train / test feature matrices,
# y is the training 'Score' column and test_id the prediction file's 'Id' column.
X,test,y,test_id = pre_process()

train (100000, 3) test (30000, 2)
train columns Index(['Id', 'Discuss', 'Score'], dtype='object')


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [36]:
# Dimensions of the training feature matrix as (rows, feature columns).
X.shape

(100000, 1971262)

In [50]:
def training(X, y, T, nfolds=5, seed=2018):
    """Fit Ridge with stratified K-fold CV; return out-of-fold and test predictions.

    For every fold the model is fitted on the training part, scored on the
    held-out part with ``rmsel`` (the score is printed), and used to predict
    the whole test matrix. Held-out predictions fill ``S_train``; the per-fold
    test predictions are averaged into ``S_test``.

    Parameters
    ----------
    X : matrix indexable as ``X[rows, :]``
        Training features.
    y : array-like
        Training targets, one per row of ``X``; also used as the
        stratification labels.
    T : matrix
        Test features, passed to ``model.predict``.
    nfolds : int, optional
        Number of folds (default 5).
    seed : int, optional
        Seed for the fold shuffling and for the Ridge solver (default 2018).

    Returns
    -------
    S_train : numpy.ndarray, shape (n_train, 1)
        Out-of-fold predictions for the training rows.
    S_test : numpy.ndarray, shape (n_test, 1)
        Mean of the per-fold predictions for the test rows.
    score : float
        Mean of the per-fold ``rmsel`` scores, rounded to 5 decimals.
    """
    # Work on a plain array: indexing a pandas Series with the fold indices is
    # label-based and only coincides with row positions for a default index.
    y = np.asarray(y)
    folds = list(StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True).split(X, y))
    error = []

    S_train = np.zeros((X.shape[0], 1))  # training rows x number of models
    S_test = np.zeros((T.shape[0], 1))  # test rows x number of models
    S_test_n = np.zeros((T.shape[0], len(folds)))  # test rows x number of folds

    # `normalize` is intentionally not passed: the keyword was removed in
    # scikit-learn 1.2 and its default was already False.
    # random_state is only consulted by the stochastic solvers; setting it
    # keeps runs reproducible when solver='auto' selects one of them.
    model = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=250, tol=0.01,
                  random_state=seed)
    for j, (train_fold, test_fold) in enumerate(folds):
        X_train, X_validate = X[train_fold, :], X[test_fold, :]
        label_train, label_validate = y[train_fold], y[test_fold]
        model.fit(X_train, label_train)

        val_ = model.predict(X=X_validate)
        pred_ = model.predict(X=T)
        # Argument order follows rmsel(true_label, pred).
        rmse_ = rmsel(label_validate, val_)
        print(rmse_)
        error.append(rmse_)

        S_train[test_fold] = np.array(val_).reshape(-1, 1)
        S_test_n[:, j] = np.array(pred_)

    S_test[:] = S_test_n.mean(1).reshape(-1, 1)
    return S_train, S_test, round(np.mean(error), 5)

In [51]:
# Run the cross-validated Ridge fit: S_train / S_test are the out-of-fold and
# fold-averaged test predictions; error is the mean per-fold rmsel score
# (1 / (1 + RMSE)), so larger values are better.
S_train, S_test, error = training(X, y, test)

0.5776183727627812
0.5816785658687482
0.5776895445831461
0.5779018636186908
0.5774038795108709


In [53]:
# Export the Ridge predictions next to the row Ids, one CSV for the training
# rows (out-of-fold predictions) and one for the prediction rows.
train_df = pd.read_csv('../input/train_first.csv')
predict_df = pd.read_csv('../input/predict_first.csv')

# .copy() gives an independent frame; adding a column to the bare
# train_df[['Id']] selection raises pandas' SettingWithCopyWarning.
train_out = train_df[['Id']].copy()
# The prediction arrays are (n, 1); flatten them to one value per row.
train_out['ridge_doufu'] = S_train.ravel()
train_out.to_csv('../models/__models__/train_ridge_doufu.csv', index = False)

test_out = predict_df[['Id']].copy()
test_out['ridge_doufu'] = S_test.ravel()
test_out.to_csv('../models/__models__/test_ridge_doufu.csv', index = False)
print(error)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.57846
