## 抽取特征

统计类特征

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
train_data_char = pd.read_csv('../data/aux/train_char_indexvec.csv')
train_data_word = pd.read_csv('../data/aux/train_word_indexvec.csv')
train_data_char.columns = ['id', 'question1', 'question2', 'label']
train_data_word.columns = ['id', 'question1', 'question2', 'label']
print(train_data_char.shape)

train_ori_data = pd.read_csv('../data/aux/train_parse.csv', sep='\t', header=None)
train_ori_data.columns = ['id', 'question1', 'question2', 'label']

(98976, 4)


In [9]:
print train_ori_data.head()
print train_data_word.head()

   id        question1                       question2  label
0   1      ﻿怎么更改花呗手机号码  我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号      1
1   2  也开不了花呗，就这样了？完事了                      真的嘛？就是花呗付款      0
2   3      花呗冻结以后还能开通吗                   我的条件可以开通花呗借款吗      0
3   4         如何得知关闭借呗                         想永久关闭借呗      0
4   5           花呗扫码付钱                     二维码扫描可以用花呗吗      0
   id                           question1                                          question2  label
0   1                        11 239 2 213  3 5 2 18 149 5 213 8 11 980 588 40 5 30 5 172 106      1
1   2  103 152 31 2 8 71 599 7 136 3142 7                               819 133 136 211 2 32      0
2   3                2 114 181 10 23 20 9                              3 5 201 13 20 2 120 9      0
3   4                             56 51 4                                        67 560 51 4      0
4   5                           2 319 585                                 212 1033 13 15 2 9      0


In [4]:
y = train_ori_data.iloc[:, 3]
def test_feas(X):
    Xt, Xv, yt, yv = train_test_split(X, y)
    clf = LogisticRegression(class_weight={0: 1.,1: 5,})
    clf.fit(Xt, yt)
    f1 = f1_score(yv, clf.predict(Xv))
    print 'weights: ', f1

### N-grams

In [6]:
def split_string_as_list_by_ngram(input_string,ngram_value):
    input_string="".join([string for string in input_string if string.strip()])
    length = len(input_string)
    result_string=[]
    for i in range(length):
        if i + ngram_value < length + 1:
            result_string.append(input_string[i:i+ngram_value])
    return result_string


def compute_blue_ngram(x1_list,x2_list):
    """
    compute blue score use ngram information. x1_list as predict sentence,x2_list as target sentence
    :param x1_list:
    :param x2_list:
    :return:
    """
    count_dict={}
    count_dict_clip={}
    #1. count for each token at predict sentence side.
    for token in x1_list:
        if token not in count_dict:
            count_dict[token]=1
        else:
            count_dict[token]=count_dict[token]+1
    count=np.sum([value for key,value in count_dict.items()])

    #2.count for tokens existing in predict sentence for target sentence side.
    for token in x2_list:
        if token in count_dict:
            if token not in count_dict_clip:
                count_dict_clip[token]=1
            else:
                count_dict_clip[token]=count_dict_clip[token]+1

    #3. clip value to ceiling value for that token
    count_dict_clip={key:(value if value<=count_dict[key] else count_dict[key]) for key,value in count_dict_clip.items()}
    count_clip=np.sum([value for key,value in count_dict_clip.items()])
    result=float(count_clip)/(float(count)+0.00000001)
    return result


In [7]:
def cal_ngram(csv_data, ngram_value):
    ngram_lt1 = []
    ngram_lt2 = []
    for i in range(csv_data.shape[0]):
        x1_list = csv_data.iloc[i, 1].split(' ')
        x2_list = csv_data.iloc[i, 2].split(' ')
        res1 = compute_blue_ngram(split_string_as_list_by_ngram(x1_list, ngram_value), 
                                  split_string_as_list_by_ngram(x2_list,ngram_value))
        res2 = compute_blue_ngram(split_string_as_list_by_ngram(x2_list, ngram_value), 
                                  split_string_as_list_by_ngram(x1_list,ngram_value))
        ngram_lt1.append(res1)
        ngram_lt2.append(res2)
    return ngram_lt1,ngram_lt2

fea_dict = {}
for ngram in range(1, 9):
    ngram_lt1,ngram_lt2 = cal_ngram(train_data_char, ngram)
    fea_dict['ngram1'+str(ngram)] = ngram_lt1
    fea_dict['ngram2'+str(ngram)] = ngram_lt2
    
save_data_char = pd.DataFrame(fea_dict)
save_data_char.to_csv('../ngram_features_char.csv', index=False)

fea_dict = {}
for ngram in range(1, 9):
    ngram_lt1,ngram_lt2 = cal_ngram(train_data_word, ngram)
    fea_dict['ngram1'+str(ngram)] = ngram_lt1
    fea_dict['ngram2'+str(ngram)] = ngram_lt2
    
save_data_word = pd.DataFrame(fea_dict)
save_data_word.to_csv('../ngram_features_word.csv', index=False)

In [8]:
test_feas(save_data_char)
test_feas(save_data_word)
#test_feas()
combine_feas = pd.concat([save_data_char, save_data_word], axis=1)
test_feas(combine_feas)
combine_feas.corr()

weights:  0.37924297924297923
weights:  0.3337535266222463
weights:  0.37465258476931634


Unnamed: 0,ngram11,ngram12,ngram13,ngram14,ngram15,ngram16,ngram17,ngram18,ngram21,ngram22,ngram23,ngram24,ngram25,ngram26,ngram27,ngram28,ngram11.1,ngram12.1,ngram13.1,ngram14.1,ngram15.1,ngram16.1,ngram17.1,ngram18.1,ngram21.1,ngram22.1,ngram23.1,ngram24.1,ngram25.1,ngram26.1,ngram27.1,ngram28.1
ngram11,1.0,0.779065,0.537676,0.432923,0.35523,0.297301,0.240264,0.208584,-0.583697,-0.399037,-0.185816,-0.105913,-0.052851,-0.021169,0.008396,0.020463,0.747008,0.504818,0.319161,0.228436,0.182614,0.145635,0.110902,0.084278,-0.585551,-0.284581,-0.090015,-0.015892,0.01132,0.025278,0.031009,0.029033
ngram12,0.779065,1.0,0.856481,0.75188,0.663182,0.584745,0.498815,0.444421,-0.398241,0.086422,0.275733,0.310127,0.320415,0.311363,0.294594,0.275611,0.638008,0.639218,0.543497,0.444672,0.378304,0.312699,0.246075,0.19436,-0.358341,0.028699,0.212071,0.237728,0.227962,0.204694,0.17393,0.143534
ngram13,0.537676,0.856481,1.0,0.951748,0.878298,0.796535,0.691158,0.620564,-0.184841,0.276217,0.59533,0.627975,0.615297,0.57843,0.521473,0.476293,0.467732,0.659145,0.688804,0.599281,0.520568,0.431504,0.341368,0.271136,-0.147329,0.25963,0.449512,0.437448,0.396486,0.339467,0.278754,0.22618
ngram14,0.432923,0.75188,0.951748,1.0,0.966365,0.901253,0.799729,0.724942,-0.10479,0.311459,0.629358,0.725453,0.732928,0.700569,0.637844,0.584367,0.390908,0.640118,0.739207,0.678397,0.602464,0.504312,0.401746,0.320386,-0.069602,0.328234,0.536447,0.531462,0.484485,0.414394,0.339238,0.274942
ngram15,0.35523,0.663182,0.878298,0.966365,1.0,0.96813,0.885447,0.813086,-0.050995,0.322956,0.618215,0.734604,0.789265,0.778309,0.725335,0.670659,0.328911,0.598983,0.750758,0.729618,0.666562,0.566543,0.455627,0.36548,-0.020449,0.352176,0.575992,0.592553,0.550527,0.475417,0.390776,0.31764
ngram16,0.297301,0.584745,0.796535,0.901253,0.96813,1.0,0.959836,0.899733,-0.015489,0.317167,0.584075,0.704915,0.780976,0.818662,0.796901,0.75042,0.281173,0.552684,0.732936,0.76305,0.724417,0.630067,0.514527,0.416834,0.011017,0.353666,0.58072,0.632146,0.606606,0.534517,0.444673,0.364446
ngram17,0.240264,0.498815,0.691158,0.799729,0.885447,0.959836,1.0,0.96782,0.017299,0.302811,0.528914,0.643909,0.729725,0.798709,0.837916,0.812553,0.233926,0.489433,0.678801,0.765281,0.762472,0.684664,0.571712,0.469438,0.039504,0.338731,0.553848,0.644387,0.644758,0.585256,0.496308,0.411714
ngram18,0.208584,0.444421,0.620564,0.724942,0.813086,0.899733,0.96782,1.0,0.031584,0.286067,0.486023,0.592977,0.677949,0.755623,0.816205,0.839149,0.208012,0.450295,0.63577,0.742784,0.775996,0.724802,0.626176,0.524838,0.051582,0.324581,0.526182,0.63034,0.659374,0.621999,0.543545,0.459839
ngram21,-0.583697,-0.398241,-0.184841,-0.10479,-0.050995,-0.015489,0.017299,0.031584,1.0,0.779095,0.538284,0.43305,0.355342,0.300212,0.246479,0.216714,-0.587573,-0.276336,-0.086629,-0.012057,0.017443,0.031841,0.039265,0.040367,0.745532,0.512542,0.322757,0.232239,0.18771,0.150757,0.117617,0.093677
ngram22,-0.399037,0.086422,0.276217,0.311459,0.322956,0.317167,0.302811,0.286067,0.779095,1.0,0.85639,0.75195,0.663005,0.58722,0.504957,0.452686,-0.361615,0.03387,0.21366,0.240645,0.2331,0.210134,0.180487,0.153431,0.639987,0.645165,0.547885,0.44907,0.383736,0.318714,0.255157,0.205814


In [10]:
ngram_feas = [save_data_char, save_data_word]

### LDA

In [11]:
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
import pickle
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

NUM_TOPICS = 300

RANDOM_SEED = 42

In [12]:
train_data_word.head(5)

Unnamed: 0,id,question1,question2,label
0,1,11 239 2 213,3 5 2 18 149 5 213 8 11 980 588 40 5 30 5 172 106,1
1,2,103 152 31 2 8 71 599 7 136 3142 7,819 133 136 211 2 32,0
2,3,2 114 181 10 23 20 9,3 5 201 13 20 2 120 9,0
3,4,56 51 4,67 560 51 4,0
4,5,2 319 585,212 1033 13 15 2 9,0


In [13]:
def build_model(train_data):
    documents = list(train_data.iloc[:, 1])
    documents.extend(list(train_data.iloc[:, 2]))
    print documents[:10]
    documents = [item.split(' ') for item in documents]
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]
    model = LdaMulticore(
        corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        random_state=RANDOM_SEED,
    )
    return model, dictionary

In [14]:
def compute_topic_distances(model, dictionary, pair):
    q1_bow = dictionary.doc2bow(pair[0])
    q2_bow = dictionary.doc2bow(pair[1])
    
    q1_topic_vec = np.array(model.get_document_topics(q1_bow, minimum_probability=0))[:, 1].reshape(1, -1)
    q2_topic_vec = np.array(model.get_document_topics(q2_bow, minimum_probability=0))[:, 1].reshape(1, -1)
    
    return [
        cosine_distances(q1_topic_vec, q2_topic_vec)[0][0],
        euclidean_distances(q1_topic_vec, q2_topic_vec)[0][0],
    ]

In [15]:
def cal_lda(csv_data):
    cosine_lt = []
    euclidean_lt = []
    model, dictionary = build_model(csv_data)
    for i in range(csv_data.shape[0]):
        cosine_val, euclidean_val = compute_topic_distances(model, dictionary, (csv_data.iloc[i, 1].split(' '), csv_data.iloc[i, 2].split(' ')))
        cosine_lt.append(cosine_val)
        euclidean_lt.append(euclidean_val)
    return cosine_lt, euclidean_lt

In [23]:
cosine_lt, euclidean_lt = cal_lda(train_data_char)
save_data_char = pd.DataFrame({'cosine_distances':cosine_lt, 'euclidean_distances':euclidean_lt})
save_data.to_csv('../lda_features_char.csv', index=False)

cosine_lt, euclidean_lt = cal_lda(train_data_word)
save_data_word = pd.DataFrame({'cosine_distances':cosine_lt, 'euclidean_distances':euclidean_lt})
save_data.to_csv('../lda_features_word.csv', index=False)

['1476 15 4 184 128 3 2 56 72 43 59', '159 31 13 10 3 2 14 95 65 87 10 199 123 195 10', '3 2 153 130 23 52 5 21 31 36 16', '76 84 258 226 68 94 6 2', '3 2 232 59 25 34', '3 2 70 19 52 13 21 29 19 16', '3 2 29 19 93 681', '6 2 70 19 236 57 36 226', '6 2 568 464 41 19 48 5 9 46 75 5 21 29 19 16', '3 2 26 17 4 13 21 33 25 56 72 143 291']


NameError: name 'save_data' is not defined

In [17]:
test_feas(save_data_char)
test_feas(save_data_word)
combine_feas = pd.concat([save_data_char, save_data_word], axis=1)
test_feas(combine_feas)
combine_feas.corr()

weights:  0.3482502133886111
weights:  0.33651900941750956
weights:  0.365155705182157


Unnamed: 0,cosine_distances,euclidean_distances,cosine_distances.1,euclidean_distances.1
cosine_distances,1.0,0.914287,0.450425,0.422207
euclidean_distances,0.914287,1.0,0.419013,0.460359
cosine_distances,0.450425,0.419013,1.0,0.893685
euclidean_distances,0.422207,0.460359,0.893685,1.0


In [18]:
lda_feas = [save_data_char, save_data_word]

### Simple Summary Statistics

简单的统计类特征

包括问题的最短长度，最长长度，长度差，长度比率，交集并集的比率

In [19]:
def word_difference_ratio(q1_tokens, q2_tokens):
    return 1.0 * len(set(q1_tokens) ^ set(q2_tokens)) / (len(set(q1_tokens)) + len(set(q2_tokens)))

def extract_tokenized_features(pair):
    q1 = pair[0]
    q2 = pair[1]
    
    shorter_token_length = min(len(q1), len(q2))
    longer_token_length = max(len(q1), len(q2))
    
    return [
        np.log(shorter_token_length + 1),
        np.log(longer_token_length + 1),
        np.log(abs(longer_token_length - shorter_token_length) + 1),
        1.0 * shorter_token_length / longer_token_length,
        word_difference_ratio(q1, q2),
    ]
def cal_summary(csv_data):
    short_lt = []
    long_lt = []
    diff_lt = []
    diff_ratio_lt = []
    word_difference_ratio_lt = []
    for i in range(csv_data.shape[0]):
        a1,a2,a3,a4,a5 = extract_tokenized_features((csv_data.iloc[i, 1].split(' '), csv_data.iloc[i, 2].split(' ')))
        short_lt.append(a1)
        long_lt.append(a2)
        diff_lt.append(a3)
        diff_ratio_lt.append(a4)
        word_difference_ratio_lt.append(a5)
    return short_lt, long_lt, diff_lt, diff_ratio_lt, word_difference_ratio_lt

In [None]:
short_lt, long_lt, diff_lt, diff_ratio_lt, word_difference_ratio_lt = cal_summary(train_data_char)
save_data_char = pd.DataFrame({'short_lt':short_lt,
                          'long_lt':long_lt,
                          'diff_lt':diff_lt,
                          'diff_ratio_lt':diff_ratio_lt,
                          'word_difference_ratio_lt':word_difference_ratio_lt})
save_data.to_csv('../simsummary_features_char.csv', index=False)

short_lt, long_lt, diff_lt, diff_ratio_lt, word_difference_ratio_lt = cal_summary(train_data_word)
save_data_word = pd.DataFrame({'short_lt':short_lt,
                          'long_lt':long_lt,
                          'diff_lt':diff_lt,
                          'diff_ratio_lt':diff_ratio_lt,
                          'word_difference_ratio_lt':word_difference_ratio_lt})
save_data.to_csv('../simsummary_features_word.csv', index=False)

In [21]:
test_feas(save_data_char)
test_feas(save_data_word)
combine_feas = pd.concat([save_data_char, save_data_word], axis=1)
test_feas(combine_feas)
combine_feas.corr()

weights:  0.4000280721454137
weights:  0.3464224384659416
weights:  0.40230203855221186


Unnamed: 0,diff_lt,diff_ratio_lt,long_lt,short_lt,word_difference_ratio_lt,diff_lt.1,diff_ratio_lt.1,long_lt.1,short_lt.1,word_difference_ratio_lt.1
diff_lt,1.0,-0.930463,0.687793,0.078501,0.230256,0.787167,-0.725019,0.631898,0.128942,0.181086
diff_ratio_lt,-0.930463,1.0,-0.465584,0.234474,-0.227166,-0.745388,0.800121,-0.423377,0.164557,-0.174538
long_lt,0.687793,-0.465584,1.0,0.743425,0.198276,0.641732,-0.38417,0.94768,0.719269,0.162716
short_lt,0.078501,0.234474,0.743425,1.0,0.041134,0.142547,0.18651,0.713867,0.923279,0.043541
word_difference_ratio_lt,0.230256,-0.227166,0.198276,0.041134,1.0,0.220128,-0.201045,0.205721,0.063945,0.696595
diff_lt,0.787167,-0.745388,0.641732,0.142547,0.220128,1.0,-0.921698,0.711545,0.087698,0.209369
diff_ratio_lt,-0.725019,0.800121,-0.38417,0.18651,-0.201045,-0.921698,1.0,-0.447671,0.266287,-0.205842
long_lt,0.631898,-0.423377,0.94768,0.713867,0.205721,0.711545,-0.447671,1.0,0.731866,0.169752
short_lt,0.128942,0.164557,0.719269,0.923279,0.063945,0.087698,0.266287,0.731866,1.0,0.021379
word_difference_ratio_lt,0.181086,-0.174538,0.162716,0.043541,0.696595,0.209369,-0.205842,0.169752,0.021379,1.0


In [22]:
simsum_feas = [save_data_char, save_data_word]

### TF-IDF Distances

Create TF-IDF vectors from question texts and compute vector distances between them.

In [None]:
#from sklearn.feature_extraction.textsklearn  import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# vectorizer = TfidfVectorizer(
#     encoding='utf-8',
#     analyzer='word',
#     strip_accents='unicode',
#     ngram_range=(1, 1),
#     lowercase=True,
#     norm='l2',
#     use_idf=True,
#     smooth_idf=True,
#     sublinear_tf=True,
# )

def load_tfidf_dict(file_path):
    source_obj = open(file_path, 'r')
    tfidf_dt = {}
    for line in source_obj:
        word, tfidf_val = line.strip().split('&|&')
        word = word.decode('UTF-8')
        tfidf_dt[word] = float(tfidf_val)
    return tfidf_dt

def load_word_vec(file_path):
    source_obj = open(file_path, 'r')
    word_vec_dt = {}
    for i, line in enumerate(source_obj):
        if i == 0 and 'word2vec' in file_path:
            continue
        line = line.strip()
        line_lt = line.split()
        word = line_lt[0].decode('UTF-8')
        vec_list = [float(x) for x in line_lt[1:]]
        word_vec_dt[word] = vec_list
    return word_vec_dt

def get_sentence_vector(word_vec_dict, input_string_x1, tfidf_dict):
    vec_sentence = 0.0
    len_vec = len(word_vec_dict['花呗'.decode('UTF-8')])
    for word in input_string_x1:
        word_vec = word_vec_dict.get(word)
        word_tfidf = tfidf_dict.get(word)
        if word_vec is None or word_tfidf is None:
            continue
        vec_sentence += np.multiply(word_vec, word_tfidf)
    vec_sentence = vec_sentence / (np.sqrt(np.sum(np.power(vec_sentence, 2))))
    return vec_sentence

def cos_distance_bag_tfidf(input_string_x1, input_string_x2, word_vec_dict, tfidf_dict):
    sentence_vec1 = get_sentence_vector(word_vec_dict, input_string_x1, tfidf_dict)
    sentence_vec2 = get_sentence_vector(word_vec_dict, input_string_x2, tfidf_dict)
    numerator = np.sum(np.multiply(sentence_vec1, sentence_vec2))
    denominator = np.sqrt(np.sum(np.power(sentence_vec1, 2))) * np.sqrt(np.sum(np.power(sentence_vec2, 2)))
    cos_distance = float(numerator)/float(denominator)
    manhat_distance = np.sum(np.abs(np.subtract(sentence_vec1, sentence_vec2)))
    if np.isnan(manhat_distance):manhat_distance = 300
    manhat_distance = np.log(manhat_distance + 0.000001)/5.0
    
    canberra_distance = np.sum(np.abs(sentence_vec1 - sentence_vec2) / np.abs(sentence_vec2 + sentence_vec1))
    if np.isnan(canberra_distance):canberra_distance = 300
    canberra_distance = np.log(canberra_distance + 0.000001)/5.0
    
    minkow_distance = np.power(np.sum(np.power(np.abs(sentence_vec1-sentence_vec2), 3)), 0.333333)
    if np.isnan(minkow_distance):minkow_distance = 300
    minkow_distance = np.log(minkow_distance + 0.000001)/5.0
    
    euclidean_distance = np.sqrt(np.sum(np.power(sentence_vec1-sentence_vec2, 2)))
    if np.isnan(euclidean_distance):euclidean_distance = 300
    euclidean_distance = np.log(euclidean_distance + 0.000001)/5.0
    return cos_distance, manhat_distance, minkow_distance, euclidean_distance

tfidf_dict = load_tfidf_dict('../data/aux/sim_tfidf.txt')
word_vec_dict = load_word_vec('../data/aux/word2vec.txt')
fasttext_vect_dict = load_word_vec('../data/aux/fasttext.vec')

In [None]:
train_ori_data.iloc[0,:]

In [None]:
import jieba
jieba.add_word('花呗')
jieba.add_word('借呗')
jieba.add_word('收钱码')
jieba.add_word('收款码')
def cal_tfidf(csv_data):
    cos_distance_lt = []
    manhat_distance_lt = []
    minkow_distance_lt = []
    euclidean_distance_lt = []
    word_difference_ratio_lt = []
    for i in range(csv_data.shape[0]):
        id, ques1, ques2, label = list(csv_data.iloc[i,:])
        ques1 = ques1.decode('UTF-8')
        ques2 = ques2.decode('UTF-8')
        ques_lt1 = jieba.lcut(ques1)
        ques_lt2 = jieba.lcut(ques2)
        cos_distance, manhat_distance, minkow_distance, euclidean_distance = cos_distance_bag_tfidf(
            ques_lt1, ques_lt2, word_vec_dict, tfidf_dict)
        cos_distance_lt.append(cos_distance)
        manhat_distance_lt.append(manhat_distance)
        minkow_distance_lt.append(minkow_distance)
        euclidean_distance_lt.append(euclidean_distance)
    return cos_distance_lt, manhat_distance_lt, minkow_distance_lt, euclidean_distance_lt

In [None]:
cos_distance_lt, manhat_distance_lt, minkow_distance_lt, euclidean_distance_lt = cal_tfidf(train_ori_data)
save_data_word = pd.DataFrame({'cosine_distances':cos_distance_lt, 'manhat_distances':manhat_distance_lt,
                         'minkow_distances':minkow_distance_lt, 'euclidean_distances':euclidean_distance_lt})
save_data.to_csv('../tfidf_features.csv', index=False)

In [None]:
test_feas(save_data_word)

In [42]:
all_feas = pd.concat([ngram_feas[0],ngram_feas[1], lda_feas[0],lda_feas[1], simsum_feas[0], simsum_feas[1], save_data_word], axis=1)

In [44]:
y = train_ori_data.iloc[:, 3]
from sklearn.ensemble import RandomForestClassifier
def test_feas_all(X):
    Xt, Xv, yt, yv = train_test_split(X, y)
    clf = RandomForestClassifier(max_features= 'sqrt' ,n_estimators=100, oob_score = True, class_weight={0: 1.,1: 5,})
    clf.fit(Xt, yt)
    f1 = f1_score(yv, clf.predict(Xv))
    print 'weights: ', f1
test_feas(all_feas)

weights:  0.41826050537484294


In [32]:
from sklearn.grid_search import GridSearchCV
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True, class_weight={0: 1.,1: 5,}) 

param_grid = {
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(all_feas, y)



GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight={0: 1.0, 1: 5},
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=True, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 700], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [33]:
print CV_rfc.best_params_

{'max_features': 'sqrt', 'n_estimators': 700}
