In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import matplotlib as mp
from datetime import datetime
import os
import gc
from tqdm import tqdm
tqdm.pandas()
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
import matplotlib.pyplot as plt

In [2]:
# Read the train and test CSVs into DataFrames and keep the label column
# of the training frame under its own name.
train, test = (
    pd.read_csv("../input/train.csv", sep=','),
    pd.read_csv("../input/test.csv", sep=','),
)
train_y = train['target']

In [3]:
def add_features(df):
    """Add hand-crafted text statistics of ``question_text`` to ``df``.

    The frame is modified in place (new columns are assigned on ``df``) and
    the same object is returned for convenience.

    Columns added:
        num_chars        length of the text in characters
        num_words        number of whitespace-separated tokens
        num_capital      number of upper-case characters
        capital_rate     num_capital / num_chars
        num_uniquewords  number of distinct tokens
        unique_rate      num_uniquewords / num_words
        num_titlewords   number of tokens for which ``str.istitle()`` is true
        title_rate       num_titlewords / num_words
        num_upperwords   number of tokens for which ``str.isupper()`` is true
        upper_rate       num_upperwords / num_words
        num_exc          number of "!" characters (uint16)
        num_q            number of "?" characters (uint16)
        mean_word_len    mean token length (NaN when there are no tokens)
        max_word_len     longest token length (0 when there are no tokens)

    The ratio columns are NaN for rows whose denominator and numerator are
    both zero (e.g. a whitespace-only question has zero words).

    Requires ``tqdm.pandas()`` to have been called so that
    ``Series.progress_apply`` exists.

    Args:
        df: DataFrame with a ``question_text`` column.

    Returns:
        The same DataFrame, with the columns above added.
    """
    def _mean_word_len(text):
        # np.mean([]) would return NaN *and* emit a RuntimeWarning; return the
        # NaN directly for token-less text so the column value is unchanged.
        lengths = [len(w) for w in text.split()]
        return np.mean(lengths) if lengths else np.nan

    # Force every entry to str so the string methods below are always valid.
    df['question_text'] = df['question_text'].progress_apply(str)
    # Length of the string in characters.
    df['num_chars'] = df['question_text'].progress_apply(len)
    # Number of words (raw string: '\S' is not a valid str escape sequence).
    df['num_words'] = df.question_text.str.count(r'\S+')
    # Number of upper-case characters, see
    # https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-qiqc
    df['num_capital'] = df['question_text'].progress_apply(lambda x: sum(1 for c in x if c.isupper()))
    # Share of upper-case characters among all characters.
    df['capital_rate'] = df['num_capital'] / df['num_chars']

    # Number of distinct words.
    df['num_uniquewords'] = df['question_text'].progress_apply(lambda x: len(set(x.split())))
    df['unique_rate'] = df['num_uniquewords'] / df['num_words']

    # Number of title-cased words (str.istitle() is true).
    df["num_titlewords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.istitle()]))
    # Share of title-cased words among all words.
    df['title_rate'] = df['num_titlewords'] / df['num_words']

    # Number of fully upper-case words (str.isupper() is true).
    df["num_upperwords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.isupper()]))
    df['upper_rate'] = df['num_upperwords'] / df['num_words']

    # Number of "!" characters.
    df["num_exc"] = df["question_text"].progress_apply(lambda x: x.count("!")).astype('uint16')
    # Number of "?" characters.
    df["num_q"] = df['question_text'].progress_apply(lambda x: x.count("?")).astype('uint16')
    # Mean word length.
    df["mean_word_len"] = df["question_text"].progress_apply(_mean_word_len)
    # Maximum word length; default=0 keeps max() from raising ValueError on
    # empty or whitespace-only questions.
    df["max_word_len"] = df['question_text'].progress_apply(
        lambda x: max((len(w) for w in x.split()), default=0))

    return df
# Enrich both frames with the text statistics computed by add_features.
train, test = add_features(train), add_features(test)

100%|██████████| 1306122/1306122 [00:01<00:00, 1228545.03it/s]
100%|██████████| 1306122/1306122 [00:01<00:00, 1225738.50it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 222097.71it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 407125.90it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 417413.26it/s]
100%|██████████| 1306122/1306122 [00:02<00:00, 442215.37it/s]
100%|██████████| 1306122/1306122 [00:01<00:00, 984290.01it/s] 
100%|██████████| 1306122/1306122 [00:01<00:00, 980635.42it/s] 
100%|██████████| 1306122/1306122 [00:18<00:00, 72539.00it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 395582.54it/s]
100%|██████████| 56370/56370 [00:00<00:00, 944515.13it/s]
100%|██████████| 56370/56370 [00:00<00:00, 1190336.24it/s]
100%|██████████| 56370/56370 [00:00<00:00, 222542.06it/s]
100%|██████████| 56370/56370 [00:00<00:00, 405488.97it/s]
100%|██████████| 56370/56370 [00:00<00:00, 404496.26it/s]
100%|██████████| 56370/56370 [00:00<00:00, 438829.35it/s]
100%|██████████| 56370/56370

In [5]:
# Every column except the id, the raw text and the label is used as a feature.
feat_list = [
    column
    for column in train.columns.tolist()
    if column not in ['qid','question_text','target']
]

# Feature-only view of the training frame.
train_x = train[feat_list]

In [3]:
# Start the wall-clock timer for the TF-IDF + SVD stage.
begin = datetime.now()

# TF-IDF over 1- to 3-grams with English stop words removed, capped at 90000 terms.
tfidf_v = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,3),
    max_features=90000,
)
matrixTFIDF = tfidf_v.fit_transform(train.question_text)
# Alternative kept for reference: fit on the positive class only.
# matrixTFIDF= tfidf_v.fit_transform(train[train.target==1].question_text)

# Reduce the sparse TF-IDF matrix to 30 dense components.
svd = TruncatedSVD(
    n_components=30,
    n_iter=10,
    random_state=42,
)
X = svd.fit_transform(matrixTFIDF)
print('time used:',datetime.now() - begin)

# begin = datetime.now()
# lda=LatentDirichletAllocation(n_components=15,random_state=42)
# Z=lda.fit_transform(matrixTFIDF)  
# print('time used:',datetime.now() - begin)

time used: 0:02:01.751828


In [10]:
# Refit the decomposition with 40 components on the same TF-IDF matrix.
# This rebinds both `svd` and `X`, replacing any earlier fit.
svd = TruncatedSVD(
    n_components=40,
    n_iter=10,
    random_state=42,
)
X = svd.fit_transform(matrixTFIDF)

In [4]:
def F1_best_score(y_true, y_pred):
    """Return the best F1 score over a fixed grid of decision thresholds.

    Predictions are binarised with ``y_pred > threshold`` for each threshold
    in 0.00, 0.01, ..., 0.49 and the highest resulting F1 score is kept.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted scores; any array-like (list, ndarray, Series).

    Returns:
        A 3-tuple ``("my_score", best_score, True)``. ``best_score`` is 0 when
        no threshold yields a positive F1.
        NOTE(review): the tuple shape looks like LightGBM's custom-metric
        convention (name, value, is_higher_better) -- confirm against the
        caller, none is visible in this notebook.
    """
    # Coerce once so plain Python lists support the element-wise comparison.
    scores = np.asarray(y_pred)
    best_score = 0
    for step in range(50):
        threshold = step * 0.01
        score = f1_score(y_true=y_true, y_pred=(scores > threshold).astype(int))
        if score > best_score:
            best_score = score
    return "my_score", best_score, True

In [11]:
def train_single():
    """Train one LightGBM classifier on a single hold-out split and report F1.

    Uses the module-level ``X`` (features) and ``train_y`` (labels). The data
    is split 80/20 with stratification on the label, a binary LGBMClassifier
    is fitted while AUC is evaluated on both the fit and hold-out parts, and
    the hold-out probabilities are then thresholded at 0.00, 0.01, ..., 0.49
    to find the threshold with the highest F1 score.

    Prints the best score and threshold; returns None.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        X, train_y, test_size=0.2, random_state=2018, stratify=train_y
    )

    # NOTE(review): early stopping is requested both here (early_stopping_round)
    # and in fit() (early_stopping_rounds) with the same value -- presumably
    # redundant; confirm against the installed LightGBM version before removing.
    clf = lgb.LGBMClassifier(
        learning_rate=0.05,
        objective='binary',
        reg_alpha=0,
        num_leaves=32,
        subsample=0.8,
        colsample_bytree=1,
        n_estimators=2000,
        early_stopping_round=100,
    )

    clf.fit(
        fit_x,
        fit_y,
        eval_set=[(fit_x, fit_y), (holdout_x, holdout_y)],
        verbose=50,
        eval_metric=['auc'],
        early_stopping_rounds=100,
    )

    # Probability of the positive class at the best iteration found by fit().
    pred_val_y = clf.predict_proba(holdout_x, num_iteration=clf.best_iteration_)[:, 1]
    # Test-set prediction kept for reference (test_tfidf is not defined here):
    # pred_test_y = clf.predict_proba(test_tfidf,num_iteration=clf.best_iteration_)[:,1]

    # Grid search for the threshold maximising F1; ties keep the lowest threshold.
    best_threshold, best_score = 0, 0
    for step in range(50):
        candidate = step * 0.01
        candidate_score = f1_score(holdout_y, (pred_val_y > candidate).astype(int))
        if candidate_score > best_score:
            best_threshold, best_score = candidate, candidate_score

    print('best score:%f,best threshold:%f'%(best_score, best_threshold))
    gc.collect()
    return None

In [12]:
# Run the single-split experiment: fits the classifier and prints the best
# F1 score and threshold found on the hold-out part.
train_single()



Training until validation scores don't improve for 100 rounds.
[50]	valid_0's auc: 0.875307	valid_0's binary_logloss: 0.168805	valid_1's auc: 0.870797	valid_1's binary_logloss: 0.170171
[100]	valid_0's auc: 0.887616	valid_0's binary_logloss: 0.161096	valid_1's auc: 0.881881	valid_1's binary_logloss: 0.16322
[150]	valid_0's auc: 0.894428	valid_0's binary_logloss: 0.157291	valid_1's auc: 0.887475	valid_1's binary_logloss: 0.160137
[200]	valid_0's auc: 0.899094	valid_0's binary_logloss: 0.154771	valid_1's auc: 0.890879	valid_1's binary_logloss: 0.158303
[250]	valid_0's auc: 0.902738	valid_0's binary_logloss: 0.152901	valid_1's auc: 0.892823	valid_1's binary_logloss: 0.157254
[300]	valid_0's auc: 0.90607	valid_0's binary_logloss: 0.151276	valid_1's auc: 0.894293	valid_1's binary_logloss: 0.156478
[350]	valid_0's auc: 0.908695	valid_0's binary_logloss: 0.149925	valid_1's auc: 0.895267	valid_1's binary_logloss: 0.155957
[400]	valid_0's auc: 0.911395	valid_0's binary_logloss: 0.148562	valid_1