In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from datetime import datetime,timedelta  
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline



In [2]:
path='./input/'
data=pd.DataFrame()
sex_age=pd.read_excel('./input/性别年龄对照表.xlsx')

In [3]:
deviceid_packages=pd.read_csv(path+'deviceid_packages.tsv',sep='\t',names=['device','apps'])
deviceid_test=pd.read_csv(path+'deviceid_test.tsv',sep='\t',names=['device'])
deviceid_train=pd.read_csv(path+'deviceid_train.tsv',sep='\t',names=['device','sex','age'])

In [4]:
deviceid_train=pd.concat([deviceid_train,deviceid_test])

In [5]:
deviceid_packages['apps']=deviceid_packages['apps'].apply(lambda x:x.split(','))
deviceid_packages['app_lenghth']=deviceid_packages['apps'].apply(lambda x:len(x))

In [6]:
apps=deviceid_packages['apps'].apply(lambda x:' '.join(x)).tolist()
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf=transformer.fit_transform(cntTf)
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
df_weight=pd.DataFrame(weight)
feature=df_weight.columns
df_weight['sum']=0
for f in tqdm(feature):
    df_weight['sum']+=df_weight[f]
deviceid_packages['tfidf_sum']=df_weight['sum']

100%|██████████| 35000/35000 [01:52<00:00, 312.20it/s]


In [7]:
lda = LatentDirichletAllocation(n_topics=5,
                                learning_offset=50.,
                                random_state=666)
docres = lda.fit_transform(cntTf)



In [8]:
deviceid_packages=pd.concat([deviceid_packages,pd.DataFrame(docres)],axis=1)

In [9]:
temp=deviceid_packages.drop('apps',axis=1)
deviceid_train=pd.merge(deviceid_train,temp,on='device',how='left')

In [10]:
deviceid_train['sex']=deviceid_train['sex'].apply(lambda x:str(x))
deviceid_train['age']=deviceid_train['age'].apply(lambda x:str(x))
def tool(x):
    if x=='nan':
        return x
    else:
        return str(int(float(x)))
deviceid_train['sex']=deviceid_train['sex'].apply(tool)
deviceid_train['age']=deviceid_train['age'].apply(tool)
deviceid_train['sex_age']=deviceid_train['sex']+'-'+deviceid_train['age']

In [11]:
deviceid_train=deviceid_train.replace({'nan':np.NaN,'nan-nan':np.NaN})

In [12]:
deviceid_train.head()

Unnamed: 0,age,device,sex,app_lenghth,tfidf_sum,0,1,2,3,4,sex_age
0,3,bd86d59afa24a839ce6029d718accb19,1,16,3.457704,0.089292,0.01177,0.808878,0.01186,0.0782,1-3
1,5,e7d158c9a8262a35c9cc630a15a9103e,1,12,3.037932,0.015437,0.092414,0.646381,0.015462,0.230306,1-5
2,5,97abdc3828448b5acc7428dd307bc635,2,3,1.538901,0.050341,0.05,0.798611,0.050298,0.050751,2-5
3,4,e4dbdbf07c9cff03d79f4872e65742b4,1,5,2.221036,0.034773,0.033333,0.863134,0.034388,0.034372,1-4
4,3,6bd4537b2970c5c6ab765c1860b88aa5,1,21,4.114661,0.102307,0.009092,0.278817,0.375586,0.234198,1-3


In [13]:
#New add
#deviceid_train.rename({'device_id':'device'}, axis=1, inplace=True)

from  tiny.util import *
deviceid_train = extend_feature(version='1',span_no=4, input=deviceid_train)

print(len(deviceid_train))
deviceid_train.groupby('max_day_cnt')['max_day_cnt'].count()
#deviceid_train.head()

#deviceid_train.rename({'device':'device_id'}, axis=1, inplace=True)

2018-09-09 21:55:06,633 util_log.py[29] INFO Begin to run extend_percent with:[], {}
2018-09-09 21:55:06,636 util_cache_file.py[60] DEBUG fn:extend_percent, para:['1', 4], kw:{}
2018-09-09 21:55:06,637 util_cache_file.py[21] DEBUG try to read cache from file:./cache/extend_percent_['1', 4]_{}.csv
2018-09-09 21:55:07,079 util_cache_file.py[28] DEBUG Return 72727 resut from file cache:./cache/extend_percent_['1', 4]_{}.csv
2018-09-09 21:55:07,080 util_log.py[34] INFO cost: 0.45 sec: ==='extend_percent' end ([], {}) 
2018-09-09 21:55:07,082 util_log.py[29] INFO Begin to run extend_cols with:[], {}
2018-09-09 21:55:07,083 util_log.py[29] INFO Begin to run get_brand with:[], {}
2018-09-09 21:55:07,167 util_log.py[29] INFO Begin to run convert_label_encode with:[['device']], {}
2018-09-09 21:55:07,258 util_log.py[34] INFO cost: 0.09 sec: ==='convert_label_encode' end ([['device']], {}) 
2018-09-09 21:55:07,260 util_log.py[34] INFO cost: 0.18 sec: ==='get_brand' end ([], {}) 
2018-09-09 21:55

72727


max_day_cnt
1     3227
2     3842
3     4258
4     4268
5     5623
6     9852
7    41657
Name: max_day_cnt, dtype: int64

In [14]:
train=deviceid_train[deviceid_train['sex'].notnull()]
test=deviceid_train[deviceid_train['sex'].isnull()]

In [15]:
X=train.drop(['sex','age','sex_age','device'],axis=1)
Y=train['sex_age']
Y_CAT=pd.Categorical(Y)
X_train,X_test, y_train, y_test =train_test_split(X,Y_CAT.labels,test_size=0.3, random_state=666)
lgb_train=lgb.Dataset(X_train,label=y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {
    'boosting_type': 'gbdt',
    'max_depth':3,
    'metric': {'multi_logloss'},
    'num_class':22,
    'objective':'multiclass',
    'random_state':666,
    
}

  after removing the cwd from sys.path.


In [16]:
gbm = lgb.train(params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=lgb_eval,
        early_stopping_rounds=300)

[1]	valid_0's multi_logloss: 3.05522
Training until validation scores don't improve for 300 rounds.
[2]	valid_0's multi_logloss: 3.0241
[3]	valid_0's multi_logloss: 2.99724
[4]	valid_0's multi_logloss: 2.97323
[5]	valid_0's multi_logloss: 2.95206
[6]	valid_0's multi_logloss: 2.93309
[7]	valid_0's multi_logloss: 2.91645
[8]	valid_0's multi_logloss: 2.90147
[9]	valid_0's multi_logloss: 2.88771
[10]	valid_0's multi_logloss: 2.87512
[11]	valid_0's multi_logloss: 2.86366
[12]	valid_0's multi_logloss: 2.85332
[13]	valid_0's multi_logloss: 2.84386
[14]	valid_0's multi_logloss: 2.83506
[15]	valid_0's multi_logloss: 2.82694
[16]	valid_0's multi_logloss: 2.81953
[17]	valid_0's multi_logloss: 2.81288
[18]	valid_0's multi_logloss: 2.80679
[19]	valid_0's multi_logloss: 2.80099
[20]	valid_0's multi_logloss: 2.79569
[21]	valid_0's multi_logloss: 2.79097
[22]	valid_0's multi_logloss: 2.78637
[23]	valid_0's multi_logloss: 2.78231
[24]	valid_0's multi_logloss: 2.77835
[25]	valid_0's multi_logloss: 2.774

[218]	valid_0's multi_logloss: 2.73379
[219]	valid_0's multi_logloss: 2.73391
[220]	valid_0's multi_logloss: 2.734
[221]	valid_0's multi_logloss: 2.73418
[222]	valid_0's multi_logloss: 2.73422
[223]	valid_0's multi_logloss: 2.73438
[224]	valid_0's multi_logloss: 2.73441
[225]	valid_0's multi_logloss: 2.73447
[226]	valid_0's multi_logloss: 2.73446
[227]	valid_0's multi_logloss: 2.73449
[228]	valid_0's multi_logloss: 2.7347
[229]	valid_0's multi_logloss: 2.73487
[230]	valid_0's multi_logloss: 2.735
[231]	valid_0's multi_logloss: 2.73504
[232]	valid_0's multi_logloss: 2.73509
[233]	valid_0's multi_logloss: 2.73517
[234]	valid_0's multi_logloss: 2.73519
[235]	valid_0's multi_logloss: 2.73518
[236]	valid_0's multi_logloss: 2.73523
[237]	valid_0's multi_logloss: 2.7353
[238]	valid_0's multi_logloss: 2.73536
[239]	valid_0's multi_logloss: 2.73554
[240]	valid_0's multi_logloss: 2.7356
[241]	valid_0's multi_logloss: 2.73571
[242]	valid_0's multi_logloss: 2.73582
[243]	valid_0's multi_logloss: 2

In [17]:
pre_x=test.drop(['sex','age','sex_age','device_id'],axis=1)
sub=pd.DataFrame(gbm.predict(pre_x.values,num_iteration=gbm.best_iteration))

ValueError: labels ['device_id'] not contained in axis

In [None]:
sub.columns=Y_CAT.categories
sub['DeviceID']=test['device_id'].values
sub=sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]

In [None]:
sub.to_csv('./sub/baseline.csv',index=False)