In [183]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

import re

# Label for this experiment (not referenced by any other visible cell).
title = 'woplus'
# Directory that holds the pipe-delimited dataset_2015_filter_*.txt input files.
path = '../../sources/data'

In [141]:
def _load_monthly_pivot(name, prefixes):
    """Read one monthly dataset and pivot it to one row per subscriber.

    Parameters
    ----------
    name : str
        Dataset key; the file read is ``<path>/dataset_2015_filter_<name>.txt``
        (pipe-delimited).
    prefixes : list of str
        One prefix per value column in the file, in file order. Each prefix
        is expanded to ``<prefix>01`` .. ``<prefix>12``.

    Returns
    -------
    (raw, pivoted) : tuple of pandas.DataFrame
        The frame as read, and the frame pivoted on ``imsi`` x ``mon`` with
        flat column names.

    Notes
    -----
    The flat names are assigned positionally, exactly as the original
    per-dataset stanzas did: this assumes every file covers 12 months and
    that the pivoted columns come out grouped by value column, months
    ascending -- TODO confirm against the data. A different column count
    makes the assignment raise ValueError.
    """
    raw = pd.read_table('{}/dataset_2015_filter_{}.txt'.format(path, name), sep='|')
    pivoted = raw.pivot(index='imsi', columns='mon')
    pivoted.columns = ['{}{:02d}'.format(prefix, month)
                       for prefix in prefixes
                       for month in range(1, 13)]
    return raw, pivoted


df_net, df_net_piv = _load_monthly_pivot('net', ['net'])
df_age, df_age_piv = _load_monthly_pivot('age', ['age'])
df_sex, df_sex_piv = _load_monthly_pivot('sex', ['sex'])
df_arpu, df_arpu_piv = _load_monthly_pivot('arpu', ['arpu'])
df_stream, df_stream_piv = _load_monthly_pivot('stream', ['stream'])
df_sms, df_sms_piv = _load_monthly_pivot('sms', ['sms'])
df_talklen, df_talklen_piv = _load_monthly_pivot('talklen', ['talklen'])
# The 'me' file carries two value columns, named brand* and type* after pivoting.
df_me, df_me_piv = _load_monthly_pivot('me', ['brand', 'type'])

## Data Summary

In [135]:
def ValueCounts(das, nhead=5):
    """Summarise the ``nhead`` most frequent values of a Series as one row.

    Parameters
    ----------
    das : pandas.Series
        Column to summarise; its ``name`` becomes the row label.
    nhead : int, default 5
        Number of most-frequent values to report.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``das.name`` with columns
        ``Value 1..nhead``, ``Freq 1..nhead``, ``Freq Others`` (non-null rows
        whose value is not among the reported ones) and ``Freq NA`` (null
        rows). When the Series has fewer than ``nhead`` distinct values the
        unused Value/Freq slots are NaN.
    """
    # value_counts() ignores nulls and is ordered by descending frequency.
    # The row is built directly rather than through
    # reset_index().rename_axis({...}), because passing a mapping to
    # rename_axis is a deprecated API that current pandas rejects.
    top = das.value_counts().iloc[:nhead]
    padding = [np.nan] * (nhead - len(top))
    nnull = das.isnull().sum()
    row = (list(top.index) + padding
           + list(top.values) + padding
           + [das.shape[0] - nnull - top.sum(), nnull])
    labels = (['Value {}'.format(k + 1) for k in range(nhead)]
              + ['Freq {}'.format(k + 1) for k in range(nhead)]
              + ['Freq Others', 'Freq NA'])
    return pd.DataFrame([row], index=[das.name], columns=labels)

def Summary(da):
    """Build a per-column profile of a DataFrame.

    For every column of ``da`` the result holds its dtype (``type``), the
    number of non-null entries (``nnullCount``), every ``describe()``
    statistic except the first one, and the ``ValueCounts`` top-value
    summary. Rows follow the column order of ``da`` and the index is named
    ``columns``.
    """
    basics = pd.DataFrame({'type':da.dtypes, 'nnullCount':da.notnull().sum(axis=0)})
    stats = da.describe().T.iloc[:, 1:]
    top_values = pd.concat([ValueCounts(da.loc[:, col]) for col in da.columns])
    profile = pd.concat([basics, stats, top_values], axis=1).loc[da.columns]
    profile.index.name = 'columns'
    return profile

## Binarizing Categorical Features with One-Hot Encoding

In [145]:
def Cat_to_bin(das, a = 0.01):
    """Transform a categorical column into one-hot indicator columns.

    Parameters
    ----------
    das : pandas.Series
        Categorical column; its ``name`` prefixes the output column names.
    a : float, default 0.01
        Frequency threshold, as a share of all rows (nulls included). Only
        levels whose share is strictly greater than ``a`` get their own
        ``<name>_<level>`` column.

    Returns
    -------
    pandas.DataFrame
        Float 0/1 indicators indexed like ``das``. If the levels at or below
        the threshold together exceed ``a``, a ``<name>_Others`` column marks
        non-null rows that matched no kept level. When that leaves exactly
        two columns, they are collapsed into one column named ``das.name``
        holding the indicator of the first (sorted) level.
    """
    share = das.value_counts() / das.shape[0]
    cat = list(share.index[share > a])
    cat.sort()   # sort so the output column order is the same on every run

    # Build the indicator matrix directly. This replaces
    # OneHotEncoder(n_values=..., sparse=False), whose arguments have been
    # removed from scikit-learn, and yields the same matrix: column j is 1.0
    # where the row equals cat[j]; rare levels and nulls give an all-zero row.
    xbin = np.zeros((das.shape[0], len(cat)))
    for j, level in enumerate(cat):
        xbin[:, j] = (das == level).values

    dabin = pd.DataFrame(xbin, columns = ["{}_{}".format(das.name, x) for x in cat], index = das.index)
    if share[share <= a].sum() > a:
        # Non-null rows not covered by any kept level.
        others = das.notnull().astype(float) - dabin.sum(axis = 1)
        dabin = pd.concat([dabin, pd.DataFrame({"{}_Others".format(das.name): others})], axis = 1)
    if dabin.shape[1] == 2:
        # Two complementary indicators carry one bit of information: keep one.
        dabin = pd.DataFrame({das.name: xbin[:,0]}, index = das.index)
    return dabin

def CattoBin(da, a = 0.01):
    """One-hot encode every column of ``da`` with ``Cat_to_bin``.

    Each column is encoded independently with threshold ``a`` and the
    resulting frames are concatenated column-wise, in the column order of
    ``da``.
    """
    encoded = [Cat_to_bin(da.loc[:, col], a) for col in da.columns]
    return pd.concat(encoded, axis=1)

## Standardizing Numerical Features with StandardScaler

In [151]:
def SSTraining(da):
    """Fit a StandardScaler on ``da`` and return the scaled frame with it.

    Returns
    -------
    (scaled, scaler)
        ``scaled`` is a DataFrame with the same index and columns as ``da``
        holding the transformed values; ``scaler`` is the fitted
        StandardScaler (its ``mean_`` / ``scale_`` hold the fitted
        parameters).
    """
    scaler = StandardScaler()
    scaler.fit(da)
    scaled = pd.DataFrame(scaler.transform(da), index=da.index, columns=da.columns)
    return scaled, scaler

## Label Generation

In [207]:
def isChangingStrict(dl):
    """Return how many distinct values ``dl`` contains (per ``np.unique``)."""
    return len(np.unique(dl))

def _month_suffix(df, mon):
    """Return the month suffix under which ``df`` stores its brand column.

    The unpadded form is used when ``'brand' + str(mon)`` already exists, so
    any call that worked before resolves to the same columns. Otherwise the
    month is zero-padded to two digits (``1`` -> ``'01'``).
    """
    suffix = str(mon)
    if 'brand' + suffix in df.columns:
        return suffix
    return suffix.zfill(2)

def labelGenerate(df, Mon1, Mon2):
    """Flag subscribers whose handset changed between two months.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``brand<MM>`` and ``type<MM>`` columns for both months.
    Mon1, Mon2 : int or str
        The two months to compare. Single-digit months are zero-padded;
        previously ``str(1)`` produced ``'brand1'``, which does not match
        zero-padded names such as ``brand01`` and raised KeyError.

    Returns
    -------
    pandas.DataFrame
        One integer column ``Flag`` indexed like ``df``: 1 when the four
        cells (brand and type for both months) hold three or more distinct
        values, else 0.
    """
    m1 = _month_suffix(df, Mon1)
    m2 = _month_suffix(df, Mon2)
    df_phone = df.loc[:, ['brand' + m1, 'type' + m1, 'brand' + m2, 'type' + m2]]
    n_distinct = df_phone.apply(isChangingStrict, axis=1)
    # An unchanged handset repeats the same brand/type pair, giving at most
    # two distinct values; three or more means brand or type differs.
    return (n_distinct >= 3).astype(int).to_frame('Flag')

In [233]:
# Profile the monthly talk-length pivot: dtype, non-null count, describe() statistics and top values per column.
Summary(df_talklen_piv)

Unnamed: 0_level_0,nnullCount,type,mean,std,min,25%,50%,75%,max,Value 1,...,Value 3,Value 4,Value 5,Freq 1,Freq 2,Freq 3,Freq 4,Freq 5,Freq Others,Freq NA
columns,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
talklen01,360698,int64,287.699458,436.873262,0.0,27.0,141.0,366.0,11334.0,0,...,2,3,4,52098,3128,2469,2059,1917,299027,0
talklen02,360698,int64,226.66775,333.389978,0.0,20.0,116.0,299.0,10840.0,0,...,2,3,4,56120,3296,2691,2250,2164,294177,0
talklen03,360698,int64,294.181448,438.814782,0.0,26.0,146.0,379.0,12029.0,0,...,2,3,4,53716,2908,2421,1974,1911,297768,0
talklen04,360698,int64,287.63083,438.85053,0.0,25.0,139.0,365.0,13673.0,0,...,2,3,4,54626,2974,2428,1977,1960,296733,0
talklen05,360698,int64,278.399794,422.364847,0.0,23.0,135.0,356.0,11435.0,0,...,2,3,4,56049,3037,2525,2105,1917,295065,0
talklen06,360698,int64,273.134888,417.601984,0.0,21.0,132.0,349.0,13807.0,0,...,2,3,4,58391,3194,2625,2079,1889,292520,0
talklen07,360698,int64,271.608856,422.116105,0.0,19.0,128.0,345.0,15101.0,0,...,2,3,4,60478,3226,2549,2135,1923,290387,0
talklen08,360698,int64,262.327634,411.734707,0.0,16.0,121.0,334.0,14039.0,0,...,2,3,4,62223,3450,2734,2293,2086,287912,0
talklen09,360698,int64,263.363246,413.972533,0.0,15.0,122.0,336.0,11631.0,0,...,2,3,4,64338,3326,2586,2196,2073,286179,0
talklen10,360698,int64,250.354102,392.226581,0.0,13.0,116.0,320.0,12666.0,0,...,2,3,4,66078,3395,2638,2187,2067,284333,0


In [237]:
# Categorical features: one-hot encode each monthly pivot.
net_catbin, age_catbin, sex_catbin, arpu_catbin, stream_catbin = map(
    CattoBin,
    (df_net_piv, df_age_piv, df_sex_piv, df_arpu_piv, df_stream_piv),
)

# Numerical features: standardise and keep the fitted scalers.
# NOTE(review): the scalers are fitted on every row, before the later
# train/test split, so the test rows influence the scaling parameters.
sms_ss, sscaler_sms = SSTraining(df_sms_piv)
talklen_ss, sscaler_talklen = SSTraining(df_talklen_piv)

# Assemble the full feature matrix column-wise (categorical blocks first, then numerical).
feature_blocks = [net_catbin, age_catbin, sex_catbin, arpu_catbin, stream_catbin, sms_ss, talklen_ss]
df_features = pd.concat(feature_blocks, axis=1)

## Extract the Selected Month's Features from df_features (after concat)

In [238]:
def extractFeatures(da, mon):
    """Select the columns of ``da`` that belong to month ``mon``.

    A column belongs to a month when the first run of two digits in its name
    equals the zero-padded month (e.g. ``net03_4G`` -> ``'03'``).

    Parameters
    ----------
    da : pandas.DataFrame
        Feature frame whose column names embed a two-digit month.
    mon : int or str
        Month to select. Single-digit months are zero-padded; previously
        ``str(3)`` was compared with ``'03'``, so months 1-9 always selected
        nothing.

    Returns
    -------
    pandas.DataFrame
        The matching columns of ``da`` in their original order. Columns
        whose name contains no two-digit run are left out (previously they
        raised IndexError).
    """
    target = str(mon).zfill(2)
    keep = []
    for name in da.columns:
        found = re.search(r'\d{2}', str(name))
        keep.append(found is not None and found.group(0) == target)
    return da.loc[:, keep]

## Generate TrainSet

In [239]:
def genTrainSet(mon):
    """Build the training frame for month ``mon``.

    Takes the month-``mon`` columns of the global ``df_features`` and appends
    the ``Flag`` label computed from the global ``df_me_piv`` for months
    ``mon`` and ``mon + 1``. The label is the last column of the result.
    """
    features = extractFeatures(df_features, mon)
    labels = labelGenerate(df_me_piv, mon, mon+1)
    return pd.concat([features, labels], axis=1)

In [240]:
# Training frame for month 10: month-10 features plus the Flag label computed from months 10 and 11.
df_train_10 = genTrainSet(10)

In [242]:
# Split the training frame into the feature matrix (every column but the last)
# and the label vector (the last column).
m, n = df_train_10.shape
X = df_train_10.iloc[:, :-1].values
y = df_train_10.iloc[:, -1].values

In [260]:
from sklearn.linear_model import LogisticRegression

# GridSearchCV is imported from sklearn.model_selection (already used here for
# train_test_split); the old sklearn.grid_search module is deprecated and has
# been removed from later scikit-learn releases.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [244]:
# Hold out 30% of the rows as a test set; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Shapes can be checked with np.shape(X_train) / np.shape(X_test).

In [267]:
# BaseLine Score
LR = LogisticRegression()
LR.fit(X_train, y_train)
yHat = LR.predict(X_train)
yScore = LR.predict_proba(X_train)
y_testHat = LR.predict(X_test)
y_testScore = LR.predict_proba(X_test)
print u'训练集准确度：%.5f' % (np.mean(yHat == y_train))
print u'训练集AUC：%.5f'% roc_auc_score(y_train, yScore[:,1])
print classification_report(y_train, yHat)

print u'测试集准确度：%.2f' % (np.mean(y_testHat == y_test)) 
print u'测试集AUC：%.5f'% roc_auc_score(y_test, y_testScore[:,1])
print classification_report(y_test, y_testHat)

训练集准确度：0.70492
训练集AUC：0.63216
             precision    recall  f1-score   support

          0       0.70      1.00      0.83    177983
          1       0.00      0.00      0.00     74505

avg / total       0.50      0.70      0.58    252488

测试集准确度：0.70
测试集AUC：0.62638
             precision    recall  f1-score   support

          0       0.70      1.00      0.83     76198
          1       0.00      0.00      0.00     32012

avg / total       0.50      0.70      0.58    108210

