# Packages

In [1]:
import pandas as pd
import numpy as np
import os, re, csv, codecs, operator, sys, gc
from collections import defaultdict, OrderedDict
from tqdm import tqdm
import jieba
import seaborn as sns
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, KFold, train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Parameters

In [2]:
jieba.set_dictionary('dict.txt.big.txt')
CLEAN_WORD_PATH = None
TRAIN_DATA_FILE = 'data_original.csv'
TEST_DATA_FILE = 'test.csv'
FOLD_COUNT = 10
list_classes = ['不受理', '不成立', '成立', '當事人不到場', '聲請人撤回']

# Data Overview

In [3]:
train_df = pd.read_csv(TRAIN_DATA_FILE)

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [4]:
cleaned_comments = []
temp = train_df['調解內容與決議'].fillna('no sentence').values
for line in temp:
    if '經調解結果如左：' in line:
        a = line.split('經調解結果如左：')[0]
    else:
        a = line.split('經調解結果如下：')[0]
    cleaned_comments.append(a)
train_df['調解內容與決議'] = cleaned_comments

In [None]:
train_df.corr()

In [None]:
print(np.any(train_df.isnull()) == True)

In [5]:
train_comments = train_df['調解內容與決議'].fillna('no comment').values
train_comments_lengths = [len(str(s)) for s in train_comments]

In [None]:
def explore_comments(arr):
    print("MAX LENGTH:\t\t", np.max(arr))
    print("AVG LENGTH:\t\t", np.average(arr))
    print("MIN LENGTH:\t\t", np.min(arr))
    print("STANDARD DIVISION:\t", np.std(arr))
    print("RANGE:\t\t\t", np.min(arr), " to ", np.average(arr) + 2 * np.std(arr))
    
print("------Train------")
explore_comments(train_comments_lengths)

In [None]:
print(train_df['是否成立'].value_counts())

In [None]:
sns.set()
pd.Series(train_comments_lengths).astype(int).hist()
plt.show()

# Data Cleaning

## Transfer Lable to One-hot

In [6]:
Score = train_df['是否成立']
data = pd.get_dummies(Score)
train_df = pd.concat([train_df, data], axis=1)
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,收件日期,收件時間,結案日期,結案時間,協調開會方式,案件大類型,案件細節類型,name,...,調解人數,have_other,調解天數,縣市合併,調解內容與決議,不受理,不成立,成立,當事人不到場,聲請人撤回
0,0,9800147,2009/3/19,10:48,2009/3/19,15:00,獨任,A.民事,損害賠償,356000000000000.0,...,3,0,1,0,民國〈下同〉97年12月6日下午6時30分許，OOO駕駛皇冠交通股份有限公司所有（車號：99...,0,0,0,1,0
1,1,9800150,2009/3/19,19:49,2009/5/5,14:30,獨任,B.刑事,車禍傷害糾紛,2.68e+21,...,5,1,34,0,民國〈下同〉98年2月7日0時5分許，第3人XXX騎乘本人CE7－063號機車搭載梁信瑋與陳...,0,0,1,0,0
2,2,9800151,2009/3/23,08:50,2009/4/14,15:30,,A.民事,車禍損害賠償,278000000000000.0,...,3,0,17,0,民國〈下同〉98年1月25日19時45分許，汪國全騎乘本人所有981－CBS號機車，在臺南市...,0,0,0,1,0
3,3,9800152,2009/3/23,08:55,2009/3/31,15:00,獨任,B.刑事,車禍傷害糾紛,341000000000000.0,...,3,0,7,0,民國〈下同〉98年2月12日9時49分許，謝天智駕駛OOO所有N8－0250號自小客車，在臺...,0,0,0,0,1
4,4,9800153,2009/3/23,08:59,2009/4/16,15:00,,A.民事,財務糾紛,296000000000000.0,...,2,0,19,0,雙方因合夥投資，辦理企業貸款及退股爭議申請調解，,0,0,0,1,0


In [None]:
train_df.info()

In [None]:
x = train_df.iloc[:, 22:].sum()

plt.figure(figsize=(8, 4))
ax = sns.barplot(x.index, x.values, alpha=0.8)
plt.title("Summary")
plt.ylabel('Occurrences', fontsize=12)
plt.xlabel('Results', fontsize=12)

rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, ha='center', va='bottom', s='{:.1f}'.format(abs(label)))
    
plt.show()

In [None]:
colormap = plt.cm.plasma
plt.figure(figsize=(10, 8))
plt.title('Correlation of features', y=1.05, size=14)
sns.heatmap(data.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
len(pd.unique(train_df['people'].fillna('no people').values)) - 1

In [None]:
new_df = train_df.apply(lambda row: row['是否成立'] == '成立' or row['是否成立'] == '不成立', axis=1)

In [None]:
df_new = train_df[new_df]
len(df_new)

In [None]:
result = pd.DataFrame()
people, ty, success, unsuc= [], [], [], []
for n in pd.unique(df_new['people'].values):
    for t in pd.unique(df_new['案件細節類型'].values):
        a = df_new[df_new['people'] == n]
        if a[a['案件細節類型'] == t]['是否成立'].empty:
            continue
        else:
            b = a[a['案件細節類型'] == t]['是否成立'].value_counts()
            people.append(n)
            ty.append(t)
            if len(b) == 2:
                success.append(b['成立'])
                unsuc.append(b['不成立'])
            else:
                try:
                    success.append(b['成立'])
                    unsuc.append('0')
                except:
                    success.append('0')
                    unsuc.append(b['不成立'])
print(len(people), len(ty), len(success), len(unsuc))
result['people'] = people
result['案件細節類型'] = ty
result['成立'] = success
result['不成立'] = unsuc

In [None]:
result.to_csv('results/success.csv', encoding='big5', index=False)

## Processing Clearning

In [7]:
def clean_text(text, remove_stopwords=False):
    # remove url
    text = re.sub(r"(https?:\/\/)*(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    
    # Special expressions
    text = re.sub(r'〈下同〉', '', text)
    text = re.sub(r'（車號：[a-zA-Z0-9－]*號）', '', text)
    text = re.sub(r'（[\' \']*）', '', text)
    text = re.sub(r'（下同）', '', text)
    text = re.sub(r'\\r\\n[0-9a-zA-Z\\r\\n]*', '', text)
    text = re.sub(r'(口)', '', text)
    
    text = re.sub(r',', '，', text)
    text = re.sub(r'\.+', '...', text)
    text = re.sub(r'\.{6}', '...', text)
    text = re.sub(r'…', '...', text)
    text = re.sub(r';', '；', text)
    text = re.sub(r'°', '。', text)
    text = re.sub(r'】', ']', text)
    text = re.sub(r'【', '[', text)
    text = re.sub(r'\)', '\）', text)
    text = re.sub(r'\(', '\（', text)
    text = re.sub(r'“', '"', text)
    text = re.sub(r' ', '', text)
    text = re.sub(r'”', '"', text)
    text = re.sub(r'～', '~', text)
    text = re.sub(r'·', '。', text)
    text = re.sub(r'!', '！', text)
    text = re.sub(r'—', '-', text)
    text = re.sub(r'》', '\）', text)
    text = re.sub(r'《', '\（', text)
    text = re.sub(r'\?', '\？', text)
    text = re.sub(r'。。。', '...', text)
    text = re.sub(r'。。。。。。', '...', text)
    text = re.sub(r':', '：', text)
    
    text = special_alpha_removal.sub('', text)
    
    return text

# regex to remove all Non-Alpha Numeric and space
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

# regex to remove all Alpha Numeric and space
special_alpha_removal = re.compile(r'[a-z\d ]', re.IGNORECASE)

# regex to replace all numeric
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

# regex to replace ###...
replace_sharp = re.compile(r'[#]+', re.IGNORECASE)

In [8]:
cleaned_train_comments = []
print('Processing data cleaning...')

for text in train_comments:
    cleaned_train_comments.append(clean_text(text))
    
train_df['cleaned_comments'] = cleaned_train_comments
print('Done!')

Processing data cleaning...
Done!


## Word segmentation analysis

In [9]:
word_dict = defaultdict(int)

for sentence in tqdm(train_df['調解內容與決議']):
    seg_list = jieba.cut(str(sentence), cut_all=False)
    for word in seg_list:
        word_dict[word] += 1
word_dict = sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True)
print(len(word_dict))

  0%|          | 0/6277 [00:00<?, ?it/s]Building prefix dict from /home/eternal/下載/Competition/CL_HW/dict.txt.big.txt ...
Loading model from cache /tmp/jieba.u3a0a53d5fde3fc6cdb96c613fa12b168.cache
Loading model cost 1.193 seconds.
Prefix dict has been built succesfully.
100%|██████████| 6277/6277 [00:05<00:00, 1050.10it/s]

24176





In [10]:
cleaned_word_dict = defaultdict(int)

for sentence in tqdm(train_df['cleaned_comments']):
    seg_list = jieba.cut(str(sentence), cut_all=False)
    for word in seg_list:
        cleaned_word_dict[word] += 1
cleaned_word_dict = sorted(cleaned_word_dict.items(), key=operator.itemgetter(1), reverse=True)
print(len(cleaned_word_dict))

100%|██████████| 6277/6277 [00:03<00:00, 1723.73it/s]

16302





In [11]:
cut_sentences = []
for sentence in tqdm(train_df['cleaned_comments']):
    seg_list = jieba.cut(str(sentence), cut_all=False)
    cut_sentences.append(" ".join(seg_list))
train_df['cleaned_comments_cut'] = cut_sentences

100%|██████████| 6277/6277 [00:03<00:00, 1838.52it/s]


In [12]:
cut_train_sentences = train_df['cleaned_comments_cut']

## A quick view of vocabulary

In [13]:
chinese_list, sign_list, dig_english_list = [], [], []
for word, count in word_dict:
    for char in word:
        if char >= u'\u4E00' and char <= u"\u9FA5":
            chinese_list.append((word, count))
        elif (char >= u'\u0041' and char <= u'\u005A') or (char >= u'\u0061' and char <= u'\u007A') or (char >= u'\u0030' and char <= u'\u0039'):
            dig_english_list.append((word, count))
            break
        else:
            sign_list.append((word, count))
            break
sorted_dig_english_list = sorted(set(dig_english_list), key=lambda x: x[1], reverse=True)
sorted_sign_list = sorted(set(sign_list), key=lambda x: x[1], reverse=True)
sorted_chinese_list = sorted(set(chinese_list), key=lambda x: x[1], reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

chinese_word:  16019
dig_english_word:  8087
sign_count:  70
[('人', 14841), ('年', 8512), ('月', 8438), ('聲請', 7728), ('日', 7411), ('與', 6761), ('對造', 6365), ('下同', 6191), ('民國', 5278), ('解', 5174), ('請調', 5171), ('騎乘', 5154), ('故聲', 5008), ('機車', 4915), ('號', 4829), ('時', 4559), ('臺南市', 4410), ('車損', 4364), ('在', 4344), ('發生', 4338), ('車禍', 4114), ('北區', 3905), ('如下', 3750), ('同意', 3726), ('給付', 3634), ('不', 3608), ('結果', 3456), ('造人', 3390), ('駕駛', 3330), ('對', 3209), ('請', 3153), ('含', 2998), ('所有', 2978), ('體傷', 2743), ('重型', 2742), ('分許', 2719), ('兩造', 2666), ('本', 2453), ('汽車', 2450), ('新台幣', 2411), ('其餘', 2407), ('強制', 2394), ('路', 2359), ('責任保險', 2359), ('之', 2289), ('一', 2249), ('共計', 2239), ('調解', 2210), ('事件', 2206), ('二', 2174), ('萬元', 2171), ('民事', 2145), ('請求權', 2142), ('拋棄', 2141), ('費用', 2139), ('自小', 1934), ('他', 1879), ('及其', 1866), ('必要', 1848), ('一切', 1843), ('客車', 1758), ('追究', 1741), ('致聲', 1740), ('刑事責任', 1737), ('於', 1732), ('理賠金', 1724), ('醫療費', 1700), ('慰問金', 16

In [14]:
chinese_list, sign_list, dig_english_list = [], [], []
for word, count in cleaned_word_dict:
    for char in word:
        if char >= u'\u4E00' and char <= u"\u9FA5":
            chinese_list.append((word, count))
        elif (char >= u'\u0041' and char <= u'\u005A') or (char >= u'\u0061' and char <= u'\u007A') or (char >= u'\u0030' and char <= u'\u0039'):
            dig_english_list.append((word, count))
            break
        else:
            sign_list.append((word, count))
            break
sorted_dig_english_list = sorted(set(dig_english_list), key=lambda x: x[1], reverse=True)
sorted_sign_list = sorted(set(sign_list), key=lambda x: x[1], reverse=True)
sorted_chinese_list = sorted(set(chinese_list), key=lambda x: x[1], reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

chinese_word:  16250
dig_english_word:  0
sign_count:  52
[('人', 13487), ('聲請', 7755), ('與', 6733), ('對造', 6403), ('民國', 5276), ('解', 5174), ('請調', 5171), ('騎乘', 5153), ('故聲', 5009), ('機車', 4914), ('年月日時', 4434), ('臺南市', 4410), ('在', 4397), ('發生', 4340), ('車損', 4319), ('車禍', 4114), ('北區', 3905), ('如下', 3750), ('同意', 3726), ('號', 3683), ('給付', 3634), ('年月日', 3627), ('不', 3622), ('結果', 3456), ('駕駛', 3331), ('造人', 3266), ('對', 3169), ('請', 3117), ('含', 3010), ('所有', 2978), ('重型', 2742), ('兩造', 2666), ('分許', 2662), ('本', 2456), ('汽車', 2451), ('其餘', 2407), ('強制', 2394), ('責任保險', 2359), ('新台幣', 2324), ('體傷', 2305), ('之', 2241), ('共計', 2239), ('萬元', 2219), ('調解', 2210), ('事件', 2206), ('一', 2176), ('二', 2173), ('民事', 2145), ('請求權', 2142), ('拋棄', 2141), ('費用', 2139), ('自小', 1934), ('他', 1879), ('及其', 1866), ('必要', 1848), ('路', 1846), ('一切', 1843), ('客車', 1757), ('致聲', 1741), ('追究', 1741), ('刑事責任', 1737), ('於', 1731), ('理賠金', 1724), ('醫療費', 1700), ('慰問金', 1698), ('三', 1526), ('雙方', 1521), ('分', 

# Build Vocabulary

In [15]:
tokenizer = Tokenizer(num_words=20000, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [16]:
print('Automatically train vocab & tokenizer...')
tokenizer.fit_on_texts(cut_train_sentences)

train_sequences = tokenizer.texts_to_sequences(cut_train_sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Automatically train vocab & tokenizer...
Found 16286 unique tokens


# Tf-idf Feature Extraction

## Word-Level

In [17]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 6),
    max_features=200000
)
word_vectorizer.fit(cut_train_sentences)
train_word_features = word_vectorizer.transform(cut_train_sentences)
print('Word vectorization process Done!')

Word vectorization process Done!


In [18]:
train_tfidf_features = train_word_features.tocsr()

In [19]:
train_tfidf_features.shape

(6277, 200000)

## Temporary Veiw & Processing Col Features

In [20]:
cols = list(train_df.columns)

In [21]:
data_columns = np.array([cols[3]] + cols[5:13] + cols[14:21])
label_column = np.array([cols[13]])

In [22]:
train_df['結案時間'] = [replace_sharp.sub('99:99', str(text)) for text in train_df['結案時間'].fillna(' ').values]
train_df['收件時間'] = [replace_sharp.sub('99:99', str(text)) for text in train_df['收件時間'].fillna(' ').values]

In [23]:
new_col = []
for lin in train_df['對照人'].fillna(' ').values:
    temp = "000000"
    if temp in str(lin):
        new_col.append(lin)
    else:
        new_col.append("gb")

print(len(new_col))

6277


In [24]:
feature_data = []
for column in data_columns:
    try:
        le = LabelEncoder()
        le.fit(list(train_df[column].fillna(' ').values))
        feature_data.append(le.transform(list(train_df[column].fillna(' ').values)))
    except:
        print(column)
feature_data

[array([192, 616,  74, ...,  73,  79,  88]),
 array([46, 42, 50, ..., 13, 37, 37]),
 array([2, 2, 0, ..., 2, 2, 2]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([128, 224, 226, ..., 224, 128, 224]),
 array([182, 304, 142, ..., 133, 192, 182]),
 array([275, 645, 156, ..., 259, 327, 159]),
 array([3, 3, 3, ..., 1, 1, 1]),
 array([13, 18,  0, ...,  5,  0, 14]),
 array([   0,    0,    0, ...,   47, 1176,  532]),
 array([4, 4, 4, ..., 5, 5, 5]),
 array([1, 4, 4, ..., 1, 1, 1]),
 array([2, 4, 2, ..., 1, 1, 2]),
 array([0, 1, 0, ..., 0, 0, 0]),
 array([ 69, 200, 133, ...,  70,  70,  70]),
 array([0, 0, 0, ..., 1, 1, 1])]

In [25]:
train_feature_sets = []
for line in zip(feature_data[0], feature_data[0], feature_data[1], feature_data[2], feature_data[3], feature_data[4], feature_data[5], 
                feature_data[6], feature_data[7], feature_data[8], feature_data[9], feature_data[10], feature_data[11], feature_data[12],
                feature_data[13], feature_data[14], feature_data[15]):
    train_feature_sets.append(line)
train_feature_sets = np.array(train_feature_sets)

# Model Training

## Logistic Regression Model with Tfidf Features (Multi-Lables)

In [34]:
kfold = KFold(n_splits=FOLD_COUNT, shuffle=False)
tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features)):
    print('## In fold {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    models = []
    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        classifier = LogisticRegression(solver='sag', C=12.0)
        
        train_target = train_df[class_name][train_idx]
        
        classifier.fit(train_tfidf_features[train_idx], train_target)
        y_pred = classifier.predict(train_tfidf_features[train_idx])
        
        print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))
        
        val_target = train_df[class_name][test_idx]
        val_pred = classifier.predict(train_tfidf_features[test_idx])
        
        print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        predictions[class_name] = classifier.predict_proba(train_tfidf_features)[:, 1]
        
        models.append(classifier)
        
    tfidf_models.append(models)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

## In fold 1 ##
Processing 不受理 ...
Training accuracy is 0.9971676402903169
Validation accuracy is 0.9984076433121019
Processing 不成立 ...
Training accuracy is 0.9853071340060188
Validation accuracy is 0.7261146496815286
Processing 成立 ...
Training accuracy is 0.983713931669322
Validation accuracy is 0.6751592356687898
Processing 當事人不到場 ...
Training accuracy is 0.9867233138608603
Validation accuracy is 0.8455414012738853
Processing 聲請人撤回 ...
Training accuracy is 0.9741547176491414
Validation accuracy is 0.9554140127388535
## In fold 2 ##
Processing 不受理 ...
Training accuracy is 0.997344662772172
Validation accuracy is 0.9936305732484076
Processing 不成立 ...
Training accuracy is 0.9789343246592317
Validation accuracy is 0.7340764331210191
Processing 成立 ...
Training accuracy is 0.9743317401309967
Validation accuracy is 0.6863057324840764
Processing 當事人不到場 ...
Training accuracy is 0.9853071340060188
Validation accuracy is 0.8535031847133758
Processing 聲請人撤回 ...
Training accuracy is 0.97610196494

In [35]:
labels_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets)):
    print('## In fold {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    models = []
    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
        train_target = train_df[class_name][train_idx]
        
        classifier.fit(train_feature_sets[train_idx], train_target)
        y_pred = classifier.predict(train_feature_sets[train_idx])
        
        print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))
        
        val_target = train_df[class_name][test_idx]
        val_pred = classifier.predict(train_feature_sets[test_idx])
        
        print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        predictions[class_name] = classifier.predict_proba(train_feature_sets)[:, 1]
        
        models.append(classifier)
        
    labels_models.append(models)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')
print('K-fold cross validation Done!')

## In fold 1 ##
Processing 不受理 ...
Training accuracy is 0.9964595503628961
Validation accuracy is 0.9984076433121019
Processing 不成立 ...
Training accuracy is 0.7682775712515489
Validation accuracy is 0.7197452229299363
Processing 成立 ...
Training accuracy is 0.8364312267657993
Validation accuracy is 0.6369426751592356
Processing 當事人不到場 ...
Training accuracy is 0.8707735882457072
Validation accuracy is 0.85828025477707
Processing 聲請人撤回 ...
Training accuracy is 0.9433528058063374
Validation accuracy is 0.9570063694267515
## In fold 2 ##
Processing 不受理 ...
Training accuracy is 0.9969906178084617
Validation accuracy is 0.9936305732484076
Processing 不成立 ...
Training accuracy is 0.7663303239511418
Validation accuracy is 0.7388535031847133
Processing 成立 ...
Training accuracy is 0.8474066206408214
Validation accuracy is 0.6178343949044586
Processing 當事人不到場 ...
Training accuracy is 0.8727208355461143
Validation accuracy is 0.8439490445859873
Processing 聲請人撤回 ...
Training accuracy is 0.94494600814

# OOB (Out-of-Bag) Evaluation (Error ...)

In [36]:
print('Predicting training results...')

for i, model in enumerate(tfidf_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for j, class_name in enumerate(list_classes):
        predictions[class_name] = model[j].predict_proba(train_tfidf_features)[:, 1]
    
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
for i, model in enumerate(labels_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for j, class_name in enumerate(list_classes):
        predictions[class_name] = model[j].predict_proba(train_feature_sets)[:, 1]
        
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')

print('Done!')

Predicting training results...
Done!


In [37]:
result_list = ['results/Submission_file_{}.csv'.format(i) for i in range(0, FOLD_COUNT * 2)]
bagging(result_list, 'results/bagging.csv')
print('Bagging operation Done!')

Doing ensemble on
results/Submission_file_0.csv
results/Submission_file_1.csv
results/Submission_file_2.csv
results/Submission_file_3.csv
results/Submission_file_4.csv
results/Submission_file_5.csv
results/Submission_file_6.csv
results/Submission_file_7.csv
results/Submission_file_8.csv
results/Submission_file_9.csv
results/Submission_file_10.csv
results/Submission_file_11.csv
results/Submission_file_12.csv
results/Submission_file_13.csv
results/Submission_file_14.csv
results/Submission_file_15.csv
results/Submission_file_16.csv
results/Submission_file_17.csv
results/Submission_file_18.csv
results/Submission_file_19.csv
Bagging operation Done!


In [38]:
test_df = pd.read_csv('results/bagging.csv', encoding='big5')
result_label = test_df[list_classes]
results = result_label.idxmax(axis=1)

In [39]:
val_target = train_df['是否成立']
print('Validation accuracy is {}'.format(accuracy_score(results, val_target)))

Validation accuracy is 0.8988370240560778


In [46]:
print(len(results), len(val_target))

6277 6277


In [47]:
# Confusion Matrix
# '不受理': 0, '不成立': 1, '成立': 2, '當事人不到場': 3, '聲請人撤回': 4
TP_0, TP_1, TP_2, TP_3, TP_4 = 0, 0, 0, 0, 0
C_0, C_1, C_2, C_3, C_4 = 0, 0, 0, 0, 0
for i in range(len(results)):
    if val_target[i] == '不受理':
        C_0 += 1
        if results[i] == '不受理':
            TP_0 += 1
    elif val_target[i] == '不成立':
        C_1 += 1
        if results[i] == '不成立':
            TP_1 += 1
    elif val_target[i] == '成立':
        C_2 += 1
        if results[i] == '成立':
            TP_2 += 1
    elif val_target[i] == '當事人不到場':
        C_3 += 1
        if results[i] == '當事人不到場':
            TP_3 += 1
    else:
        C_4 += 1
        if results[i] == '聲請人撤回':
            TP_4 += 1

In [48]:
print('不受理: {}/{}, 不成立: {}/{}, 成立: {}/{}, 當事人不到場: {}/{}, 聲請人撤回: {}/{}'.format(TP_0, C_0, TP_1, C_1, TP_2, C_2, TP_3, C_3, TP_4, C_4))

不受理: 2/21, 不成立: 1225/1497, 成立: 3575/3580, 當事人不到場: 668/832, 聲請人撤回: 172/347


In [49]:
print('accuracy: 不受理: {}, 不成立: {}, 成立: {}, 當事人不到場: {}, 聲請人撤回: {}'.format(TP_0 / C_0, TP_1 / C_1, TP_2 / C_2, TP_3 / C_3, TP_4 / C_4))

accuracy: 不受理: 0.09523809523809523, 不成立: 0.8183032732130928, 成立: 0.9986033519553073, 當事人不到場: 0.8028846153846154, 聲請人撤回: 0.4956772334293948


## Multi-Classification

In [None]:
train_label = train_df['是否成立']

In [None]:
tfidf_models = []
kfold = StratifiedKFold(n_splits=FOLD_COUNT, shuffle=False)
multi_classifier_tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(solver='sag', C=12.0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    train_target = train_label[train_idx]
    classifier.fit(train_tfidf_features[train_idx], train_target)
    y_pred = classifier.predict(train_tfidf_features[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))

    val_target = train_label[test_idx]
    val_pred = classifier.predict(train_tfidf_features[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        
    multi_classifier_tfidf_models.append(classifier)
    predictions = classifier.predict(train_tfidf_features)
        
    tfidf_models.append(classifier)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

In [None]:
label_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    train_target = train_label[train_idx]

    classifier.fit(train_feature_sets[train_idx], train_target)
    y_pred = classifier.predict(train_feature_sets[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))

    val_target = train_label[test_idx]
    val_pred = classifier.predict(train_feature_sets[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        
    multi_classifier_tfidf_models.append(classifier)
    predictions = classifier.predict(train_feature_sets)
        
    label_models.append(classifier)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

# ExtraTreesClassifier

In [None]:
accs = []
et_predictions = OrderedDict()
et_predictions['id'] = test_df['id']

for class_name in list_classes:
    train_target = train_df[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_score = np.mean(cross_val_score(classifier, train_tfidf_features, train_target, cv=10, scoring='roc_auc'))
    accs.append(cv_score)
    print('CV Score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(train_tfidf_features, train_target)
    et_predictions[class_name] = classifier.predict_proba(test_tfidf_features)[:, 1]
    
submission = pd.DataFrame.from_dict(et_predictions)
submission.to_csv('result/LR_Based/ExtraTreesClassifier_Submission.csv', index=False)

# Predictions (Optional)

In [None]:
for i, model in enumerate(tfidf_models):
    print('## In Model {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = test_df['id']
    
    for class_name in list_classes:
        predictions[class_name] = model.predict_proba(test_tfidf_features)[:, 1]
        print('Predict the proba for {} Done!'.format(class_name))
        print(predictions.keys())
    
    print(predictions.keys())
    submission = pd.DataFrame.from_dict(predictions)
    submission.to_csv('Logistic_Regression_Submission_{}.csv'.format(i), index=False)

# Result Ensemble

In [30]:
def bagging(arrs, path):
    print("Doing ensemble on")
    subs = []
    for arr in arrs:
        print(arr)
        subs.append(pd.read_csv(arr, encoding='big5'))
    
    for sub in subs[1:]:
        for c in list_classes:
            subs[0][c] += sub[c]
    
    for c in list_classes:
        subs[0][c] /= len(subs)
        
    subs[0].to_csv(path, index=False, encoding='big5')