# Packages

In [1]:
import pandas as pd
import numpy as np
import os, re, csv, codecs, operator, sys, gc
from collections import defaultdict, OrderedDict
from tqdm import tqdm
import jieba
import seaborn as sns
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, KFold, train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


# Parameters

In [2]:
jieba.set_dictionary('dict.txt.big.txt')
CLEAN_WORD_PATH = None
TRAIN_DATA_FILE = 'data_original.csv'
TEST_DATA_FILE = 'test.csv'
FOLD_COUNT = 10
list_classes = ['不受理', '不成立', '成立', '當事人不到場', '聲請人撤回']

# Data Overview

In [3]:
train_df = pd.read_csv(TRAIN_DATA_FILE)

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
cleaned_comments = []
temp = train_df['調解內容與決議'].fillna('no sentence').values
for line in temp:
    if '經調解結果如左：' in line:
        a = line.split('經調解結果如左：')[0]
    else:
        a = line.split('經調解結果如下：')[0]
    cleaned_comments.append(a)
train_df['調解內容與決議'] = cleaned_comments

In [None]:
train_df.corr()

In [None]:
print(np.any(train_df.isnull()) == True)

In [4]:
train_comments = train_df['調解內容與決議'].fillna('no comment').values
train_comments_lengths = [len(str(s)) for s in train_comments]

In [None]:
def explore_comments(arr):
    print("MAX LENGTH:\t\t", np.max(arr))
    print("AVG LENGTH:\t\t", np.average(arr))
    print("MIN LENGTH:\t\t", np.min(arr))
    print("STANDARD DIVISION:\t", np.std(arr))
    print("RANGE:\t\t\t", np.min(arr), " to ", np.average(arr) + 2 * np.std(arr))
    
print("------Train------")
explore_comments(train_comments_lengths)

In [None]:
print(train_df['是否成立'].value_counts())

In [None]:
sns.set()
pd.Series(train_comments_lengths).astype(int).hist()
plt.show()

# Data Cleaning

## Transfer Lable to One-hot

In [5]:
Score = train_df['是否成立']
data = pd.get_dummies(Score)
train_df = pd.concat([train_df, data], axis=1)
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,收件日期,收件時間,結案日期,結案時間,協調開會方式,案件大類型,案件細節類型,name,...,調解人數,have_other,調解天數,縣市合併,調解內容與決議,不受理,不成立,成立,當事人不到場,聲請人撤回
0,0,9800147,2009/3/19,10:48,2009/3/19,15:00,獨任,A.民事,損害賠償,356000000000000.0,...,3,0,1,0,民國〈下同〉97年12月6日下午6時30分許，OOO駕駛皇冠交通股份有限公司所有（車號：99...,0,0,0,1,0
1,1,9800150,2009/3/19,19:49,2009/5/5,14:30,獨任,B.刑事,車禍傷害糾紛,2.68e+21,...,5,1,34,0,民國〈下同〉98年2月7日0時5分許，第3人XXX騎乘本人CE7－063號機車搭載梁信瑋與陳...,0,0,1,0,0
2,2,9800151,2009/3/23,08:50,2009/4/14,15:30,,A.民事,車禍損害賠償,278000000000000.0,...,3,0,17,0,民國〈下同〉98年1月25日19時45分許，汪國全騎乘本人所有981－CBS號機車，在臺南市...,0,0,0,1,0
3,3,9800152,2009/3/23,08:55,2009/3/31,15:00,獨任,B.刑事,車禍傷害糾紛,341000000000000.0,...,3,0,7,0,民國〈下同〉98年2月12日9時49分許，謝天智駕駛OOO所有N8－0250號自小客車，在臺...,0,0,0,0,1
4,4,9800153,2009/3/23,08:59,2009/4/16,15:00,,A.民事,財務糾紛,296000000000000.0,...,2,0,19,0,雙方因合夥投資，辦理企業貸款及退股爭議申請調解，經調解結果如下：,0,0,0,1,0


In [None]:
train_df.info()

In [None]:
x = train_df.iloc[:, 22:].sum()

plt.figure(figsize=(8, 4))
ax = sns.barplot(x.index, x.values, alpha=0.8)
plt.title("Summary")
plt.ylabel('Occurrences', fontsize=12)
plt.xlabel('Results', fontsize=12)

rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, ha='center', va='bottom', s='{:.1f}'.format(abs(label)))
    
plt.show()

In [None]:
colormap = plt.cm.plasma
plt.figure(figsize=(10, 8))
plt.title('Correlation of features', y=1.05, size=14)
sns.heatmap(data.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
len(pd.unique(train_df['people'].fillna('no people').values)) - 1

In [None]:
new_df = train_df.apply(lambda row: row['是否成立'] == '成立' or row['是否成立'] == '不成立', axis=1)

In [None]:
df_new = train_df[new_df]
len(df_new)

In [None]:
result = pd.DataFrame()
people, ty, success, unsuc= [], [], [], []
for n in pd.unique(df_new['people'].values):
    for t in pd.unique(df_new['案件細節類型'].values):
        a = df_new[df_new['people'] == n]
        if a[a['案件細節類型'] == t]['是否成立'].empty:
            continue
        else:
            b = a[a['案件細節類型'] == t]['是否成立'].value_counts()
            people.append(n)
            ty.append(t)
            if len(b) == 2:
                success.append(b['成立'])
                unsuc.append(b['不成立'])
            else:
                try:
                    success.append(b['成立'])
                    unsuc.append('0')
                except:
                    success.append('0')
                    unsuc.append(b['不成立'])
print(len(people), len(ty), len(success), len(unsuc))
result['people'] = people
result['案件細節類型'] = ty
result['成立'] = success
result['不成立'] = unsuc

In [None]:
result.to_csv('results/success.csv', encoding='big5', index=False)

## Processing Clearning

In [6]:
def clean_text(text, remove_stopwords=False):
    # remove url
    text = re.sub(r"(https?:\/\/)*(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    
    # Special expressions
    text = re.sub(r'〈下同〉', '', text)
    text = re.sub(r'（車號：[a-zA-Z0-9－]*號）', '', text)
    text = re.sub(r'（[\' \']*）', '', text)
    text = re.sub(r'（下同）', '', text)
    text = re.sub(r'\\r\\n[0-9a-zA-Z\\r\\n]*', '', text)
    text = re.sub(r'(口)', '', text)
    
    text = re.sub(r',', '，', text)
    text = re.sub(r'\.+', '...', text)
    text = re.sub(r'\.{6}', '...', text)
    text = re.sub(r'…', '...', text)
    text = re.sub(r';', '；', text)
    text = re.sub(r'°', '。', text)
    text = re.sub(r'】', ']', text)
    text = re.sub(r'【', '[', text)
    text = re.sub(r'\)', '\）', text)
    text = re.sub(r'\(', '\（', text)
    text = re.sub(r'“', '"', text)
    text = re.sub(r' ', '', text)
    text = re.sub(r'”', '"', text)
    text = re.sub(r'～', '~', text)
    text = re.sub(r'·', '。', text)
    text = re.sub(r'!', '！', text)
    text = re.sub(r'—', '-', text)
    text = re.sub(r'》', '\）', text)
    text = re.sub(r'《', '\（', text)
    text = re.sub(r'\?', '\？', text)
    text = re.sub(r'。。。', '...', text)
    text = re.sub(r'。。。。。。', '...', text)
    text = re.sub(r':', '：', text)
    
    text = special_alpha_removal.sub('', text)
    
    return text

# regex to remove all Non-Alpha Numeric and space
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

# regex to remove all Alpha Numeric and space
special_alpha_removal = re.compile(r'[a-z\d ]', re.IGNORECASE)

# regex to replace all numeric
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

# regex to replace ###...
replace_sharp = re.compile(r'[#]+', re.IGNORECASE)

In [7]:
cleaned_train_comments = []
print('Processing data cleaning...')

for text in train_comments:
    cleaned_train_comments.append(clean_text(text))
    
train_df['cleaned_comments'] = cleaned_train_comments
print('Done!')

Processing data cleaning...
Done!


## Word segmentation analysis

In [8]:
word_dict = defaultdict(int)

for sentence in tqdm(train_df['調解內容與決議']):
    seg_list = jieba.cut(str(sentence), cut_all=False)
    for word in seg_list:
        word_dict[word] += 1
word_dict = sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True)
print(len(word_dict))

  0%|                                                                                         | 0/6277 [00:00<?, ?it/s]Building prefix dict from C:\Users\CZJ\Desktop\dict.txt.big.txt ...
Loading model from cache C:\Users\CZJ\AppData\Local\Temp\jieba.u28d6904473ddf2ff282219392fdef7a7.cache
Loading model cost 1.274 seconds.
Prefix dict has been built succesfully.
100%|█████████████████████████████████████████████████████████████████████████████| 6277/6277 [00:09<00:00, 640.33it/s]


25386


In [9]:
cleaned_word_dict = defaultdict(int)

for sentence in tqdm(train_df['cleaned_comments']):
    seg_list = jieba.cut(str(sentence), cut_all=False)
    for word in seg_list:
        cleaned_word_dict[word] += 1
cleaned_word_dict = sorted(cleaned_word_dict.items(), key=operator.itemgetter(1), reverse=True)
print(len(cleaned_word_dict))

100%|█████████████████████████████████████████████████████████████████████████████| 6277/6277 [00:06<00:00, 972.16it/s]


17196


In [10]:
cut_sentences = []
for sentence in tqdm(train_df['cleaned_comments']):
    seg_list = jieba.cut(str(sentence), cut_all=False)
    cut_sentences.append(" ".join(seg_list))
train_df['cleaned_comments_cut'] = cut_sentences

100%|█████████████████████████████████████████████████████████████████████████████| 6277/6277 [00:06<00:00, 978.05it/s]


In [11]:
cut_train_sentences = train_df['cleaned_comments_cut']

## A quick view of vocabulary

In [12]:
chinese_list, sign_list, dig_english_list = [], [], []
for word, count in word_dict:
    for char in word:
        if char >= u'\u4E00' and char <= u"\u9FA5":
            chinese_list.append((word, count))
        elif (char >= u'\u0041' and char <= u'\u005A') or (char >= u'\u0061' and char <= u'\u007A') or (char >= u'\u0030' and char <= u'\u0039'):
            dig_english_list.append((word, count))
            break
        else:
            sign_list.append((word, count))
            break
sorted_dig_english_list = sorted(set(dig_english_list), key=lambda x: x[1], reverse=True)
sorted_sign_list = sorted(set(sign_list), key=lambda x: x[1], reverse=True)
sorted_chinese_list = sorted(set(chinese_list), key=lambda x: x[1], reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

chinese_word:  16892
dig_english_word:  8422
sign_count:  72
[('人', 16399), ('年', 10262), ('月', 10200), ('日', 8569), ('聲請', 8521), ('給付', 7559), ('對造', 7204), ('與', 6799), ('下同', 6500), ('同意', 6220), ('結果', 5878), ('如下', 5743), ('不', 5593), ('民國', 5307), ('解', 5175), ('請調', 5173), ('騎乘', 5158), ('號', 5154), ('調解', 5025), ('故聲', 5008), ('機車', 4939), ('在', 4602), ('時', 4590), ('車損', 4486), ('新台幣', 4466), ('臺南市', 4432), ('發生', 4338), ('含', 4126), ('車禍', 4118), ('兩造', 4052), ('北區', 3946), ('本', 3866), ('其餘', 3822), ('汽車', 3729), ('強制', 3696), ('一', 3675), ('責任保險', 3617), ('事件', 3609), ('二', 3572), ('共計', 3556), ('拋棄', 3539), ('民事', 3538), ('請求權', 3536), ('造人', 3460), ('費用', 3401), ('萬元', 3398), ('駕駛', 3337), ('對', 3283), ('請', 3268), ('他', 3086), ('及其', 3074), ('必要', 3058), ('一切', 3047), ('所有', 2988), ('經', 2850), ('三', 2840), ('慰問金', 2792), ('體傷', 2774), ('醫療費', 2764), ('重型', 2747), ('分許', 2719), ('之', 2573), ('追究', 2536), ('刑事責任', 2530), ('付清', 2456), ('路', 2388), ('整', 2181), ('車費', 205

In [13]:
chinese_list, sign_list, dig_english_list = [], [], []
for word, count in cleaned_word_dict:
    for char in word:
        if char >= u'\u4E00' and char <= u"\u9FA5":
            chinese_list.append((word, count))
        elif (char >= u'\u0041' and char <= u'\u005A') or (char >= u'\u0061' and char <= u'\u007A') or (char >= u'\u0030' and char <= u'\u0039'):
            dig_english_list.append((word, count))
            break
        else:
            sign_list.append((word, count))
            break
sorted_dig_english_list = sorted(set(dig_english_list), key=lambda x: x[1], reverse=True)
sorted_sign_list = sorted(set(sign_list), key=lambda x: x[1], reverse=True)
sorted_chinese_list = sorted(set(chinese_list), key=lambda x: x[1], reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

chinese_word:  17142
dig_english_word:  0
sign_count:  54
[('人', 14891), ('聲請', 8549), ('給付', 7556), ('對造', 7234), ('與', 6771), ('同意', 6220), ('結果', 5878), ('如下', 5743), ('不', 5617), ('年月日', 5336), ('民國', 5305), ('解', 5175), ('請調', 5173), ('騎乘', 5157), ('調解', 5025), ('故聲', 5009), ('機車', 4938), ('在', 4655), ('年月日時', 4443), ('車損', 4441), ('臺南市', 4432), ('新台幣', 4355), ('發生', 4340), ('含', 4144), ('車禍', 4118), ('兩造', 4052), ('北區', 3946), ('本', 3868), ('其餘', 3822), ('號', 3736), ('汽車', 3730), ('強制', 3696), ('責任保險', 3617), ('事件', 3609), ('一', 3602), ('二', 3571), ('共計', 3556), ('拋棄', 3539), ('民事', 3538), ('請求權', 3536), ('萬元', 3446), ('費用', 3401), ('造人', 3344), ('駕駛', 3338), ('對', 3250), ('請', 3231), ('他', 3086), ('及其', 3074), ('必要', 3058), ('一切', 3047), ('所有', 2988), ('經', 2840), ('三', 2813), ('慰問金', 2792), ('醫療費', 2764), ('重型', 2747), ('分許', 2662), ('追究', 2535), ('刑事責任', 2530), ('之', 2524), ('付清', 2457), ('體傷', 2337), ('整', 2184), ('車費', 2053), ('雙方', 2040), ('自小', 1942), ('於', 1865), ('路', 18

# Build Vocabulary

In [14]:
tokenizer = Tokenizer(num_words=20000, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [15]:
print('Automatically train vocab & tokenizer...')
tokenizer.fit_on_texts(cut_train_sentences)

train_sequences = tokenizer.texts_to_sequences(cut_train_sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Automatically train vocab & tokenizer...
Found 17180 unique tokens


# Tf-idf Feature Extraction

## Word-Level

In [16]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 6),
    max_features=200000
)
word_vectorizer.fit(cut_train_sentences)
train_word_features = word_vectorizer.transform(cut_train_sentences)
print('Word vectorization process Done!')

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Word vectorization process Done!


In [17]:
train_tfidf_features = train_word_features.tocsr()

In [18]:
train_tfidf_features.shape

(6277, 200000)

## Temporary Veiw & Processing Col Features

In [19]:
cols = list(train_df.columns)

In [20]:
data_columns = np.array([cols[3]] + cols[5:13] + cols[14:21])
label_column = np.array([cols[13]])

In [21]:
train_df['結案時間'] = [replace_sharp.sub('99:99', str(text)) for text in train_df['結案時間'].fillna(' ').values]
train_df['收件時間'] = [replace_sharp.sub('99:99', str(text)) for text in train_df['收件時間'].fillna(' ').values]

In [22]:
new_col = []
for lin in train_df['對照人'].fillna(' ').values:
    temp = "000000"
    if temp in str(lin):
        new_col.append(lin)
    else:
        new_col.append("gb")

print(len(new_col))

6277


In [23]:
feature_data = []
for column in data_columns:
    try:
        le = LabelEncoder()
        le.fit(list(train_df[column].fillna(' ').values))
        feature_data.append(le.transform(list(train_df[column].fillna(' ').values)))
    except:
        print(column)
feature_data

[array([192, 616,  74, ...,  73,  79,  88], dtype=int64),
 array([46, 42, 50, ..., 13, 37, 37], dtype=int64),
 array([2, 2, 0, ..., 2, 2, 2], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([128, 224, 226, ..., 224, 128, 224], dtype=int64),
 array([182, 304, 142, ..., 133, 192, 182], dtype=int64),
 array([275, 645, 156, ..., 259, 327, 159], dtype=int64),
 array([3, 3, 3, ..., 1, 1, 1], dtype=int64),
 array([13, 18,  0, ...,  5,  0, 14], dtype=int64),
 array([   0,    0,    0, ...,   47, 1176,  532], dtype=int64),
 array([4, 4, 4, ..., 5, 5, 5], dtype=int64),
 array([1, 4, 4, ..., 1, 1, 1], dtype=int64),
 array([2, 4, 2, ..., 1, 1, 2], dtype=int64),
 array([0, 1, 0, ..., 0, 0, 0], dtype=int64),
 array([ 69, 200, 133, ...,  70,  70,  70], dtype=int64),
 array([0, 0, 0, ..., 1, 1, 1], dtype=int64)]

In [24]:
train_feature_sets = []
for line in zip(feature_data[0], feature_data[0], feature_data[1], feature_data[2], feature_data[3], feature_data[4], feature_data[5], 
                feature_data[6], feature_data[7], feature_data[8], feature_data[9], feature_data[10], feature_data[11], feature_data[12],
                feature_data[13], feature_data[14], feature_data[15]):
    train_feature_sets.append(line)
train_feature_sets = np.array(train_feature_sets)

# Model Training

## Logistic Regression Model with Tfidf Features (Multi-Lables)

In [42]:
kfold = KFold(n_splits=FOLD_COUNT, shuffle=False)
tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features)):
    print('## In fold {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    models = []
    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        classifier = LogisticRegression(solver='sag', C=12.0)
        
        train_target = train_df[class_name][train_idx]
        
        classifier.fit(train_tfidf_features[train_idx], train_target)
        y_pred = classifier.predict(train_tfidf_features[train_idx])
        
        print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))
        
        val_target = train_df[class_name][test_idx]
        val_pred = classifier.predict(train_tfidf_features[test_idx])
        
        print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        predictions[class_name] = classifier.predict_proba(train_tfidf_features)[:, 1]
        
        models.append(classifier)
        
    tfidf_models.append(models)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

## In fold 1 ##
Processing 不受理 ...
Training accuracy is 0.9971676402903169
Validation accuracy is 0.9984076433121019
Processing 不成立 ...
Training accuracy is 0.9890246061249779
Validation accuracy is 0.7722929936305732
Processing 成立 ...
Training accuracy is 0.9883165161975571
Validation accuracy is 0.8710191082802548
Processing 當事人不到場 ...
Training accuracy is 0.9890246061249779
Validation accuracy is 0.8535031847133758
Processing 聲請人撤回 ...
Training accuracy is 0.974685785094707
Validation accuracy is 0.9570063694267515
## In fold 2 ##
Processing 不受理 ...
Training accuracy is 0.997344662772172
Validation accuracy is 0.9936305732484076
Processing 不成立 ...
Training accuracy is 0.9812356169233493
Validation accuracy is 0.8280254777070064
Processing 成立 ...
Training accuracy is 0.9759249424676933
Validation accuracy is 0.9872611464968153
Processing 當事人不到場 ...
Training accuracy is 0.9860152239334395
Validation accuracy is 0.856687898089172
Processing 聲請人撤回 ...
Training accuracy is 0.976278987431

In [43]:
labels_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets)):
    print('## In fold {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    models = []
    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
        train_target = train_df[class_name][train_idx]
        
        classifier.fit(train_feature_sets[train_idx], train_target)
        y_pred = classifier.predict(train_feature_sets[train_idx])
        
        print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))
        
        val_target = train_df[class_name][test_idx]
        val_pred = classifier.predict(train_feature_sets[test_idx])
        
        print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        predictions[class_name] = classifier.predict_proba(train_feature_sets)[:, 1]
        
        models.append(classifier)
        
    labels_models.append(models)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')
print('K-fold cross validation Done!')

## In fold 1 ##
Processing 不受理 ...
Training accuracy is 0.9964595503628961
Validation accuracy is 0.9984076433121019
Processing 不成立 ...
Training accuracy is 0.7696937511063905
Validation accuracy is 0.7197452229299363
Processing 成立 ...
Training accuracy is 0.8334218445742609
Validation accuracy is 0.643312101910828
Processing 當事人不到場 ...
Training accuracy is 0.8702425208001416
Validation accuracy is 0.856687898089172
Processing 聲請人撤回 ...
Training accuracy is 0.9433528058063374
Validation accuracy is 0.9570063694267515
## In fold 2 ##
Processing 不受理 ...
Training accuracy is 0.9969906178084617
Validation accuracy is 0.9936305732484076
Processing 不成立 ...
Training accuracy is 0.7663303239511418
Validation accuracy is 0.7388535031847133
Processing 成立 ...
Training accuracy is 0.8445742609311383
Validation accuracy is 0.6242038216560509
Processing 當事人不到場 ...
Training accuracy is 0.8727208355461143
Validation accuracy is 0.8439490445859873
Processing 聲請人撤回 ...
Training accuracy is 0.94494600814

# OOB (Out-of-Bag) Evaluation (Error ...)

In [45]:
print('Predicting training results...')

for i, model in enumerate(tfidf_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for j, class_name in enumerate(list_classes):
        predictions[class_name] = model[j].predict_proba(train_tfidf_features)[:, 1]
    
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
for i, model in enumerate(labels_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for j, class_name in enumerate(list_classes):
        predictions[class_name] = model[j].predict_proba(train_feature_sets)[:, 1]
        
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')

print('Done!')

Predicting training results...
Done!


In [46]:
result_list = ['results/Submission_file_{}.csv'.format(i) for i in range(0, FOLD_COUNT * 2)]
bagging(result_list, 'results/bagging.csv')
print('Bagging operation Done!')

Doing ensemble on
results/Submission_file_0.csv
results/Submission_file_1.csv
results/Submission_file_2.csv
results/Submission_file_3.csv
results/Submission_file_4.csv
results/Submission_file_5.csv
results/Submission_file_6.csv
results/Submission_file_7.csv
results/Submission_file_8.csv
results/Submission_file_9.csv
results/Submission_file_10.csv
results/Submission_file_11.csv
results/Submission_file_12.csv
results/Submission_file_13.csv
results/Submission_file_14.csv
results/Submission_file_15.csv
results/Submission_file_16.csv
results/Submission_file_17.csv
results/Submission_file_18.csv
results/Submission_file_19.csv
Bagging operation Done!


In [47]:
test_df = pd.read_csv('results/bagging.csv', encoding='big5')
result_label = test_df[list_classes]
results = result_label.idxmax(axis=1)

In [48]:
val_target = train_df['是否成立']
print('Validation accuracy is {}'.format(accuracy_score(results, val_target)))

Validation accuracy is 0.9308586904572248


In [52]:
print(len(results), len(val_target))

6277 6277


In [None]:
# Confusion Matrix
for 

## Multi-Classification

In [26]:
train_label = train_df['是否成立']

In [33]:
tfidf_models = []
kfold = StratifiedKFold(n_splits=FOLD_COUNT, shuffle=False)
multi_classifier_tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(solver='sag', C=12.0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    train_target = train_label[train_idx]
    classifier.fit(train_tfidf_features[train_idx], train_target)
    y_pred = classifier.predict(train_tfidf_features[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))

    val_target = train_label[test_idx]
    val_pred = classifier.predict(train_tfidf_features[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        
    multi_classifier_tfidf_models.append(classifier)
    predictions = classifier.predict(train_tfidf_features)
        
    tfidf_models.append(classifier)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

## In fold 1 ##
Training accuracy is 0.9729059677704976
Validation accuracy is 0.6285714285714286
## In fold 2 ##
Training accuracy is 0.9520184135977338
Validation accuracy is 0.7996820349761526
## In fold 3 ##
Training accuracy is 0.9504337050805453
Validation accuracy is 0.804140127388535
## In fold 4 ##
Training accuracy is 0.9506107275624004
Validation accuracy is 0.7547770700636943
## In fold 5 ##
Training accuracy is 0.9497256151531245
Validation accuracy is 0.767515923566879
## In fold 6 ##
Training accuracy is 0.9483094352982829
Validation accuracy is 0.7531847133757962
## In fold 7 ##
Training accuracy is 0.9500796601168349
Validation accuracy is 0.7834394904458599
## In fold 8 ##
Training accuracy is 0.9520438860378694
Validation accuracy is 0.7763578274760383
## In fold 9 ##
Training accuracy is 0.9527517253583436
Validation accuracy is 0.7348242811501597
## In fold 10 ##
Training accuracy is 0.9506282073969209
Validation accuracy is 0.792332268370607
K-fold cross validatio

In [34]:
label_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    train_target = train_label[train_idx]

    classifier.fit(train_feature_sets[train_idx], train_target)
    y_pred = classifier.predict(train_feature_sets[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))

    val_target = train_label[test_idx]
    val_pred = classifier.predict(train_feature_sets[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        
    multi_classifier_tfidf_models.append(classifier)
    predictions = classifier.predict(train_feature_sets)
        
    label_models.append(classifier)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

## In fold 1 ##
Training accuracy is 0.6966530901363556
Validation accuracy is 0.5825396825396826
## In fold 2 ##
Training accuracy is 0.6921033994334278
Validation accuracy is 0.5977742448330684
## In fold 3 ##
Training accuracy is 0.6999468932554435
Validation accuracy is 0.589171974522293
## In fold 4 ##
Training accuracy is 0.6925119490175252
Validation accuracy is 0.5971337579617835
## In fold 5 ##
Training accuracy is 0.6910957691626837
Validation accuracy is 0.6401273885350318
## In fold 6 ##
Training accuracy is 0.6818906001062135
Validation accuracy is 0.7436305732484076
## In fold 7 ##
Training accuracy is 0.6756948132412817
Validation accuracy is 0.7786624203821656
## In fold 8 ##
Training accuracy is 0.6745708724119625
Validation accuracy is 0.7811501597444089
## In fold 9 ##
Training accuracy is 0.6789948681649266
Validation accuracy is 0.6869009584664537
## In fold 10 ##
Training accuracy is 0.6751017519023181
Validation accuracy is 0.6693290734824281
K-fold cross validat

# ExtraTreesClassifier

In [None]:
accs = []
et_predictions = OrderedDict()
et_predictions['id'] = test_df['id']

for class_name in list_classes:
    train_target = train_df[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_score = np.mean(cross_val_score(classifier, train_tfidf_features, train_target, cv=10, scoring='roc_auc'))
    accs.append(cv_score)
    print('CV Score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(train_tfidf_features, train_target)
    et_predictions[class_name] = classifier.predict_proba(test_tfidf_features)[:, 1]
    
submission = pd.DataFrame.from_dict(et_predictions)
submission.to_csv('result/LR_Based/ExtraTreesClassifier_Submission.csv', index=False)

# Predictions (Optional)

In [None]:
for i, model in enumerate(tfidf_models):
    print('## In Model {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = test_df['id']
    
    for class_name in list_classes:
        predictions[class_name] = model.predict_proba(test_tfidf_features)[:, 1]
        print('Predict the proba for {} Done!'.format(class_name))
        print(predictions.keys())
    
    print(predictions.keys())
    submission = pd.DataFrame.from_dict(predictions)
    submission.to_csv('Logistic_Regression_Submission_{}.csv'.format(i), index=False)

# Result Ensemble

In [25]:
def bagging(arrs, path):
    print("Doing ensemble on")
    subs = []
    for arr in arrs:
        print(arr)
        subs.append(pd.read_csv(arr, encoding='big5'))
    
    for sub in subs[1:]:
        for c in list_classes:
            subs[0][c] += sub[c]
    
    for c in list_classes:
        subs[0][c] /= len(subs)
        
    subs[0].to_csv(path, index=False, encoding='big5')