# Packages

In [210]:
import pandas as pd
import numpy as np
import os, re, csv, codecs, operator, sys, gc
from collections import defaultdict, OrderedDict
from tqdm import tqdm
import jieba
import seaborn as sns
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, KFold, train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer

# Parameters

In [2]:
# Dictionary file handed to jieba for word segmentation (path is relative to the working directory).
jieba.set_dictionary('dict.txt.big.txt')
CLEAN_WORD_PATH = None  # not referenced by any cell visible in this notebook
TRAIN_DATA_FILE = 'data_original.csv'  # CSV loaded into train_df
TEST_DATA_FILE = 'test.csv'  # not read by any cell visible in this notebook
FOLD_COUNT = 10  # number of cross-validation folds
# Outcome labels; the training loops use them as the names of the one-hot target columns.
list_classes = ['不受理', '不成立', '成立', '當事人不到場', '聲請人撤回']

# Data Overview

In [3]:
# Load the records from TRAIN_DATA_FILE into the frame every later cell works on.
train_df = pd.read_csv(TRAIN_DATA_FILE)

In [None]:
# Column overview of the freshly loaded frame.
train_df.info()

In [None]:
# Summary statistics as produced by DataFrame.describe().
train_df.describe()

In [4]:
# Keep only the part of each record that precedes the mediation-result marker.
# The '如左' marker takes precedence; otherwise the text is cut at the '如下' marker
# (or kept whole when neither marker occurs). Missing records become 'no sentence'.
temp = train_df['調解內容與決議'].fillna('no sentence').values
cleaned_comments = [
    line.split('經調解結果如左：' if '經調解結果如左：' in line else '經調解結果如下：')[0]
    for line in temp
]
train_df['調解內容與決議'] = cleaned_comments

In [None]:
# Pairwise correlation between the columns of train_df.
# NOTE(review): recent pandas releases may require numeric_only=True when the frame holds
# text columns — confirm against the installed version.
train_df.corr()

In [None]:
# Prints whether np.any() finds a missing value in the null mask of train_df.
print(np.any(train_df.isnull()) == True)

In [5]:
# Comment texts (missing ones replaced by 'no comment') and the character length of each.
train_comments = train_df['調解內容與決議'].fillna('no comment').values
train_comments_lengths = list(map(len, map(str, train_comments)))

In [None]:
def explore_comments(arr):
    """Print summary statistics for a collection of text lengths.

    Parameters
    ----------
    arr : array-like of numbers
        The lengths to summarise. Must be non-empty: np.max / np.min raise on
        empty input.

    Prints the maximum, mean, minimum and standard deviation, followed by a
    range that runs from the minimum to the mean plus two standard deviations.
    Returns None.
    """
    # Compute each statistic once instead of once per print call.
    lengths = np.asarray(arr)
    lowest, highest = np.min(lengths), np.max(lengths)
    mean, std = np.average(lengths), np.std(lengths)

    print("MAX LENGTH:\t\t", highest)
    print("AVG LENGTH:\t\t", mean)
    print("MIN LENGTH:\t\t", lowest)
    # The label used to read "STANDARD DIVISION"; the value is np.std, i.e. the deviation.
    print("STANDARD DEVIATION:\t", std)
    print("RANGE:\t\t\t", lowest, " to ", mean + 2 * std)
    
print("------Train------")
# Summarise the character lengths computed for the training comments.
explore_comments(train_comments_lengths)

In [None]:
# Number of records per outcome label.
print(train_df['是否成立'].value_counts())

In [None]:
# Histogram of the comment lengths, drawn with seaborn's default styling.
sns.set()
pd.Series(train_comments_lengths).astype(int).hist()
plt.show()

# Data Cleaning

## Transform Label to One-hot

In [6]:
# One indicator column per outcome label, appended to train_df.
Score = train_df['是否成立']
data = pd.get_dummies(Score)
# Drop indicator columns left behind by an earlier run of this cell before appending, so that
# re-running it does not leave duplicate column names (train_df[label] would then return a
# DataFrame instead of a Series).
train_df = pd.concat([train_df.drop(columns=list(data.columns), errors='ignore'), data], axis=1)

In [None]:
# Peek at the first rows, now including the indicator columns.
train_df.head()

In [None]:
# Column overview after the indicator columns were appended.
train_df.info()

In [None]:
# Occurrences of each outcome, summed from the one-hot indicator columns. The columns are
# selected by name rather than by position (22 onwards), so the cell stays valid after further
# columns have been appended to train_df.
x = train_df[list_classes].sum()

plt.figure(figsize=(8, 4))
# x / y are passed by keyword because newer seaborn releases no longer accept them positionally.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Summary")
plt.ylabel('Occurrences', fontsize=12)
plt.xlabel('Results', fontsize=12)

# Write each bar's count just above the bar.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, ha='center', va='bottom', s='{:.1f}'.format(abs(label)))
    
plt.show()

In [None]:
# Correlation heatmap between the one-hot outcome indicators held in `data`.
colormap = plt.cm.plasma
plt.figure(figsize=(10, 8))
plt.title('Correlation of features', y=1.05, size=14)
sns.heatmap(data.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
# Number of distinct non-missing values in 'people'. nunique() skips NaN by itself, so the
# result is also right when the column has no missing value (filling a placeholder and
# subtracting 1 under-counted by one in that case).
train_df['people'].nunique()

In [None]:
# Boolean mask of the rows whose outcome is '成立' or '不成立'. Series.isin builds it in one
# vectorised pass instead of calling a Python lambda once per row.
new_df = train_df['是否成立'].isin(['成立', '不成立'])

In [None]:
# Rows selected by the boolean mask `new_df`, and how many there are.
df_new = train_df[new_df]
len(df_new)

In [None]:
# For every (people, case type) pair that occurs in df_new, count how many cases ended
# '成立' and how many ended '不成立'.
people, ty, success, unsuc = [], [], [], []
case_types = pd.unique(df_new['案件細節類型'].values)
for n in pd.unique(df_new['people'].values):
    # Filter by person once per person rather than once per (person, case type) pair.
    a = df_new[df_new['people'] == n]
    for t in case_types:
        outcomes = a.loc[a['案件細節類型'] == t, '是否成立']
        if outcomes.empty:
            continue
        b = outcomes.value_counts()
        people.append(n)
        ty.append(t)
        # .get() replaces the bare try/except lookup, and an absent outcome is stored as the
        # integer 0 (it used to be the string '0', which made the count columns mixed-type).
        success.append(int(b.get('成立', 0)))
        unsuc.append(int(b.get('不成立', 0)))
print(len(people), len(ty), len(success), len(unsuc))
result = pd.DataFrame({
    'people': people,
    '案件細節類型': ty,
    '成立': success,
    '不成立': unsuc,
})

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure the target exists first.
os.makedirs('results', exist_ok=True)
result.to_csv('results/success.csv', encoding='big5', index=False)

## Processing Cleaning

In [7]:
def clean_text(text, remove_stopwords=False):
    """Normalise one mediation record before word segmentation.

    Parameters
    ----------
    text : str
        Raw record text.
    remove_stopwords : bool, optional
        Accepted for interface compatibility; the body does not use it.

    Returns
    -------
    str
        The text with URL-like spans and a few boilerplate expressions removed,
        half-width punctuation mapped to the full-width / canonical forms used
        below, and finally every ASCII letter, digit and space stripped (via the
        module-level ``special_alpha_removal`` pattern).
    """
    # remove url
    text = re.sub(r"(https?:\/\/)*(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)

    # Special expressions
    text = re.sub(r'〈下同〉', '', text)
    text = re.sub(r'（車號：[a-zA-Z0-9－]*號）', '', text)
    text = re.sub(r'（[\' \']*）', '', text)
    text = re.sub(r'（下同）', '', text)
    # Literal backslash-r backslash-n sequences and the ASCII letters, digits or backslashes
    # that directly follow them.
    text = re.sub(r'\\r\\n[0-9a-zA-Z\\r\\n]*', '', text)
    # NOTE(review): the parentheses form a regex group, so this removes EVERY '口' character,
    # not a literal "(口)". Behaviour is kept as is — confirm which one was intended.
    text = re.sub(r'(口)', '', text)

    text = re.sub(r',', '，', text)
    # Any run of dots becomes exactly three dots. (A former follow-up rule for six dots could
    # never match after this one and has been dropped.)
    text = re.sub(r'\.+', '...', text)
    text = re.sub(r'…', '...', text)
    text = re.sub(r';', '；', text)
    text = re.sub(r'°', '。', text)
    text = re.sub(r'】', ']', text)
    text = re.sub(r'【', '[', text)
    # The replacement strings below used to start with a backslash (e.g. '\）'). re.sub leaves
    # such unknown escapes alone, so a literal backslash ended up in the cleaned text.
    text = re.sub(r'\)', '）', text)
    text = re.sub(r'\(', '（', text)
    text = re.sub(r'“', '"', text)
    text = re.sub(r' ', '', text)
    text = re.sub(r'”', '"', text)
    text = re.sub(r'～', '~', text)
    text = re.sub(r'·', '。', text)
    text = re.sub(r'!', '！', text)
    text = re.sub(r'—', '-', text)
    text = re.sub(r'》', '）', text)
    text = re.sub(r'《', '（', text)
    text = re.sub(r'\?', '？', text)
    # Six full stops must be handled before three, otherwise the shorter rule consumes them
    # first and the longer one never matches.
    text = re.sub(r'。。。。。。', '...', text)
    text = re.sub(r'。。。', '...', text)
    text = re.sub(r':', '：', text)

    text = special_alpha_removal.sub('', text)

    return text

# regex to remove all Non-Alpha Numeric and space
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

# regex to remove all Alpha Numeric and space
special_alpha_removal = re.compile(r'[a-z\d ]', re.IGNORECASE)

# regex to replace all numeric
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

# regex to replace ###...
replace_sharp = re.compile(r'[#]+', re.IGNORECASE)

In [8]:
# Run clean_text over every training comment and store the result as a new column.
print('Processing data cleaning...')
cleaned_train_comments = [clean_text(text) for text in train_comments]
train_df['cleaned_comments'] = cleaned_train_comments
print('Done!')

Processing data cleaning...
Done!


## Word segmentation analysis

In [9]:
# Tally how often each jieba token occurs in the (uncleaned) comment column.
raw_token_counts = Counter()
for sentence in tqdm(train_df['調解內容與決議']):
    raw_token_counts.update(jieba.cut(str(sentence), cut_all=False))
# (token, count) pairs, most frequent first; ties keep first-seen order.
word_dict = sorted(raw_token_counts.items(), key=lambda pair: pair[1], reverse=True)
print(len(word_dict))

  0%|          | 0/6277 [00:00<?, ?it/s]Building prefix dict from /home/eternal/下載/Competition/CL_HW/dict.txt.big.txt ...
Loading model from cache /tmp/jieba.u3a0a53d5fde3fc6cdb96c613fa12b168.cache
Loading model cost 1.034 seconds.
Prefix dict has been built succesfully.
100%|██████████| 6277/6277 [00:05<00:00, 1088.82it/s]

24176





In [10]:
# Tally how often each jieba token occurs in the cleaned comment column.
cleaned_token_counts = Counter()
for sentence in tqdm(train_df['cleaned_comments']):
    cleaned_token_counts.update(jieba.cut(str(sentence), cut_all=False))
# (token, count) pairs, most frequent first; ties keep first-seen order.
cleaned_word_dict = sorted(cleaned_token_counts.items(), key=lambda pair: pair[1], reverse=True)
print(len(cleaned_word_dict))

100%|██████████| 6277/6277 [00:03<00:00, 1628.16it/s]

16302





In [11]:
# Segment every cleaned comment with jieba and re-join the tokens with single spaces.
cut_sentences = [
    " ".join(jieba.cut(str(sentence), cut_all=False))
    for sentence in tqdm(train_df['cleaned_comments'])
]
train_df['cleaned_comments_cut'] = cut_sentences

100%|██████████| 6277/6277 [00:03<00:00, 1759.98it/s]


In [12]:
# Rebind cut_sentences to the stored column (a Series) instead of the plain list built earlier.
cut_sentences = train_df['cleaned_comments_cut']

## A quick view of vocabulary

In [None]:
# Sort every (word, count) pair of word_dict into CJK, ASCII-alphanumeric or symbol buckets.
chinese_list, sign_list, dig_english_list = [], [], []
for pair in word_dict:
    for char in pair[0]:
        if u'\u4E00' <= char <= u"\u9FA5":
            # CJK character: record the pair and keep scanning the rest of the word.
            chinese_list.append(pair)
            continue
        ascii_alnum = 'A' <= char <= 'Z' or 'a' <= char <= 'z' or '0' <= char <= '9'
        # The first non-CJK character picks the second bucket and ends the scan of this word.
        (dig_english_list if ascii_alnum else sign_list).append(pair)
        break

# set() removes the repeats produced when a word contains several CJK characters.
by_count = operator.itemgetter(1)
sorted_dig_english_list = sorted(set(dig_english_list), key=by_count, reverse=True)
sorted_sign_list = sorted(set(sign_list), key=by_count, reverse=True)
sorted_chinese_list = sorted(set(chinese_list), key=by_count, reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

In [None]:
# Sort every (word, count) pair of cleaned_word_dict into CJK, ASCII-alphanumeric or symbol buckets.
chinese_list, sign_list, dig_english_list = [], [], []
for pair in cleaned_word_dict:
    for char in pair[0]:
        if u'\u4E00' <= char <= u"\u9FA5":
            # CJK character: record the pair and keep scanning the rest of the word.
            chinese_list.append(pair)
            continue
        ascii_alnum = 'A' <= char <= 'Z' or 'a' <= char <= 'z' or '0' <= char <= '9'
        # The first non-CJK character picks the second bucket and ends the scan of this word.
        (dig_english_list if ascii_alnum else sign_list).append(pair)
        break

# set() removes the repeats produced when a word contains several CJK characters.
by_count = operator.itemgetter(1)
sorted_dig_english_list = sorted(set(dig_english_list), key=by_count, reverse=True)
sorted_sign_list = sorted(set(sign_list), key=by_count, reverse=True)
sorted_chinese_list = sorted(set(chinese_list), key=by_count, reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

# Build Vocabulary

In [13]:
# Keras tokenizer configured with num_words=20000 and an explicit list of characters to filter out.
tokenizer = Tokenizer(num_words=20000, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [14]:
# Fit the tokenizer on the space-separated jieba output and convert every text to index sequences.
print('Automatically train vocab & tokenizer...')
tokenizer.fit_on_texts(cut_sentences)

# NOTE(review): train_sequences is not used by any later cell visible in this notebook.
train_sequences = tokenizer.texts_to_sequences(cut_sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Automatically train vocab & tokenizer...
Found 16286 unique tokens


## Temporary View & Processing Column Features

In [15]:
# Snapshot of the column names in their current order; the next cell indexes into it by position.
cols = list(train_df.columns)

In [16]:
# Pick the feature columns (16 names) and the label column by position in `cols`.
# NOTE(review): the positions depend on the column order of the source CSV — confirm they
# still point at the intended columns if the file layout changes.
data_columns = np.array([cols[3]] + cols[5:13] + cols[14:21])
label_column = np.array([cols[13]])

In [17]:
# In both time columns, replace every run of '#' with the placeholder '99:99'
# (missing values are first turned into a single space).
for time_col in ('結案時間', '收件時間'):
    raw_times = train_df[time_col].fillna(' ').values
    train_df[time_col] = [replace_sharp.sub('99:99', str(text)) for text in raw_times]

In [18]:
# Keep values of '對照人' that contain "000000" and replace all others with "gb".
# new_col is only built and counted here; no visible cell writes it back to train_df.
temp = "000000"
new_col = [lin if temp in str(lin) else "gb" for lin in train_df['對照人'].fillna(' ').values]

print(len(new_col))

6277


In [19]:
# Label-encode every selected feature column (missing values are encoded as a single space).
feature_data = []
for column in data_columns:
    try:
        values = list(train_df[column].fillna(' ').values)
        feature_data.append(LabelEncoder().fit_transform(values))
    except Exception as exc:
        # Still best-effort, but no longer a bare `except:` (which also swallowed
        # KeyboardInterrupt) and the reason for skipping the column is now shown.
        print(column, '-> skipped:', exc)
feature_data

[array([192, 616,  74, ...,  73,  79,  88]),
 array([46, 42, 50, ..., 13, 37, 37]),
 array([2, 2, 0, ..., 2, 2, 2]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([128, 224, 226, ..., 224, 128, 224]),
 array([182, 304, 142, ..., 133, 192, 182]),
 array([275, 645, 156, ..., 259, 327, 159]),
 array([3, 3, 3, ..., 1, 1, 1]),
 array([13, 18,  0, ...,  5,  0, 14]),
 array([   0,    0,    0, ...,   47, 1176,  532]),
 array([4, 4, 4, ..., 5, 5, 5]),
 array([1, 4, 4, ..., 1, 1, 1]),
 array([2, 4, 2, ..., 1, 1, 2]),
 array([0, 1, 0, ..., 0, 0, 0]),
 array([ 69, 200, 133, ...,  70,  70,  70]),
 array([0, 0, 0, ..., 1, 1, 1])]

In [20]:
feature_sets = []
for line in zip(feature_data[0], feature_data[0], feature_data[1], feature_data[2], feature_data[3], feature_data[4], feature_data[5], 
                feature_data[6], feature_data[7], feature_data[8], feature_data[9], feature_data[10], feature_data[11], feature_data[12],
                feature_data[13], feature_data[14], feature_data[15]):
    feature_sets.append(line)
feature_sets = np.array(feature_sets)

## Train-Test Separate

In [256]:
# Shuffled split of the frame and the encoded features. With test_size=0 the test part is
# empty, as the recorded output shows.
# NOTE(review): no random_state is passed, so the row order differs between runs, and recent
# scikit-learn releases may reject test_size=0 — confirm against the installed version.
train_data, test_data, train_feature_sets, test_feature_sets = train_test_split(train_df, feature_sets, test_size=0, shuffle=True)
len(train_data), len(test_data), train_feature_sets.shape, test_feature_sets.shape

(6277, 0, (6277, 17), (0, 17))

In [257]:
# test
# Train on the complete, unshuffled frame and feature matrix; this replaces the
# train_data / train_feature_sets produced by the split.
train_data = train_df
train_feature_sets = feature_sets

In [258]:
# Space-separated token strings that feed the TF-IDF vectorizer.
train_sentences = train_data['cleaned_comments_cut']
# test_sentences = test_data['cleaned_comments_cut']

# Tf-idf Feature Extraction

## Word-Level

In [259]:
# Word-level TF-IDF over the space-separated jieba tokens: 1- to 3-grams, at most 20000
# features, sublinear term-frequency scaling.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    max_features=20000
)
# The vocabulary is fitted on cut_sentences, i.e. on every row of train_df.
word_vectorizer.fit(cut_sentences)
train_word_features = word_vectorizer.transform(train_sentences)
# test_word_features = word_vectorizer.transform(test_sentences)
print('Word vectorization process Done!')

Word vectorization process Done!


In [260]:
# CSR copy of the TF-IDF matrix; the training loops index it by row.
train_tfidf_features = train_word_features.tocsr()
# test_tfidf_features = test_word_features.tocsr()
train_tfidf_features.shape

(6277, 20000)

In [261]:
# tfidf_features = vstack([train_tfidf_features, test_tfidf_features])
# With the test part commented out, tfidf_features is simply the training matrix.
tfidf_features = train_tfidf_features
tfidf_features.shape

(6277, 20000)

# Model Training

## Logistic Regression Model with Tfidf Features (Multi-Labels)

In [262]:
# One binary logistic regression per outcome label and per fold, trained on the TF-IDF features.
# tfidf_models ends up as a list with one entry per fold; each entry is the list of per-class
# classifiers in list_classes order.
kfold = KFold(n_splits=FOLD_COUNT, shuffle=False)
tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features)):
    print('## In fold {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    models = []
    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        classifier = LogisticRegression(solver='sag', C=12.0)
        
        # Binary target: the one-hot indicator column of this class, restricted to the fold's training rows.
        train_target = train_data[class_name].values[train_idx]
        
        classifier.fit(train_tfidf_features[train_idx], train_target)
        y_pred = classifier.predict(train_tfidf_features[train_idx])
        
        print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))
        
        val_target = train_data[class_name].values[test_idx]
        val_pred = classifier.predict(train_tfidf_features[test_idx])
        
        print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        # NOTE(review): this scores every row of tfidf_features, including the rows this
        # fold's model was fitted on, so the saved probabilities are not out-of-fold.
        predictions[class_name] = classifier.predict_proba(tfidf_features)[:, 1]
        
        models.append(classifier)
        
    tfidf_models.append(models)
    train_predicts = pd.DataFrame.from_dict(predictions)
    # One file per fold (indices 0 .. FOLD_COUNT-1); the 'results' directory must already exist.
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

## In fold 1 ##
Processing 不受理 ...
Training accuracy is 0.9968135953266065
Validation accuracy is 0.9984076433121019
Processing 不成立 ...
Training accuracy is 0.958399716764029
Validation accuracy is 0.7038216560509554
Processing 成立 ...
Training accuracy is 0.9702602230483272
Validation accuracy is 0.6751592356687898
Processing 當事人不到場 ...
Training accuracy is 0.9674278633386441
Validation accuracy is 0.8423566878980892
Processing 聲請人撤回 ...
Training accuracy is 0.9638874137015401
Validation accuracy is 0.9585987261146497
## In fold 2 ##
Processing 不受理 ...
Training accuracy is 0.9971676402903169
Validation accuracy is 0.9936305732484076
Processing 不成立 ...
Training accuracy is 0.9534430872720836
Validation accuracy is 0.7229299363057324
Processing 成立 ...
Training accuracy is 0.9607010090281466
Validation accuracy is 0.697452229299363
Processing 當事人不到場 ...
Training accuracy is 0.9651265710745265
Validation accuracy is 0.8471337579617835
Processing 聲請人撤回 ...
Training accuracy is 0.96512657107

In [263]:
# Sanity check: number of folds and number of per-class models in the first fold.
len(tfidf_models), len(tfidf_models[0])

(10, 5)

In [250]:
# Print the first row of coefficients of each per-class model trained in the first fold.
for i in tfidf_models[0]:
    print(i.coef_[0])

[-0.6555     -0.17816487 -0.12829888 ... -0.01202514 -0.00456259
 -0.03817875]
[-1.48510979 -1.11622148 -0.72304082 ...  0.4455174  -0.04212252
 -0.03370598]
[ 2.99570985  1.7158505   0.90395635 ... -0.23230583  0.04514713
  0.17993775]
[-2.3502621  -0.65767856 -0.23264265 ... -0.26944685 -0.02704333
  0.20204152]
[-0.91609046 -0.34169973  0.06669629 ... -0.07329201 -0.01767867
 -0.33254976]


In [251]:
# Same per-fold, per-class scheme as for the TF-IDF models, but trained on the label-encoded
# column features. Reuses the `kfold` splitter created by an earlier cell.
labels_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets)):
    print('## In fold {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    models = []
    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
        
        # Binary target: the one-hot indicator column of this class, restricted to the fold's training rows.
        train_target = train_data[class_name].values[train_idx]
        
        classifier.fit(train_feature_sets[train_idx], train_target)
        y_pred = classifier.predict(train_feature_sets[train_idx])
        
        print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))
        
        val_target = train_data[class_name].values[test_idx]
        val_pred = classifier.predict(train_feature_sets[test_idx])
        
        print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        # NOTE(review): this scores every row of feature_sets, including the rows this
        # fold's model was fitted on, so the saved probabilities are not out-of-fold.
        predictions[class_name] = classifier.predict_proba(feature_sets)[:, 1]
        
        models.append(classifier)
        
    labels_models.append(models)
    train_predicts = pd.DataFrame.from_dict(predictions)
    # File indices are offset by 10 so they do not overwrite the files written for the TF-IDF models.
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')
print('K-fold cross validation Done!')

## In fold 1 ##
Processing 不受理 ...
Training accuracy is 0.9964595503628961
Validation accuracy is 0.9984076433121019
Processing 不成立 ...
Training accuracy is 0.7682775712515489
Validation accuracy is 0.7197452229299363
Processing 成立 ...
Training accuracy is 0.8364312267657993
Validation accuracy is 0.6369426751592356
Processing 當事人不到場 ...
Training accuracy is 0.8707735882457072
Validation accuracy is 0.85828025477707
Processing 聲請人撤回 ...
Training accuracy is 0.9433528058063374
Validation accuracy is 0.9570063694267515
## In fold 2 ##
Processing 不受理 ...
Training accuracy is 0.9969906178084617
Validation accuracy is 0.9936305732484076
Processing 不成立 ...
Training accuracy is 0.7663303239511418
Validation accuracy is 0.7388535031847133
Processing 成立 ...
Training accuracy is 0.8474066206408214
Validation accuracy is 0.6178343949044586
Processing 當事人不到場 ...
Training accuracy is 0.8727208355461143
Validation accuracy is 0.8439490445859873
Processing 聲請人撤回 ...
Training accuracy is 0.94494600814

In [237]:
# Sanity check: number of folds and number of per-class models in the first fold.
len(labels_models), len(labels_models[0])

(10, 5)

# OOB (Out-of-Bag) Evaluation (Error ...)

In [None]:
print('Predicting training results...')

# Each entry of tfidf_models / labels_models is the list of per-class classifiers of one fold,
# stored in list_classes order. The earlier version called predict_proba on that list itself,
# which raises AttributeError; every class is now scored with its own classifier.
for i, model in enumerate(tfidf_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for class_name, classifier in zip(list_classes, model):
        predictions[class_name] = classifier.predict_proba(train_tfidf_features)[:, 1]
    
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
for i, model in enumerate(labels_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for class_name, classifier in zip(list_classes, model):
        predictions[class_name] = classifier.predict_proba(train_feature_sets)[:, 1]
        
    train_predicts = pd.DataFrame.from_dict(predictions)
    # Offset by 10 so the files of the TF-IDF models are not overwritten.
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')

print('Done!')

In [281]:
# Average all per-fold prediction files (FOLD_COUNT TF-IDF files plus FOLD_COUNT feature files).
# NOTE(review): bagging() is defined in the last cell of this notebook, so that cell has to
# be executed before this one.
result_list = ['results/Submission_file_{}.csv'.format(i) for i in range(0, FOLD_COUNT * 2)]
bagging(result_list, 'results/bagging.csv')
print('Bagging operation Done!')

Doing ensemble on
results/Submission_file_0.csv
results/Submission_file_1.csv
results/Submission_file_2.csv
results/Submission_file_3.csv
results/Submission_file_4.csv
results/Submission_file_5.csv
results/Submission_file_6.csv
results/Submission_file_7.csv
results/Submission_file_8.csv
results/Submission_file_9.csv
results/Submission_file_10.csv
results/Submission_file_11.csv
results/Submission_file_12.csv
results/Submission_file_13.csv
results/Submission_file_14.csv
results/Submission_file_15.csv
results/Submission_file_16.csv
results/Submission_file_17.csv
results/Submission_file_18.csv
results/Submission_file_19.csv
Bagging operation Done!


In [282]:
# Load the averaged probabilities and take, per row, the class with the highest score.
# Despite its name, test_df holds the bagged predictions read from results/bagging.csv.
test_df = pd.read_csv('results/bagging.csv', encoding='big5')
result_label = test_df[list_classes]
results = result_label.idxmax(axis=1)

In [283]:
# Ground-truth outcome labels to compare the bagged predictions against.
val_target = train_df['是否成立']
# for i in range(20):
#     print(results[i], '\t', val_target[i])

In [284]:
# Accuracy of the bagged predictions against the labels of train_df.
# NOTE(review): the per-fold prediction files were produced by models that scored rows they
# were fitted on, so this figure is likely optimistic — confirm before reporting it.
print('Validation accuracy is {}'.format(accuracy_score(results, val_target)))
# print('Validation AUC is {}'.format(roc_auc_score(results, val_target)))

Validation accuracy is 0.8631511868727099


## Multi-Classification

In [121]:
# Integer id for every outcome label, and the inverse lookup from id back to label.
label_dict = {
    label: idx
    for idx, label in enumerate(('成立', '不成立', '當事人不到場', '聲請人撤回', '不受理'))
}
reverse_dict = dict(map(reversed, label_dict.items()))
print(reverse_dict)

{0: '成立', 1: '不成立', 2: '當事人不到場', 3: '聲請人撤回', 4: '不受理'}


In [128]:
# Integer-encode the outcome labels for the multi-class models.
print(train_data['是否成立'].value_counts())
# NOTE(review): any label missing from label_dict (including NaN) silently falls back to 1,
# i.e. '不成立' — confirm this default is intended.
train_label = [label_dict.get(label, 1) for label in train_data['是否成立'].values]
train_label = np.array(train_label)

成立        2518
不成立       1049
當事人不到場     566
聲請人撤回      245
不受理         15
Name: 是否成立, dtype: int64


In [140]:
# Multi-class logistic regression on the TF-IDF features with stratified folds.
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results/multi', exist_ok=True)
kfold = StratifiedKFold(n_splits=FOLD_COUNT, shuffle=False)
multi_classifier_tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(solver='sag', C=12.0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    train_target = train_label[train_idx]
    classifier.fit(train_tfidf_features[train_idx], train_target)
    y_pred = classifier.predict(train_tfidf_features[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))

    val_target = train_label[test_idx]
    val_pred = classifier.predict(train_tfidf_features[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        
    # The classifier is kept only in multi_classifier_tfidf_models. It used to be appended to
    # tfidf_models as well, which holds per-fold LISTS of binary classifiers and must not be
    # mixed with bare multi-class models.
    multi_classifier_tfidf_models.append(classifier)
    # Map the predicted integer ids back to their label strings for the output file.
    predictions['target'] = [reverse_dict[idx] for idx in classifier.predict(tfidf_features)]
        
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

## In fold 1 ##
Training accuracy is 0.9433198380566802
Validation accuracy is 0.7142857142857143
## In fold 2 ##
Training accuracy is 0.944078947368421
Validation accuracy is 0.6870748299319728
## In fold 3 ##
Training accuracy is 0.9483805668016194
Validation accuracy is 0.6689342403628118
## In fold 4 ##
Training accuracy is 0.9455971659919028
Validation accuracy is 0.6530612244897959
## In fold 5 ##
Training accuracy is 0.9458502024291497
Validation accuracy is 0.671201814058957
## In fold 6 ##
Training accuracy is 0.9456246838644411
Validation accuracy is 0.7084282460136674
## In fold 7 ##
Training accuracy is 0.9433628318584071
Validation accuracy is 0.6872146118721462
## In fold 8 ##
Training accuracy is 0.9451327433628318
Validation accuracy is 0.6894977168949772
## In fold 9 ##
Training accuracy is 0.9459049544994944
Validation accuracy is 0.6956521739130435
## In fold 10 ##
Training accuracy is 0.9441496082891079
Validation accuracy is 0.7018348623853211
K-fold cross validati

In [142]:
# Multi-class logistic regression on the label-encoded column features, reusing the stratified
# `kfold` splitter and the `multi_classifier_tfidf_models` list created by an earlier cell.
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results/multi', exist_ok=True)
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    
    train_target = train_label[train_idx]

    classifier.fit(train_feature_sets[train_idx], train_target)
    y_pred = classifier.predict(train_feature_sets[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(y_pred, train_target)))

    val_target = train_label[test_idx]
    val_pred = classifier.predict(train_feature_sets[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_pred, val_target)))
        
    # NOTE(review): these models are trained on the column features, yet they are collected
    # in the list named multi_classifier_tfidf_models (kept as is) — confirm.
    # The classifier is no longer appended to tfidf_models, which holds per-fold LISTS of
    # binary classifiers and must not be mixed with bare multi-class models.
    multi_classifier_tfidf_models.append(classifier)
    # Map the predicted integer ids back to their label strings for the output file.
    predictions['target'] = [reverse_dict[idx] for idx in classifier.predict(feature_sets)]
        
    train_predicts = pd.DataFrame.from_dict(predictions)
    # Offset by 10 so the files written for the TF-IDF multi-class models are not overwritten.
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')
    
print('K-fold cross validation Done!')

## In fold 1 ##
Training accuracy is 0.6826923076923077
Validation accuracy is 0.6984126984126984
## In fold 2 ##
Training accuracy is 0.6864878542510121
Validation accuracy is 0.6757369614512472
## In fold 3 ##
Training accuracy is 0.6826923076923077
Validation accuracy is 0.673469387755102
## In fold 4 ##
Training accuracy is 0.6895242914979757
Validation accuracy is 0.655328798185941
## In fold 5 ##
Training accuracy is 0.680414979757085
Validation accuracy is 0.7097505668934241
## In fold 6 ##
Training accuracy is 0.6853818917551846
Validation accuracy is 0.662870159453303
## In fold 7 ##
Training accuracy is 0.6844500632111251
Validation accuracy is 0.6757990867579908
## In fold 8 ##
Training accuracy is 0.6902654867256637
Validation accuracy is 0.6666666666666666
## In fold 9 ##
Training accuracy is 0.6830131445904954
Validation accuracy is 0.700228832951945
## In fold 10 ##
Training accuracy is 0.6863785696234521
Validation accuracy is 0.6903669724770642
K-fold cross validation 

# ExtraTreesClassifier

In [None]:
# One ExtraTrees classifier per outcome label on the TF-IDF features: 10-fold ROC-AUC
# cross-validation first, then a fit on all rows.
# NOTE(review): test_tfidf_features is only assigned in commented-out lines of the visible
# cells, and test_df is the frame read from results/bagging.csv — this cell cannot run as is.
accs = []  # despite the name, this collects the mean ROC-AUC per class
et_predictions = OrderedDict()
et_predictions['id'] = test_df['id']

for class_name in list_classes:
    train_target = train_df[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_score = np.mean(cross_val_score(classifier, train_tfidf_features, train_target, cv=10, scoring='roc_auc'))
    accs.append(cv_score)
    print('CV Score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(train_tfidf_features, train_target)
    et_predictions[class_name] = classifier.predict_proba(test_tfidf_features)[:, 1]
    
submission = pd.DataFrame.from_dict(et_predictions)
# NOTE(review): written under 'result/LR_Based/', while every other cell writes under 'results/'.
submission.to_csv('result/LR_Based/ExtraTreesClassifier_Submission.csv', index=False)

# Predictions (Optional)

In [None]:
# Write one submission file per entry of tfidf_models.
# NOTE(review): the training cell stores a LIST of per-class classifiers in each entry of
# tfidf_models, so model.predict_proba would fail on it; test_tfidf_features is also only
# assigned in commented-out lines of the visible cells. This cell cannot run as is.
for i, model in enumerate(tfidf_models):
    print('## In Model {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = test_df['id']
    
    for class_name in list_classes:
        predictions[class_name] = model.predict_proba(test_tfidf_features)[:, 1]
        print('Predict the proba for {} Done!'.format(class_name))
        print(predictions.keys())
    
    print(predictions.keys())
    submission = pd.DataFrame.from_dict(predictions)
    submission.to_csv('Logistic_Regression_Submission_{}.csv'.format(i), index=False)

# Result Ensemble

In [36]:
def bagging(arrs, path, classes=None):
    """Average the class-score columns of several prediction files.

    Parameters
    ----------
    arrs : iterable of str
        Paths of big5-encoded CSV files that all contain the class columns.
    path : str
        Where the averaged file is written (big5-encoded, no index column).
    classes : list of str, optional
        Columns to average. Defaults to the notebook-level ``list_classes``.

    All columns other than ``classes`` are copied from the first file.

    Raises
    ------
    ValueError
        If ``arrs`` is empty (previously an IndexError from ``subs[0]``).
    """
    arrs = list(arrs)
    if not arrs:
        raise ValueError('bagging() needs at least one prediction file')
    # Fall back to the notebook-wide class list only when no columns were given.
    classes = list(list_classes if classes is None else classes)

    print("Doing ensemble on")
    subs = []
    for arr in arrs:
        print(arr)
        subs.append(pd.read_csv(arr, encoding='big5'))

    # Work on a copy so the frame loaded first is not modified in place.
    averaged = subs[0].copy()
    averaged[classes] = sum(sub[classes] for sub in subs) / len(subs)

    averaged.to_csv(path, index=False, encoding='big5')