# Packages

In [1]:
import pandas as pd
import numpy as np
import os, re, csv, codecs, operator, sys, gc
from collections import defaultdict, OrderedDict
from tqdm import tqdm
import jieba
import seaborn as sns
from collections import Counter
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, KFold, train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Parameters

In [3]:
# Point jieba at the dictionary file it should use for word segmentation.
jieba.set_dictionary('dict.txt.big.txt')
CLEAN_WORD_PATH = None  # not read by any cell visible in this notebook
TRAIN_DATA_FILE = 'data_original.csv'  # CSV loaded into train_df
TEST_DATA_FILE = 'test.csv'  # declared here; no visible cell loads it
FOLD_COUNT = 10  # number of splits used by the K-fold loops
# Names of the one-hot label columns; expected to equal the values found in
# the '是否成立' column, since the training cells index train_df by these names.
list_classes = ['不受理', '不成立', '成立', '當事人不到場', '聲請人撤回']

# Data Overview

In [4]:
# Load the raw training table; the path is resolved relative to the working directory.
train_df = pd.read_csv(TRAIN_DATA_FILE)

FileNotFoundError: File b'data_original.csv' does not exist

In [None]:
# Show column names, non-null counts and dtypes of the training table.
train_df.info()

In [None]:
# Summary statistics of the columns pandas describes by default.
train_df.describe()

In [None]:
# Keep only the text that precedes the result marker of each record.
# The '如左' marker takes priority; otherwise the '如下' marker is used, and a
# record containing neither marker is kept unchanged.
raw_resolutions = train_df['調解內容與決議'].fillna('no sentence').values
cleaned_comments = [
    record.split('經調解結果如左：' if '經調解結果如左：' in record else '經調解結果如下：')[0]
    for record in raw_resolutions
]
train_df['調解內容與決議'] = cleaned_comments

In [None]:
# Pairwise correlation of the numeric/boolean columns only.
# Selecting them explicitly keeps this cell working on pandas >= 2.0, where
# DataFrame.corr() no longer drops text columns silently and raises instead.
train_df.select_dtypes(include=[np.number, 'bool']).corr()

In [None]:
# Print True when at least one cell of the table is missing.
# Reducing over the underlying array always yields a single boolean.
print(train_df.isnull().values.any())

In [None]:
# Comment texts (missing entries replaced by a placeholder) and their lengths in characters.
train_comments = train_df['調解內容與決議'].fillna('no comment').values
train_comments_lengths = list(map(len, map(str, train_comments)))

In [None]:
def explore_comments(arr):
    """Print summary statistics for a collection of comment lengths.

    Args:
        arr: Sequence (list or array) of numeric lengths.

    Prints the maximum, mean, minimum, standard deviation (``np.std``
    default, i.e. population standard deviation) and the interval from the
    minimum up to ``mean + 2 * std``. An empty input prints a short notice
    instead of raising. Returns None.
    """
    lengths = np.asarray(arr)
    if lengths.size == 0:
        # np.max / np.min raise ValueError on empty input.
        print("No comments to summarise.")
        return

    # Compute each statistic once and reuse it for the RANGE line.
    shortest = np.min(lengths)
    average = np.average(lengths)
    spread = np.std(lengths)

    print("MAX LENGTH:\t\t", np.max(lengths))
    print("AVG LENGTH:\t\t", average)
    print("MIN LENGTH:\t\t", shortest)
    print("STANDARD DEVIATION:\t", spread)
    print("RANGE:\t\t\t", shortest, " to ", average + 2 * spread)
    
# Report the length statistics of the training comments.
print("------Train------")
explore_comments(train_comments_lengths)

In [None]:
# Number of rows per distinct value of the '是否成立' column.
print(train_df['是否成立'].value_counts())

In [None]:
# Histogram of the comment lengths, drawn with seaborn's default styling.
sns.set()
pd.Series(train_comments_lengths).astype(int).hist()
plt.show()

# Data Cleaning

## Transfer Label to One-hot

In [None]:
# One-hot encode the '是否成立' column: one indicator column per distinct value.
Score = train_df['是否成立']
data = pd.get_dummies(Score)
# Drop indicator columns left over from an earlier run of this cell so that
# re-running it does not append a second, duplicate set of columns.
train_df = pd.concat([train_df.drop(columns=data.columns, errors='ignore'), data], axis=1)
train_df.head()

In [None]:
# Re-inspect the table after the indicator columns were appended.
train_df.info()

In [None]:
# Column sums of everything from column position 22 onwards.
# NOTE(review): presumably position 22 is where the one-hot columns start — confirm.
x = train_df.iloc[:, 22:].sum()

plt.figure(figsize=(8, 4))
# Pass x/y by keyword: seaborn >= 0.12 rejects positional data vectors,
# and older releases accept the keyword form as well.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Summary")
plt.ylabel('Occurrences', fontsize=12)
plt.xlabel('Results', fontsize=12)

# Write each bar's value just above the bar.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, ha='center', va='bottom', s='{:.1f}'.format(abs(label)))

plt.show()

In [None]:
# Heat map of the pairwise correlations between the one-hot indicator columns held in `data`.
colormap = plt.cm.plasma
plt.figure(figsize=(10, 8))
plt.title('Correlation of features', y=1.05, size=14)
sns.heatmap(data.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
# Distinct values in 'people' after replacing NaN with a placeholder, minus one for that placeholder.
# NOTE(review): if the column has no missing values the placeholder never appears and this undercounts by one.
len(pd.unique(train_df['people'].fillna('no people').values)) - 1

In [None]:
# Boolean mask of the rows whose outcome is '成立' or '不成立'.
# Vectorised membership test instead of a Python-level lambda per row.
new_df = train_df['是否成立'].isin(['成立', '不成立'])

In [None]:
# Keep only the rows selected by the boolean mask `new_df` and show how many remain.
df_new = train_df[new_df]
len(df_new)

In [None]:
# For every ('people', '案件細節類型') pair that occurs in df_new, count how many
# rows ended in '成立' and how many in '不成立'.
result = pd.DataFrame()
people, ty, success, unsuc = [], [], [], []
case_types = pd.unique(df_new['案件細節類型'].values)
for n in pd.unique(df_new['people'].values):
    # Filter by person once per person rather than once per (person, type) pair.
    rows_for_person = df_new[df_new['people'] == n]
    for t in case_types:
        outcomes = rows_for_person[rows_for_person['案件細節類型'] == t]['是否成立']
        if outcomes.empty:
            continue
        b = outcomes.value_counts()
        people.append(n)
        ty.append(t)
        # A missing outcome counts as integer 0, so both columns stay numeric
        # (previously the string '0' was mixed with integer counts and a bare
        # `except` was used to detect the missing key).
        success.append(b.get('成立', 0))
        unsuc.append(b.get('不成立', 0))
print(len(people), len(ty), len(success), len(unsuc))
result['people'] = people
result['案件細節類型'] = ty
result['成立'] = success
result['不成立'] = unsuc

In [None]:
# Persist the per-(people, case type) outcome counts.
# Create the output directory first; to_csv does not create missing folders.
os.makedirs('results', exist_ok=True)
result.to_csv('results/success.csv', encoding='big5', index=False)

## Processing Cleaning

In [None]:
# Matches every character that is NOT an ASCII letter, a digit or a space.
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

# Matches ASCII letters, digits and spaces (stripped by clean_text as its last step).
special_alpha_removal = re.compile(r'[a-z\d ]', re.IGNORECASE)

# Matches runs of digits.
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

# Matches runs of '#'.
replace_sharp = re.compile(r'[#]+', re.IGNORECASE)

# Boilerplate expressions that are deleted outright, applied in this order.
_SPECIAL_EXPRESSIONS = (
    r'〈下同〉',
    r'（車號：[a-zA-Z0-9－]*號）',
    r'（[\' \']*）',
    r'（下同）',
    r'\\r\\n[0-9a-zA-Z\\r\\n]*',
    # NOTE(review): the parentheses form a regex group, so this removes every
    # '口' character; if the literal text "(口)" was meant, the parentheses
    # need escaping — confirm the intent.
    r'(口)',
)

# (pattern, replacement) pairs normalising punctuation, applied in this order.
# Replacements are plain text: a backslash inside a re.sub replacement that
# precedes a non-ASCII character is kept literally, so the former '\）', '\（'
# and '\？' templates inserted a stray backslash into the cleaned text.
_PUNCTUATION_RULES = (
    (r',', '，'),
    (r'\.+', '...'),  # any run of dots becomes exactly three, whatever its length
    (r'…', '...'),
    (r';', '；'),
    (r'°', '。'),
    (r'】', ']'),
    (r'【', '['),
    (r'\)', '）'),
    (r'\(', '（'),
    (r'“', '"'),
    (r' ', ''),
    (r'”', '"'),
    (r'～', '~'),
    (r'·', '。'),
    (r'!', '！'),
    (r'—', '-'),
    (r'》', '）'),
    (r'《', '（'),
    (r'\?', '？'),
    # The six-character run must be handled before the three-character run,
    # otherwise the shorter rule consumes it first and this one never matches.
    (r'。。。。。。', '...'),
    (r'。。。', '...'),
    (r':', '：'),
)


def clean_text(text, remove_stopwords=False):
    """Normalise one comment string.

    Steps, in order: remove URLs, delete the boilerplate expressions in
    ``_SPECIAL_EXPRESSIONS``, apply the punctuation rules in
    ``_PUNCTUATION_RULES``, then strip ASCII letters, digits and spaces.

    Args:
        text: The string to clean.
        remove_stopwords: Accepted for interface compatibility; the function
            body does not use it.

    Returns:
        The cleaned string.
    """
    # remove url
    text = re.sub(r"(https?:\/\/)*(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)

    # Special expressions
    for pattern in _SPECIAL_EXPRESSIONS:
        text = re.sub(pattern, '', text)

    # Punctuation normalisation
    for pattern, replacement in _PUNCTUATION_RULES:
        text = re.sub(pattern, replacement, text)

    text = special_alpha_removal.sub('', text)

    return text

In [None]:
# Run clean_text over every training comment and store the result as a new column.
print('Processing data cleaning...')
cleaned_train_comments = [clean_text(comment) for comment in train_comments]
train_df['cleaned_comments'] = cleaned_train_comments
print('Done!')

## Word segmentation analysis

In [None]:
# Word frequencies of the jieba-segmented comment column, as a list of
# (word, count) pairs ordered by descending count.
raw_word_counts = Counter()
for sentence in tqdm(train_df['調解內容與決議']):
    raw_word_counts.update(jieba.cut(str(sentence), cut_all=False))
word_dict = raw_word_counts.most_common()
print(len(word_dict))

In [None]:
# Word frequencies of the jieba-segmented 'cleaned_comments' column, as a list
# of (word, count) pairs ordered by descending count.
cleaned_word_counts = Counter()
for sentence in tqdm(train_df['cleaned_comments']):
    cleaned_word_counts.update(jieba.cut(str(sentence), cut_all=False))
cleaned_word_dict = cleaned_word_counts.most_common()
print(len(cleaned_word_dict))

In [None]:
# Segment every cleaned comment with jieba and join the tokens with single spaces.
cut_sentences = [
    " ".join(jieba.cut(str(sentence), cut_all=False))
    for sentence in tqdm(train_df['cleaned_comments'])
]
train_df['cleaned_comments_cut'] = cut_sentences

In [None]:
# Use the jieba-segmented, space-joined comments as the text corpus.
cut_train_sentences = train_df['cleaned_comments_cut']

In [None]:
# NOTE(review): this rebinds cut_train_sentences to the cleaned but UNSEGMENTED text.
# When run after the previous cell, the tokenizer and TF-IDF cells below receive
# text without jieba word boundaries — confirm that this override is intended.
cut_train_sentences = train_df['cleaned_comments']

## A quick view of vocabulary

In [None]:
# Split the (word, count) pairs of word_dict into three buckets:
#   chinese_list     - words whose first character is in U+4E00..U+9FA5
#   dig_english_list - words whose first character outside that range is an ASCII letter or digit
#   sign_list        - words whose first character outside that range is anything else
# A word can land in chinese_list and in one of the other two buckets.
chinese_list, sign_list, dig_english_list = [], [], []
for word, count in word_dict:
    for position, char in enumerate(word):
        if u'\u4E00' <= char <= u'\u9FA5':
            # Record the word once (on its leading character) instead of once
            # per Chinese character, so no set() de-duplication is needed.
            if position == 0:
                chinese_list.append((word, count))
            continue
        is_ascii_alnum = (u'\u0041' <= char <= u'\u005A') or (u'\u0061' <= char <= u'\u007A') or (u'\u0030' <= char <= u'\u0039')
        if is_ascii_alnum:
            dig_english_list.append((word, count))
        else:
            sign_list.append((word, count))
        break
# sorted() is stable, so equal counts keep their word_dict order and the
# output is reproducible (going through set() made tie order arbitrary).
sorted_dig_english_list = sorted(dig_english_list, key=lambda x: x[1], reverse=True)
sorted_sign_list = sorted(sign_list, key=lambda x: x[1], reverse=True)
sorted_chinese_list = sorted(chinese_list, key=lambda x: x[1], reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

In [None]:
# Split the (word, count) pairs of cleaned_word_dict into three buckets:
#   chinese_list     - words whose first character is in U+4E00..U+9FA5
#   dig_english_list - words whose first character outside that range is an ASCII letter or digit
#   sign_list        - words whose first character outside that range is anything else
# A word can land in chinese_list and in one of the other two buckets.
chinese_list, sign_list, dig_english_list = [], [], []
for word, count in cleaned_word_dict:
    for position, char in enumerate(word):
        if u'\u4E00' <= char <= u'\u9FA5':
            # Record the word once (on its leading character) instead of once
            # per Chinese character, so no set() de-duplication is needed.
            if position == 0:
                chinese_list.append((word, count))
            continue
        is_ascii_alnum = (u'\u0041' <= char <= u'\u005A') or (u'\u0061' <= char <= u'\u007A') or (u'\u0030' <= char <= u'\u0039')
        if is_ascii_alnum:
            dig_english_list.append((word, count))
        else:
            sign_list.append((word, count))
        break
# sorted() is stable, so equal counts keep their cleaned_word_dict order and
# the output is reproducible (going through set() made tie order arbitrary).
sorted_dig_english_list = sorted(dig_english_list, key=lambda x: x[1], reverse=True)
sorted_sign_list = sorted(sign_list, key=lambda x: x[1], reverse=True)
sorted_chinese_list = sorted(chinese_list, key=lambda x: x[1], reverse=True)
print("chinese_word: ", len(sorted_chinese_list))
print("dig_english_word: ", len(sorted_dig_english_list))
print("sign_count: ", len(sorted_sign_list))
print(sorted_chinese_list[:10000], '\n\n', sorted_dig_english_list[:50], '\n\n', sorted_sign_list[:20])

# Build Vocabulary

In [None]:
# Keras tokenizer capped at num_words=20000; the characters listed in `filters` are removed from the text.
tokenizer = Tokenizer(num_words=20000, filters='"#$%&()+,-./:;<=>@[\\]^_`{|}~\t\n')

In [None]:
print('Automatically train vocab & tokenizer...')
# Build the tokenizer vocabulary from the corpus.
tokenizer.fit_on_texts(cut_train_sentences)

# Convert every text into a list of integer word ids.
# NOTE(review): train_sequences is not read by any later cell visible here.
train_sequences = tokenizer.texts_to_sequences(cut_train_sentences)

# Mapping word -> integer id learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Tf-idf Feature Extraction

## Word-Level

In [None]:
# Word-level TF-IDF over 1- to 6-grams, capped at 200000 features.
tfidf_settings = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 6),
    max_features=200000,
)
word_vectorizer = TfidfVectorizer(**tfidf_settings)
# Learn the vocabulary and idf weights and vectorise the same corpus in one call.
train_word_features = word_vectorizer.fit_transform(cut_train_sentences)
print('Word vectorization process Done!')

In [None]:
# Keep the TF-IDF matrix in CSR format; the K-fold cells index it by row.
train_tfidf_features = train_word_features.tocsr()

In [None]:
# (number of documents, number of TF-IDF features)
train_tfidf_features.shape

## Temporary View & Processing Column Features

In [None]:
# Column names in their current order; the next cell selects columns by position.
cols = list(train_df.columns)

In [None]:
# Feature columns chosen by position: column 3, columns 5-12 and columns 14-20 (16 in total).
# NOTE(review): positional selection depends on the exact column order of train_df at this point.
data_columns = np.array([cols[3]] + cols[5:13] + cols[14:21])
# The column at position 13 is set aside as the label column.
label_column = np.array([cols[13]])

In [None]:
# Replace every run of '#' in the two time columns with the sentinel '99:99'
# (missing values first become a single space).
# NOTE(review): presumably the '#' runs are spreadsheet overflow markers — confirm.
train_df['結案時間'] = [replace_sharp.sub('99:99', str(text)) for text in train_df['結案時間'].fillna(' ').values]
train_df['收件時間'] = [replace_sharp.sub('99:99', str(text)) for text in train_df['收件時間'].fillna(' ').values]

In [None]:
# Keep values of '對照人' that contain "000000"; every other value becomes "gb".
# NOTE(review): new_col is only measured here; no later cell visible in this notebook reads it.
new_col = []
for lin in train_df['對照人'].fillna(' ').values:
    temp = "000000"
    if temp in str(lin):
        new_col.append(lin)
    else:
        new_col.append("gb")

print(len(new_col))

In [None]:
# Integer-encode every selected feature column (missing values become ' ').
# A column that cannot be encoded is skipped, and the reason is reported so
# the gap in feature_data is visible instead of silent.
feature_data = []
for column in data_columns:
    try:
        column_values = list(train_df[column].fillna(' ').values)
        feature_data.append(LabelEncoder().fit_transform(column_values))
    except Exception as exc:  # narrower than a bare except: lets KeyboardInterrupt through
        print('Skipped column {}: {!r}'.format(column, exc))
feature_data

In [None]:
# Stack the encoded columns side by side: one row per sample, one column per
# encoded feature. This uses every entry of feature_data exactly once; the
# previous hand-written zip listed feature_data[0] twice and hard-coded
# indices 0..15.
train_feature_sets = np.column_stack(feature_data)

# Model Training

## Logistic Regression Model with Tfidf Features (Multi-Labels)

In [None]:
# K-fold training of one binary logistic regression per class on the TF-IDF
# features. Each fold prints train/validation accuracy per class and writes
# the class-1 probabilities for ALL rows to results/Submission_file_<fold>.csv.
os.makedirs('results', exist_ok=True)  # to_csv does not create missing folders
kfold = KFold(n_splits=FOLD_COUNT, shuffle=False)
tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(solver='sag', C=12.0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']

    # The fold's feature slices do not depend on the class; slice once.
    fold_train_features = train_tfidf_features[train_idx]
    fold_val_features = train_tfidf_features[test_idx]

    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        # KFold yields positions, so select targets positionally (.iloc)
        # rather than by index label.
        train_target = train_df[class_name].iloc[train_idx]

        classifier.fit(fold_train_features, train_target)
        y_pred = classifier.predict(fold_train_features)

        print('Training accuracy is {}'.format(accuracy_score(train_target, y_pred)))

        val_target = train_df[class_name].iloc[test_idx]
        val_pred = classifier.predict(fold_val_features)

        print('Validation accuracy is {}'.format(accuracy_score(val_target, val_pred)))
        predictions[class_name] = classifier.predict_proba(train_tfidf_features)[:, 1]

    # NOTE(review): the same estimator object is refit for every class, so the
    # model stored here only holds the fit for the last entry of list_classes.
    tfidf_models.append(classifier)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')

print('K-fold cross validation Done!')

In [None]:
# K-fold training of one binary logistic regression per class on the
# label-encoded column features, reusing the `kfold` splitter defined earlier.
# Each fold writes the class-1 probabilities for ALL rows to
# results/Submission_file_<fold + 10>.csv.
os.makedirs('results', exist_ok=True)  # to_csv does not create missing folders
labels_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']

    # The fold's feature slices do not depend on the class; slice once.
    fold_train_features = train_feature_sets[train_idx]
    fold_val_features = train_feature_sets[test_idx]

    for class_name in list_classes:
        print('Processing {} ...'.format(class_name))
        # The splitter yields positions, so select targets positionally
        # (.iloc) rather than by index label.
        train_target = train_df[class_name].iloc[train_idx]

        classifier.fit(fold_train_features, train_target)
        y_pred = classifier.predict(fold_train_features)

        print('Training accuracy is {}'.format(accuracy_score(train_target, y_pred)))

        val_target = train_df[class_name].iloc[test_idx]
        val_pred = classifier.predict(fold_val_features)

        print('Validation accuracy is {}'.format(accuracy_score(val_target, val_pred)))
        predictions[class_name] = classifier.predict_proba(train_feature_sets)[:, 1]

    # NOTE(review): the same estimator object is refit for every class, so the
    # model stored here only holds the fit for the last entry of list_classes.
    labels_models.append(classifier)
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')
print('K-fold cross validation Done!')

# OOB (Out-of-Bag) Evaluation (Error ...)

In [None]:
# Re-predict the whole training set with every stored fold model and rewrite
# the per-fold CSV files (indices 0.. for TF-IDF models, 10.. for column-feature models).
# NOTE(review): each stored model is a single estimator that was last fit on the
# final class of list_classes, and the same predict_proba call is repeated for
# every class_name, so all class columns written here hold identical values.
# These writes also replace the per-class files produced by the training cells.
print('Predicting training results...')

for i, model in enumerate(tfidf_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for class_name in list_classes:
        predictions[class_name] = model.predict_proba(train_tfidf_features)[:, 1]
    
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i), index=False, encoding='big5')
    
for i, model in enumerate(labels_models):
    predictions = OrderedDict()
    predictions['id'] = train_df['id']
    for class_name in list_classes:
        predictions[class_name] = model.predict_proba(train_feature_sets)[:, 1]
        
    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')

print('Done!')

In [None]:
# Average the per-fold prediction files of the column-feature models.
# Those files are numbered from 10 upwards, one per fold, so the range must
# span FOLD_COUNT files starting at 10 (the former `FOLD_COUNT * 2` upper
# bound was only correct when FOLD_COUNT == 10).
# NOTE: bagging() is defined in the "Result Ensemble" cell at the end of this
# notebook; that cell has to be executed before this one.
result_list = ['results/Submission_file_{}.csv'.format(i) for i in range(10, 10 + FOLD_COUNT)]
bagging(result_list, 'results/bagging.csv')
print('Bagging operation Done!')

In [None]:
# Load the averaged predictions and pick, per row, the class column with the highest value.
# NOTE(review): despite its name, test_df holds the file produced by the bagging cell, not TEST_DATA_FILE.
test_df = pd.read_csv('results/bagging.csv', encoding='big5')
result_label = test_df[list_classes]
results = result_label.idxmax(axis=1)

In [None]:
# Compare the arg-max class of the averaged predictions with the true labels of the full training set.
# NOTE(review): the predictions cover rows the models were trained on, so this is not a held-out score.
val_target = train_df['是否成立']
print('Validation accuracy is {}'.format(accuracy_score(results, val_target)))

## Multi-Classification

In [None]:
# Single multi-class target column used by the two multi-class cells below.
train_label = train_df['是否成立']

In [None]:
# Stratified K-fold training of a multi-class logistic regression on the
# TF-IDF features. Each fold prints train/validation accuracy and writes the
# per-class probabilities for ALL rows to results/multi/Submission_file_<fold>.csv.
os.makedirs('results/multi', exist_ok=True)  # to_csv does not create missing folders
kfold = StratifiedKFold(n_splits=FOLD_COUNT, shuffle=False)
multi_classifier_tfidf_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_tfidf_features, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(solver='sag', C=12.0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']

    # The splitter yields positions, so select targets positionally (.iloc).
    train_target = train_label.iloc[train_idx]
    classifier.fit(train_tfidf_features[train_idx], train_target)
    y_pred = classifier.predict(train_tfidf_features[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(train_target, y_pred)))

    val_target = train_label.iloc[test_idx]
    val_pred = classifier.predict(train_tfidf_features[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_target, val_pred)))

    # Only the multi-class list receives this model; it is no longer appended
    # to tfidf_models, which holds the binary per-class models.
    multi_classifier_tfidf_models.append(classifier)

    # Store one probability column per class, keyed by the class label.
    # Previously `predictions` was overwritten with column 1 alone, which
    # dropped the id column and every other class.
    probabilities = classifier.predict_proba(train_tfidf_features)
    for column, label in enumerate(classifier.classes_):
        predictions[label] = probabilities[:, column]

    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i), index=False, encoding='big5')

print('K-fold cross validation Done!')

In [None]:
# Stratified K-fold training of a multi-class logistic regression on the
# label-encoded column features, reusing the `kfold` splitter defined earlier.
# Each fold writes the per-class probabilities for ALL rows to
# results/multi/Submission_file_<fold + 10>.csv.
os.makedirs('results/multi', exist_ok=True)  # to_csv does not create missing folders
# These models expect the column features, so they get their own list instead
# of being appended to the TF-IDF model lists (where predict_proba on TF-IDF
# input would not match their feature count).
multi_classifier_feature_models = []
for i, (train_idx, test_idx) in enumerate(kfold.split(train_feature_sets, train_label)):
    print('## In fold {} ##'.format(i + 1))
    classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
                   max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001,verbose=0)
    predictions = OrderedDict()
    predictions['id'] = train_df['id']

    # The splitter yields positions, so select targets positionally (.iloc).
    train_target = train_label.iloc[train_idx]

    classifier.fit(train_feature_sets[train_idx], train_target)
    y_pred = classifier.predict(train_feature_sets[train_idx])

    print('Training accuracy is {}'.format(accuracy_score(train_target, y_pred)))

    val_target = train_label.iloc[test_idx]
    val_pred = classifier.predict(train_feature_sets[test_idx])

    print('Validation accuracy is {}'.format(accuracy_score(val_target, val_pred)))

    multi_classifier_feature_models.append(classifier)

    # Store one probability column per class, keyed by the class label.
    # Previously column 1 alone was stored under `class_name`, a variable left
    # over from an earlier cell's loop.
    probabilities = classifier.predict_proba(train_feature_sets)
    for column, label in enumerate(classifier.classes_):
        predictions[label] = probabilities[:, column]

    train_predicts = pd.DataFrame.from_dict(predictions)
    train_predicts.to_csv('results/multi/Submission_file_{}.csv'.format(i + 10), index=False, encoding='big5')

print('K-fold cross validation Done!')

# ExtraTreesClassifier

In [None]:
# One ExtraTreesClassifier per class: 10-fold cross-validated ROC AUC on the
# training TF-IDF features, then a fit on all training rows and a prediction
# on the test features.
# NOTE(review): test_tfidf_features is not defined in any cell visible in this
# notebook, and test_df was last bound to the bagging output rather than to
# TEST_DATA_FILE — confirm where the test features come from.
# NOTE(review): the output path uses 'result/' while other cells write to 'results/'.
accs = []  # collects the mean ROC AUC per class
et_predictions = OrderedDict()
et_predictions['id'] = test_df['id']

for class_name in list_classes:
    train_target = train_df[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_score = np.mean(cross_val_score(classifier, train_tfidf_features, train_target, cv=10, scoring='roc_auc'))
    accs.append(cv_score)
    print('CV Score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(train_tfidf_features, train_target)
    et_predictions[class_name] = classifier.predict_proba(test_tfidf_features)[:, 1]
    
submission = pd.DataFrame.from_dict(et_predictions)
submission.to_csv('result/LR_Based/ExtraTreesClassifier_Submission.csv', index=False)

# Predictions (Optional)

In [None]:
# Write one submission file per stored model with its class-1 probabilities on the test features.
# NOTE(review): test_tfidf_features is not defined in any cell visible in this notebook.
# NOTE(review): the same predict_proba call is repeated for every class_name,
# so all class columns of a file hold identical values.
for i, model in enumerate(tfidf_models):
    print('## In Model {} ##'.format(i + 1))
    predictions = OrderedDict()
    predictions['id'] = test_df['id']
    
    for class_name in list_classes:
        predictions[class_name] = model.predict_proba(test_tfidf_features)[:, 1]
        print('Predict the proba for {} Done!'.format(class_name))
        print(predictions.keys())
    
    print(predictions.keys())
    submission = pd.DataFrame.from_dict(predictions)
    submission.to_csv('Logistic_Regression_Submission_{}.csv'.format(i), index=False)

# Result Ensemble

In [None]:
def bagging(arrs, path, classes=None):
    """Average the class columns of several prediction CSV files.

    Args:
        arrs: Iterable of paths to big5-encoded CSV files that all contain
            the class columns.
        path: Destination of the averaged CSV (written big5-encoded, without
            the index).
        classes: Column names to average. Defaults to the notebook-level
            ``list_classes``.

    Every column outside ``classes`` is taken unchanged from the first file.

    Raises:
        ValueError: If ``arrs`` contains no paths.
    """
    if classes is None:
        classes = list_classes

    print("Doing ensemble on")
    subs = []
    for arr in arrs:
        print(arr)
        subs.append(pd.read_csv(arr, encoding='big5'))

    if not subs:
        raise ValueError("bagging() needs at least one prediction file")

    # The first file supplies the non-class columns; each class column is
    # replaced by the element-wise mean over all files.
    averaged = subs[0]
    for c in classes:
        averaged[c] = sum(sub[c] for sub in subs) / len(subs)

    averaged.to_csv(path, index=False, encoding='big5')