In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
dataPath = '/content/gdrive/My Drive/labeling_cowork_folder/raw_data/'

In [0]:
import pandas as pd
import numpy as np
dataset = pd.read_csv(dataPath+'task1_trainset.csv', dtype=str)
dataset.drop('Title',axis=1,inplace=True)
dataset.drop('Categories',axis=1,inplace=True)
dataset.drop('Created Date',axis=1, inplace=True)
dataset.drop('Authors',axis=1,inplace=True)

In [0]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
trainset, validset = train_test_split(dataset, test_size=0.1, random_state=42)

trainset.to_csv(dataPath+'trainset.csv',index=False)
validset.to_csv(dataPath+'validset.csv',index=False)

In [0]:
# Do the same things for test data
dataset = pd.read_csv(dataPath+'task1_public_testset.csv', dtype=str)
dataset.drop('Title',axis=1,inplace=True)
dataset.drop('Categories',axis=1,inplace=True)
dataset.drop('Created Date',axis=1, inplace=True)
dataset.drop('Authors',axis=1,inplace=True)
dataset.to_csv('testset.csv',index=False)

In [0]:
# colab doesn't have all package in nltk, we need to download by ourselves.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
from nltk.tokenize import word_tokenize
import re

In [0]:
def data_preprocess(rawdata, istrain=True):
  # 載入NLTK的停用字詞集
  from nltk.corpus import stopwords
  stop_words = set(stopwords.words('english'))

  # NLTK Stemming
  from nltk.stem import PorterStemmer
  ps = PorterStemmer()

  # NLTK Lemma
  from nltk.stem import WordNetLemmatizer
  wnl = WordNetLemmatizer()

  records = []
  for docidx, row in enumerate(rawdata.itertuples()):
    # print(row)
    sents = row.Abstract.split('$$$')

    if istrain:
      tasks = row._3.split(' ')
      
    sent_cnt = len(sents)

    for sentidx, sent in enumerate(sents):
      sentid = '{}_S{:>03}'.format(row.Id, sentidx+1)

      # 處理單字因為斷行被斷開
      sent = sent.replace('- ', '')
      # 斷詞改成用空格斷開
      terms = word_tokenize(sent)
      # terms = sent.split(' ')

      new_terms = []
      for term in terms:
        # 移除特殊符號
        import string
        # term = term.translate(str.maketrans('', '', string.punctuation))
        # 移除所有非文字
        term = re.sub('[^a-zA-Z]', ' ', term)

        # 轉小寫
        term = term.lower()

        # Stop Word
        if term in stop_words:
          continue
        # single char
        if len(term) == 1:
          continue
        
        # lemma
        term = wnl.lemmatize(term)
        # stem
        term = ps.stem(term)

        # pure number
        try:
          _ = int(term)
          continue
        except ValueError as err:
          pass

        new_terms.append(term)

      clean_sent = ' '.join(new_terms)

      if istrain:
        record = (row.Id, row.Abstract, row._3, sentid, clean_sent, tasks[sentidx])
      else:
        record = (row.Id, row.Abstract, sentid, clean_sent)

      records.append(record)

  return records

In [0]:
clean_trainset = data_preprocess(trainset)
clean_valset = data_preprocess(validset)
clean_testset = data_preprocess(dataset, istrain=False)

train_cols = ['Id', 'Abstract', 'Task1', 'Sentid', 'clean_abstract', 'task']
test_cols = ['Id', 'Abstract', 'Sentid', 'clean_abstract']
train = pd.DataFrame(clean_trainset, columns=train_cols)
valid = pd.DataFrame(clean_valset, columns=train_cols)
test = pd.DataFrame(clean_testset, columns=test_cols)

In [0]:
train.head()

Unnamed: 0,Id,Abstract,Task1,Sentid,clean_abstract,task
0,D05945,The Wasserstein metric or earth mover's distan...,BACKGROUND BACKGROUND BACKGROUND/OBJECTIVES ME...,D05945_S001,wasserstein metric earth mover s distanc emd ...,BACKGROUND
1,D05945,The Wasserstein metric or earth mover's distan...,BACKGROUND BACKGROUND BACKGROUND/OBJECTIVES ME...,D05945_S002,especi light increasingli complex data comput ...,BACKGROUND
2,D05945,The Wasserstein metric or earth mover's distan...,BACKGROUND BACKGROUND BACKGROUND/OBJECTIVES ME...,D05945_S003,inspir challeng varieti new approach optim tra...,BACKGROUND/OBJECTIVES
3,D05945,The Wasserstein metric or earth mover's distan...,BACKGROUND BACKGROUND BACKGROUND/OBJECTIVES ME...,D05945_S004,paper introduc benchmark discret optim transpo...,METHODS
4,D05945,The Wasserstein metric or earth mover's distan...,BACKGROUND BACKGROUND BACKGROUND/OBJECTIVES ME...,D05945_S005,consist varieti grayscal imag variou resolut c...,METHODS


In [0]:
# 輸出處理完的原始檔案
modelPath = '/content/gdrive/My Drive/labeling_cowork_folder/TFIDF/'
train.to_csv(modelPath + 'train_tfidf_aft_proc.csv', index=False)
valid.to_csv(modelPath + 'valid_tfidf_aft_proc.csv', index=False)
test.to_csv(modelPath + 'test_tfidf_aft_proc.csv', index=False)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_doc = pd.concat([train['clean_abstract'], valid['clean_abstract'], test['clean_abstract']])
vectorizer = TfidfVectorizer(min_df=2, max_features=1500)
vectorizer.fit(tfidf_doc.values.tolist())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1500,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [0]:
train_X = vectorizer.transform(train['clean_abstract'].values.tolist())
test_X = vectorizer.transform(test['clean_abstract'].values.tolist())
valid_X = vectorizer.transform(valid['clean_abstract'].values.tolist())

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [0]:
# train所有類別組合的模型
# clf = RandomForestClassifier(n_estimators=100, n_jobs=4)
# clf = clf.fit(train_X, train['task'])
# print(classification_report(train['task'], clf.predict(train_X)))
# print(classification_report(valid['task'], clf.predict(valid_X)))

In [0]:
# 分開train各別組合的模型 # 'BACKGROUND' 'OBJECTIVES' 'METHODS' 'RESULTS' 'CONCLUSIONS'
train_y_bgd = np.array([int('BACKGROUND' in task) for task in train['task'].values])
clf_bgd = RandomForestClassifier(n_estimators=100, n_jobs=4, class_weight={0: 1, 1: 3})
# clf_bgd = SVC()
clf_bgd.fit(train_X, train_y_bgd)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 3},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=4, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [0]:
print(classification_report(train_y_bgd, clf_bgd.predict(train_X)))
valid_y_bgd = np.array([int('BACKGROUND' in task) for task in valid['task'].values])
print(classification_report(valid_y_bgd, clf_bgd.predict(valid_X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30181
           1       1.00      1.00      1.00     11910

    accuracy                           1.00     42091
   macro avg       1.00      1.00      1.00     42091
weighted avg       1.00      1.00      1.00     42091

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      3333
           1       0.72      0.50      0.59      1443

    accuracy                           0.79      4776
   macro avg       0.77      0.71      0.72      4776
weighted avg       0.78      0.79      0.78      4776



In [0]:
train_y_obj = np.array([int('OBJECTIVES' in task) for task in train['task'].values])
clf_obj = RandomForestClassifier(n_estimators=100, n_jobs=4, class_weight={0: 1, 1: 2})  # 
# clf_obj = SVC()  # 
clf_obj.fit(train_X, train_y_obj)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 2},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=4, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [0]:
print(classification_report(train_y_obj, clf_obj.predict(train_X)))
valid_y_obj = np.array([int('OBJECTIVES' in task) for task in valid['task'].values])
print(classification_report(valid_y_obj, clf_obj.predict(valid_X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     33699
           1       1.00      1.00      1.00      8392

    accuracy                           1.00     42091
   macro avg       1.00      1.00      1.00     42091
weighted avg       1.00      1.00      1.00     42091

              precision    recall  f1-score   support

           0       0.84      0.96      0.90      3839
           1       0.65      0.27      0.39       937

    accuracy                           0.83      4776
   macro avg       0.75      0.62      0.64      4776
weighted avg       0.81      0.83      0.80      4776



In [0]:
train_y_mtd = np.where(train['task'].values == 'METHODS', 1, 0)
clf_mtd = RandomForestClassifier(n_estimators=100, n_jobs=4, class_weight={0: 1, 1: 2})
clf_mtd.fit(train_X, train_y_mtd)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 2},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=4, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [0]:
print(classification_report(train_y_mtd, clf_mtd.predict(train_X)))
valid_y_mtd = np.where(valid['task'].values == 'METHODS', 1, 0)
print(classification_report(valid_y_mtd, clf_mtd.predict(valid_X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     32660
           1       1.00      1.00      1.00      9431

    accuracy                           1.00     42091
   macro avg       1.00      1.00      1.00     42091
weighted avg       1.00      1.00      1.00     42091

              precision    recall  f1-score   support

           0       0.80      0.97      0.88      3736
           1       0.59      0.15      0.24      1040

    accuracy                           0.79      4776
   macro avg       0.70      0.56      0.56      4776
weighted avg       0.76      0.79      0.74      4776



In [0]:
train_y_rslt = np.where(train['task'].values == 'RESULTS', 1, 0)
clf_rslt = RandomForestClassifier(n_estimators=100, n_jobs=4, class_weight={0: 1, 1: 2})
clf_rslt.fit(train_X, train_y_rslt)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 2},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=4, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [0]:
print(classification_report(train_y_rslt, clf_rslt.predict(train_X)))
valid_y_rslt = np.where(valid['task'].values == 'RESULTS', 1, 0)
print(classification_report(valid_y_rslt, clf_rslt.predict(valid_X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     35040
           1       1.00      1.00      1.00      7051

    accuracy                           1.00     42091
   macro avg       1.00      1.00      1.00     42091
weighted avg       1.00      1.00      1.00     42091

              precision    recall  f1-score   support

           0       0.86      0.98      0.91      4014
           1       0.56      0.16      0.24       762

    accuracy                           0.85      4776
   macro avg       0.71      0.57      0.58      4776
weighted avg       0.81      0.85      0.81      4776



In [0]:
train_y_cnclsn = np.where(train['task'].values == 'CONCLUSIONS', 1, 0)
clf_cnclsn = RandomForestClassifier(n_estimators=100, n_jobs=4, class_weight={0: 1, 1: 2})
clf_cnclsn.fit(train_X, train_y_cnclsn)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 2},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=4, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [0]:
print(classification_report(train_y_cnclsn, clf_cnclsn.predict(train_X)))
valid_y_cnclsn = np.where(valid['task'].values == 'CONCLUSIONS', 1, 0)
print(classification_report(valid_y_cnclsn, clf_cnclsn.predict(valid_X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39724
           1       1.00      1.00      1.00      2367

    accuracy                           1.00     42091
   macro avg       1.00      1.00      1.00     42091
weighted avg       1.00      1.00      1.00     42091

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4493
           1       0.50      0.01      0.03       283

    accuracy                           0.94      4776
   macro avg       0.72      0.51      0.50      4776
weighted avg       0.92      0.94      0.91      4776



In [0]:
class F1():
    def __init__(self):
        # self.threshold = 0.5
        self.n_precision = 0
        self.n_recall = 0
        self.n_corrects = 0
        self.name = 'F1'

    def reset(self):
        self.n_precision = 0
        self.n_recall = 0
        self.n_corrects = 0

    def update(self, predicts, groundTruth):
        # predicts = predicts > self.threshold
        # np.sum(x), np.sum(y), np.sum(x*y)
        # self.n_precision += torch.sum(predicts).data.item()
        # self.n_recall += torch.sum(groundTruth).data.item()
        # self.n_corrects += torch.sum(groundTruth.type(torch.uint8) * predicts).data.item()
        # # predicts = predicts > self.threshold
        # np.sum(x), np.sum(y), np.sum(x*y)
        self.n_precision += np.sum(predicts)
        self.n_recall += np.sum(groundTruth)
        self.n_corrects += np.sum(groundTruth * predicts)

    def get_score(self):
        recall = self.n_corrects / self.n_recall
        precision = self.n_corrects / (self.n_precision + 1e-20) #prevent divided by zero
        return 2 * (recall * precision) / (recall + precision + 1e-20)

    def print_score(self):
        score = self.get_score()
        return '{:.5f}'.format(score)


In [0]:
f1_score = F1()

In [0]:
pred_class1 = clf_bgd.predict(valid_X)
pred_class2 = clf_obj.predict(valid_X)
pred_class3 = clf_mtd.predict(valid_X)
pred_class4 = clf_rslt.predict(valid_X)
pred_class5 = clf_cnclsn.predict(valid_X)

In [0]:
predicts = []

for predict in zip(pred_class1, pred_class2, pred_class3, pred_class4, pred_class5):
    ans = list(predict)
    check_pred_class = sum(ans)
    if check_pred_class == 0:
        ans.append(1)
    else:
        ans.append(0)
    
    predicts.append(np.array(ans))
    


In [0]:
predicts[:3]

[array([0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1]),
 array([0, 0, 0, 0, 0, 1])]

In [0]:
label_dict = {'BACKGROUND':0, 'OBJECTIVES':1, 'METHODS':2, 'RESULTS':3, 'CONCLUSIONS':4, 'OTHERS':5}
groundTruth = []
for label in valid['task'].tolist():
    onehot = [0,0,0,0,0,0]
    for l in label.split('/'):
        onehot[label_dict[l]] = 1
    
    groundTruth.append(np.array(onehot))

In [0]:
groundTruth[:3]

[array([0, 1, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 0]),
 array([0, 0, 1, 0, 0, 0])]

In [0]:
f1_score = F1()
for i in range(0, len(groundTruth), 1):
    f1_score.update(predicts[i], groundTruth[i])

len(groundTruth), len(predicts)

(4776, 4776)

In [0]:
f1_score.get_score()

0.26994577846630513

In [0]:
x = np.array([1, 1, 0, 0, 0])
y = np.array([0, 1, 1, 1, 0])

np.sum(x), np.sum(y), np.sum(x*y)

(2, 3, 1)