In [1]:
# Chinese translated into English

import pandas as pd
import numpy as np

# Load the "en_from_zh" training CSV (English text translated from Chinese, per
# the file name); the "Tweet" column and the label columns are read from it below.
train_from_zh = pd.read_csv("NTCIR-13_MedWeb_en_from_zh_amazon_training.csv")

# remove punctuation inside
import re
def clean_text(df, text_field):
    """Lower-case one text column and strip noise from it, modifying ``df`` in place.

    The substitution removes @mentions, every character that is not an ASCII
    letter, digit, space or tab, ``scheme://...`` URLs, a leading "rt", and
    "http" together with the single character that follows it.

    Args:
        df: DataFrame holding the text column (mutated).
        text_field: name of the column to clean.

    Returns:
        The same ``df`` object, for convenient assignment by the caller.
    """
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

    def scrub(elem):
        return noise.sub("", elem)

    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(scrub)
    return df

# update
# Clean the tweets (clean_text edits train_from_zh in place and returns it).
clean_train_from_zh = clean_text(train_from_zh,"Tweet")

# lemmatization
import spacy
from vectorizers import SpacyLemmatizer

import en_core_web_sm
nlp = en_core_web_sm.load()
lemmatizer = SpacyLemmatizer(nlp, join_str=' ', n_threads=1)
lemmas_train_from_zh = lemmatizer(list(clean_train_from_zh.Tweet))

# convert to list
tweet_train_from_zh = list(lemmas_train_from_zh)

# add stopwords: NLTK's English list plus corpus-specific fillers and numbers
import nltk
from nltk.corpus import stopwords
stop = list(stopwords.words('english'))
stop.extend('haha ugghh ugh uh um oh ok okay boo damn god yu yike yen yay mikos mitsuru shirasu lol sigh 10 100 10000 1004 102 104 320000 968' .split())

# using BOWs
from sklearn.feature_extraction.text import CountVectorizer
# stop_words is documented as 'english', a list, or None; recent scikit-learn
# releases validate the type, so pass a de-duplicated list rather than a set.
vectorizer = CountVectorizer(stop_words=sorted(set(stop)))
X = vectorizer.fit_transform(tweet_train_from_zh).toarray()

# assign labels: 'n' -> 0, 'p' -> 1
categories = ['Influenza','Diarrhea','Hayfever','Cough','Headache','Fever','Runnynose','Cold']
# Cast explicitly: relying on replace() to downcast object columns to integers
# is deprecated in pandas, and an object-dtype y is not a valid multilabel target.
y = train_from_zh[categories].replace({'n':0, 'p':+1}).astype("int64")
y = y.values

# generate new file
import csv

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

src_file_name1 = "NTCIR-13_MedWeb_en_training.csv"  # original-language data used as replacement
tran_file_name2 = "NTCIR-13_MedWeb_en_from_zh_amazon_training.csv"  # source file path
gen_file_name = "error0.05_update_en_from_zh_training.csv"  # target file path

# A row is swapped for its original when its mean absolute error reaches this value.
ERROR_THRESHOLD = 0.05

# Read both CSVs once so rows can be fetched by position, instead of rescanning
# the files from the top for every sample. The handles are closed right away.
with open(src_file_name1, 'r', encoding='UTF-8', newline='') as f1:
    rows = list(csv.reader(f1))
with open(tran_file_name2, 'r', encoding='UTF-8', newline='') as f2:
    rows_tran = list(csv.reader(f2))

# Sample i of X/y is looked up as CSV row i + 1 (row 0 is the header), so both
# files must contain at least a header plus one row per sample.
needed_rows = len(y) + 1
if len(rows) < needed_rows or len(rows_tran) < needed_rows:
    raise ValueError(
        "Both CSV files need a header plus {} data rows; got {} and {} rows.".format(
            len(y), len(rows), len(rows_tran)))

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(
        LogisticRegression(C=10, solver='lbfgs', penalty='l2', max_iter=3000, class_weight='balanced'),
        n_jobs=-1)),
])

# NOTE(review): newer scikit-learn versions may reject random_state when shuffle
# is left False. Adding shuffle=True would change the folds, so confirm intent first.
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=0)

# count number of sentences replaced
change_num = 0

# Write as UTF-8 explicitly (the inputs are UTF-8); the with-block closes the
# file even if training fails part-way through.
with open(gen_file_name, 'w', newline='', encoding='UTF-8') as f:
    write = csv.writer(f)
    write.writerow(["ID","Tweet","Influenza","Diarrhea","Hayfever","Cough","Headache","Fever","Runnynose","Cold"])

    for train_index, test_index in mskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        LogReg_pipeline.fit(X_train, y_train)
        prediction = LogReg_pipeline.predict_proba(X_test)
        print("One Batch Complete >>>")

        row, col = y_test.shape

        for r in range(row):
            # Mean absolute difference between the labels and predicted probabilities.
            total_error = 0
            for c in range(col):
                total_error += abs(y_test[r, c] - prediction[r, c])
            ava_error = total_error / col

            # replace or keep sentence (rows are emitted in fold order)
            if ava_error >= ERROR_THRESHOLD:
                change_num += 1
                write.writerow(rows[test_index[r] + 1])
            else:
                write.writerow(rows_tran[test_index[r] + 1])

print("\nThere are total {} sentences replaced by thier original data.".format(change_num))

One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>

There are total 738 sentences replaced by thier original data.


In [2]:
# Japanese translated into English

import pandas as pd
import numpy as np

# Load the "en_from_ja" training CSV (English text translated from Japanese, per
# the file name); the "Tweet" column and the label columns are read from it below.
train_from_ja = pd.read_csv("NTCIR-13_MedWeb_en_from_ja_amazon_training.csv")

# remove punctuation inside
import re
def clean_text(df, text_field):
    """Lower-case one text column and strip noise from it, modifying ``df`` in place.

    The substitution removes @mentions, every character that is not an ASCII
    letter, digit, space or tab, ``scheme://...`` URLs, a leading "rt", and
    "http" together with the single character that follows it.

    Args:
        df: DataFrame holding the text column (mutated).
        text_field: name of the column to clean.

    Returns:
        The same ``df`` object, for convenient assignment by the caller.
    """
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

    def scrub(elem):
        return noise.sub("", elem)

    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(scrub)
    return df

# update
# Clean the tweets (clean_text edits train_from_ja in place and returns it).
clean_train_from_ja = clean_text(train_from_ja,"Tweet")

# lemmatization
import spacy
from vectorizers import SpacyLemmatizer

import en_core_web_sm
nlp = en_core_web_sm.load()
lemmatizer = SpacyLemmatizer(nlp, join_str=' ', n_threads=1)
lemmas_train_from_ja = lemmatizer(list(clean_train_from_ja.Tweet))

# convert to list
tweet_train_from_ja = list(lemmas_train_from_ja)

# add stopwords: NLTK's English list plus corpus-specific fillers and numbers
import nltk
from nltk.corpus import stopwords
stop = list(stopwords.words('english'))
stop.extend('haha ugghh ugh uh um oh ok okay boo damn god yu yike yen yay mikos mitsuru shirasu lol sigh 10 100 10000 1004 102 104 320000 968' .split())

# using BOWs
from sklearn.feature_extraction.text import CountVectorizer
# stop_words is documented as 'english', a list, or None; recent scikit-learn
# releases validate the type, so pass a de-duplicated list rather than a set.
vectorizer = CountVectorizer(stop_words=sorted(set(stop)))
X = vectorizer.fit_transform(tweet_train_from_ja).toarray()

# assign labels: 'n' -> 0, 'p' -> 1
categories = ['Influenza','Diarrhea','Hayfever','Cough','Headache','Fever','Runnynose','Cold']
# Cast explicitly: relying on replace() to downcast object columns to integers
# is deprecated in pandas, and an object-dtype y is not a valid multilabel target.
y = train_from_ja[categories].replace({'n':0, 'p':+1}).astype("int64")
y = y.values

# generate new file

import csv

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

src_file_name1 = "NTCIR-13_MedWeb_en_training.csv"  # original-language data used as replacement
tran_file_name2 = "NTCIR-13_MedWeb_en_from_ja_amazon_training.csv"  # source file path
gen_file_name = "error0.05_update_en_from_ja_training.csv"  # target file path

# A row is swapped for its original when its mean absolute error reaches this value.
ERROR_THRESHOLD = 0.05

# Read both CSVs once so rows can be fetched by position, instead of rescanning
# the files from the top for every sample. The handles are closed right away.
with open(src_file_name1, 'r', encoding='UTF-8', newline='') as f1:
    rows = list(csv.reader(f1))
with open(tran_file_name2, 'r', encoding='UTF-8', newline='') as f2:
    rows_tran = list(csv.reader(f2))

# Sample i of X/y is looked up as CSV row i + 1 (row 0 is the header), so both
# files must contain at least a header plus one row per sample.
needed_rows = len(y) + 1
if len(rows) < needed_rows or len(rows_tran) < needed_rows:
    raise ValueError(
        "Both CSV files need a header plus {} data rows; got {} and {} rows.".format(
            len(y), len(rows), len(rows_tran)))

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(
        LogisticRegression(C=10, solver='lbfgs', penalty='l2', max_iter=3000, class_weight='balanced'),
        n_jobs=-1)),
])

# NOTE(review): newer scikit-learn versions may reject random_state when shuffle
# is left False. Adding shuffle=True would change the folds, so confirm intent first.
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=0)

# count number of sentences replaced
change_num = 0

# Write as UTF-8 explicitly (the inputs are UTF-8); the with-block closes the
# file even if training fails part-way through.
with open(gen_file_name, 'w', newline='', encoding='UTF-8') as f:
    write = csv.writer(f)
    write.writerow(["ID","Tweet","Influenza","Diarrhea","Hayfever","Cough","Headache","Fever","Runnynose","Cold"])

    for train_index, test_index in mskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        LogReg_pipeline.fit(X_train, y_train)
        prediction = LogReg_pipeline.predict_proba(X_test)
        print("One Batch Complete >>>")

        row, col = y_test.shape

        for r in range(row):
            # Mean absolute difference between the labels and predicted probabilities.
            total_error = 0
            for c in range(col):
                total_error += abs(y_test[r, c] - prediction[r, c])
            ava_error = total_error / col

            # replace or keep sentence (rows are emitted in fold order)
            if ava_error >= ERROR_THRESHOLD:
                change_num += 1
                write.writerow(rows[test_index[r] + 1])
            else:
                write.writerow(rows_tran[test_index[r] + 1])

print("\nThere are total {} sentences replaced by thier original data.".format(change_num))

One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>

There are total 677 sentences replaced by thier original data.


In [3]:
# English translated into Japanese

import pandas as pd
import numpy as np

# Load the "ja_from_en" training CSV (Japanese text translated from English, per
# the file name); the "Tweet" column and the label columns are read from it below.
train_from_en = pd.read_csv("NTCIR-13_MedWeb_ja_from_en_amazon_training.csv")

# remove punctuations inside
import re
from zhon.hanzi import punctuation

def clean_text(df, text_field):
    """Delete every run of characters found in ``punctuation`` from one column.

    ``punctuation`` is the character set imported from ``zhon.hanzi``. The
    column is overwritten on ``df`` itself.

    Args:
        df: DataFrame holding the text column (mutated).
        text_field: name of the column to clean.

    Returns:
        The same ``df`` object.
    """
    column = df[text_field]
    punct_runs = r'[{}]+'.format(punctuation)

    def strip_punct(elem):
        return re.sub(punct_runs, '', elem)

    df[text_field] = column.apply(strip_punct)
    return df

# update

# Strip punctuation from the tweets, then collect them into a plain list
# for the vectorizer.
train_from_en = clean_text(train_from_en, "Tweet")
tweet_train_from_en = list(train_from_en["Tweet"])

# Add Japanese tokenizer
import nagisa

def tokenize_jp(doc):
    """Tokenize a Japanese string with nagisa and return the ``words`` of the tagging result."""
    return nagisa.tagging(doc).words

# BOW
from sklearn.feature_extraction.text import CountVectorizer
# Digits and punctuation tokens that should not become vocabulary entries.
stop_words = ['!','0','1','2','3','4','6','8','9','?','、','。','〜','・','(',')',',','-','.','...','/']
vectorizer = CountVectorizer(tokenizer=tokenize_jp, stop_words=stop_words)
X = vectorizer.fit_transform(tweet_train_from_en).toarray()

# assigned labels: 'n' -> 0, 'p' -> 1
categories = ['Influenza','Diarrhea','Hayfever','Cough','Headache','Fever','Runnynose','Cold']
# Cast explicitly: relying on replace() to downcast object columns to integers
# is deprecated in pandas, and an object-dtype y is not a valid multilabel target.
y = train_from_en[categories].replace({'n':0, 'p':+1}).astype("int64")
y = y.values

# count number of sentences replaced
change_num = 0

# generate new file

import csv

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

src_file_name1 = "NTCIR-13_MedWeb_ja_training.csv"  # original-language data used as replacement
tran_file_name2 = "NTCIR-13_MedWeb_ja_from_en_amazon_training.csv"  # source file path
gen_file_name = "error0.05_update_ja_from_en_training.csv"  # target file path

# A row is swapped for its original when its mean absolute error reaches this value.
ERROR_THRESHOLD = 0.05

# Read both CSVs once so rows can be fetched by position, instead of rescanning
# the files from the top for every sample. The handles are closed right away.
with open(src_file_name1, 'r', encoding='utf-8', newline='') as f1:
    rows = list(csv.reader(f1))
with open(tran_file_name2, 'r', encoding='utf-8', newline='') as f2:
    rows_tran = list(csv.reader(f2))

# Sample i of X/y is looked up as CSV row i + 1 (row 0 is the header), so both
# files must contain at least a header plus one row per sample.
needed_rows = len(y) + 1
if len(rows) < needed_rows or len(rows_tran) < needed_rows:
    raise ValueError(
        "Both CSV files need a header plus {} data rows; got {} and {} rows.".format(
            len(y), len(rows), len(rows_tran)))

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(
        LogisticRegression(C=10, solver='lbfgs', penalty='l2', max_iter=3000, class_weight='balanced'),
        n_jobs=-1)),
])

# NOTE(review): newer scikit-learn versions may reject random_state when shuffle
# is left False. Adding shuffle=True would change the folds, so confirm intent first.
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=0)

# Write as UTF-8 explicitly: the rows contain Japanese text and the platform
# default encoding may not be able to represent it. The with-block closes the
# file even if training fails part-way through.
with open(gen_file_name, 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerow(["ID","Tweet","Influenza","Diarrhea","Hayfever","Cough","Headache","Fever","Runnynose","Cold"])

    for train_index, test_index in mskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        LogReg_pipeline.fit(X_train, y_train)
        prediction = LogReg_pipeline.predict_proba(X_test)
        print("One Batch Complete >>>")

        row, col = y_test.shape

        for r in range(row):
            # Mean absolute difference between the labels and predicted probabilities.
            total_error = 0
            for c in range(col):
                total_error += abs(y_test[r, c] - prediction[r, c])
            ava_error = total_error / col

            # replace or keep sentence (rows are emitted in fold order)
            if ava_error >= ERROR_THRESHOLD:
                change_num += 1
                write.writerow(rows[test_index[r] + 1])
            else:
                write.writerow(rows_tran[test_index[r] + 1])

print("\nThere are total {} sentences replaced by thier original data.".format(change_num))

One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>

There are total 655 sentences replaced by thier original data.


In [4]:
# Chinese translated into Japanese

import pandas as pd
import numpy as np

# Load the "ja_from_zh" training CSV (Japanese text translated from Chinese, per
# the file name); the "Tweet" column and the label columns are read from it below.
train_from_zh = pd.read_csv("NTCIR-13_MedWeb_ja_from_zh_amazon_training.csv")

# remove punctuations inside
import re
from zhon.hanzi import punctuation

def clean_text(df, text_field):
    """Delete every run of characters found in ``punctuation`` from one column.

    ``punctuation`` is the character set imported from ``zhon.hanzi``. The
    column is overwritten on ``df`` itself.

    Args:
        df: DataFrame holding the text column (mutated).
        text_field: name of the column to clean.

    Returns:
        The same ``df`` object.
    """
    column = df[text_field]
    punct_runs = r'[{}]+'.format(punctuation)

    def strip_punct(elem):
        return re.sub(punct_runs, '', elem)

    df[text_field] = column.apply(strip_punct)
    return df

# update

# Strip punctuation from the tweets, then collect them into a plain list
# for the vectorizer.
train_from_zh = clean_text(train_from_zh, "Tweet")
tweet_train_from_zh = list(train_from_zh["Tweet"])

# Add Japanese tokenizer
import nagisa

def tokenize_jp(doc):
    """Tokenize a Japanese string with nagisa and return the ``words`` of the tagging result."""
    return nagisa.tagging(doc).words

# BOW
from sklearn.feature_extraction.text import CountVectorizer
# Digits and punctuation tokens that should not become vocabulary entries.
stop_words = ['!','0','1','2','3','4','6','8','9','?','、','。','〜','・','(',')',',','-','.','...','/']
vectorizer = CountVectorizer(tokenizer=tokenize_jp, stop_words=stop_words)
X = vectorizer.fit_transform(tweet_train_from_zh).toarray()

# assigned labels: 'n' -> 0, 'p' -> 1
categories = ['Influenza','Diarrhea','Hayfever','Cough','Headache','Fever','Runnynose','Cold']
# Cast explicitly: relying on replace() to downcast object columns to integers
# is deprecated in pandas, and an object-dtype y is not a valid multilabel target.
y = train_from_zh[categories].replace({'n':0, 'p':+1}).astype("int64")
y = y.values

# count number of sentences replaced
change_num = 0

# generate new file

import csv

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

src_file_name1 = "NTCIR-13_MedWeb_ja_training.csv"  # original-language data used as replacement
tran_file_name2 = "NTCIR-13_MedWeb_ja_from_zh_amazon_training.csv"  # source file path
gen_file_name = "error0.05_update_ja_from_zh_training.csv"  # target file path

# A row is swapped for its original when its mean absolute error reaches this value.
ERROR_THRESHOLD = 0.05

# Read both CSVs once so rows can be fetched by position, instead of rescanning
# the files from the top for every sample. The handles are closed right away.
with open(src_file_name1, 'r', encoding='utf-8', newline='') as f1:
    rows = list(csv.reader(f1))
with open(tran_file_name2, 'r', encoding='utf-8', newline='') as f2:
    rows_tran = list(csv.reader(f2))

# Sample i of X/y is looked up as CSV row i + 1 (row 0 is the header), so both
# files must contain at least a header plus one row per sample.
needed_rows = len(y) + 1
if len(rows) < needed_rows or len(rows_tran) < needed_rows:
    raise ValueError(
        "Both CSV files need a header plus {} data rows; got {} and {} rows.".format(
            len(y), len(rows), len(rows_tran)))

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(
        LogisticRegression(C=10, solver='lbfgs', penalty='l2', max_iter=3000, class_weight='balanced'),
        n_jobs=-1)),
])

# NOTE(review): newer scikit-learn versions may reject random_state when shuffle
# is left False. Adding shuffle=True would change the folds, so confirm intent first.
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=0)

# Write as UTF-8 explicitly: the rows contain Japanese text and the platform
# default encoding may not be able to represent it. The with-block closes the
# file even if training fails part-way through.
with open(gen_file_name, 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerow(["ID","Tweet","Influenza","Diarrhea","Hayfever","Cough","Headache","Fever","Runnynose","Cold"])

    for train_index, test_index in mskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        LogReg_pipeline.fit(X_train, y_train)
        prediction = LogReg_pipeline.predict_proba(X_test)
        print("One Batch Complete >>>")

        row, col = y_test.shape

        for r in range(row):
            # Mean absolute difference between the labels and predicted probabilities.
            total_error = 0
            for c in range(col):
                total_error += abs(y_test[r, c] - prediction[r, c])
            ava_error = total_error / col

            # replace or keep sentence (rows are emitted in fold order)
            if ava_error >= ERROR_THRESHOLD:
                change_num += 1
                write.writerow(rows[test_index[r] + 1])
            else:
                write.writerow(rows_tran[test_index[r] + 1])

print("\nThere are total {} sentences replaced by thier original data.".format(change_num))

One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>

There are total 696 sentences replaced by thier original data.


In [5]:
# English translated into Chinese

import pandas as pd
import numpy as np

# Load the "zh_from_en" training CSV (Chinese text translated from English, per
# the file name); the "Tweet" column and the label columns are read from it below.
train_from_en = pd.read_csv("NTCIR-13_MedWeb_zh_from_en_amazon_training.csv")

# remove punctuations inside

import re
from zhon.hanzi import punctuation

def clean_text(df, text_field):
    """Delete every run of characters found in ``punctuation`` from one column.

    ``punctuation`` is the character set imported from ``zhon.hanzi``. The
    column is overwritten on ``df`` itself.

    Args:
        df: DataFrame holding the text column (mutated).
        text_field: name of the column to clean.

    Returns:
        The same ``df`` object.
    """
    column = df[text_field]
    punct_runs = r'[{}]+'.format(punctuation)

    def strip_punct(elem):
        return re.sub(punct_runs, '', elem)

    df[text_field] = column.apply(strip_punct)
    return df

# update

# Strip punctuation from the tweets, then collect them into a plain list
# for the vectorizer.
train_from_en = clean_text(train_from_en, "Tweet")
tweet_train_from_en = list(train_from_en["Tweet"])

# import tokenizer

import jieba

def tokenize_zh(text):
    """Segment a Chinese string with jieba and return the list produced by ``lcut``."""
    return jieba.lcut(text)

# assign labels

# Label columns: 'n' -> 0, 'p' -> 1
categories = ['Influenza','Diarrhea','Hayfever','Cough','Headache','Fever','Runnynose','Cold']
# Cast explicitly: relying on replace() to downcast object columns to integers
# is deprecated in pandas, and an object-dtype y is not a valid multilabel target.
y = train_from_en[categories].replace({'n':0, 'p':+1}).astype("int64")
y = y.values

# BOW

from sklearn.feature_extraction.text import CountVectorizer
# Number, filler and punctuation tokens that should not become vocabulary entries.
stop_words = ['1','100','3','32','36','38','39','40',
              'ok','了',' ',',','.','...']
vectorizer = CountVectorizer(tokenizer=tokenize_zh, stop_words=stop_words)
X = vectorizer.fit_transform(tweet_train_from_en).toarray()

# count number of sentences replaced
change_num = 0

# generate new file

import csv

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

src_file_name1 = "NTCIR-13_MedWeb_zh_training.csv"  # original-language data used as replacement
tran_file_name2 = "NTCIR-13_MedWeb_zh_from_en_amazon_training.csv"  # source file path
gen_file_name = "error0.05_update_zh_from_en_training.csv"  # target file path

# A row is swapped for its original when its mean absolute error reaches this value.
ERROR_THRESHOLD = 0.05

# Read both CSVs once so rows can be fetched by position, instead of rescanning
# the files from the top for every sample. The handles are closed right away.
with open(src_file_name1, 'r', encoding='utf-8', newline='') as f1:
    rows = list(csv.reader(f1))
with open(tran_file_name2, 'r', encoding='utf-8', newline='') as f2:
    rows_tran = list(csv.reader(f2))

# Sample i of X/y is looked up as CSV row i + 1 (row 0 is the header), so both
# files must contain at least a header plus one row per sample.
needed_rows = len(y) + 1
if len(rows) < needed_rows or len(rows_tran) < needed_rows:
    raise ValueError(
        "Both CSV files need a header plus {} data rows; got {} and {} rows.".format(
            len(y), len(rows), len(rows_tran)))

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(
        LogisticRegression(C=10, solver='lbfgs', penalty='l2', max_iter=3000, class_weight='balanced'),
        n_jobs=-1)),
])

# NOTE(review): newer scikit-learn versions may reject random_state when shuffle
# is left False. Adding shuffle=True would change the folds, so confirm intent first.
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=0)

# Write as UTF-8 explicitly: the rows contain Chinese text and the platform
# default encoding may not be able to represent it. The with-block closes the
# file even if training fails part-way through.
with open(gen_file_name, 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerow(["ID","Tweet","Influenza","Diarrhea","Hayfever","Cough","Headache","Fever","Runnynose","Cold"])

    for train_index, test_index in mskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        LogReg_pipeline.fit(X_train, y_train)
        prediction = LogReg_pipeline.predict_proba(X_test)
        print("One Batch Complete >>>")

        row, col = y_test.shape

        for r in range(row):
            # Mean absolute difference between the labels and predicted probabilities.
            total_error = 0
            for c in range(col):
                total_error += abs(y_test[r, c] - prediction[r, c])
            ava_error = total_error / col

            # replace or keep sentence (rows are emitted in fold order)
            if ava_error >= ERROR_THRESHOLD:
                change_num += 1
                write.writerow(rows[test_index[r] + 1])
            else:
                write.writerow(rows_tran[test_index[r] + 1])

print("\nThere are total {} sentences replaced by thier original data.".format(change_num))

Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/xz/jrlb55l167x3c2jq466bx5fm0000gn/T/jieba.cache
Loading model from cache /var/folders/xz/jrlb55l167x3c2jq466bx5fm0000gn/T/jieba.cache
Loading model cost 0.849 seconds.
Loading model cost 0.849 seconds.
Prefix dict has been built successfully.
Prefix dict has been built successfully.


One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>

There are total 605 sentences replaced by thier original data.


In [6]:
# Japanese translated into Chinese

import pandas as pd
import numpy as np

# Load the "zh_from_ja" training CSV (Chinese text translated from Japanese, per
# the file name); the "Tweet" column and the label columns are read from it below.
train_from_ja = pd.read_csv("NTCIR-13_MedWeb_zh_from_ja_amazon_training.csv")

# remove punctuations inside

import re
from zhon.hanzi import punctuation

def clean_text(df, text_field):
    """Delete every run of characters found in ``punctuation`` from one column.

    ``punctuation`` is the character set imported from ``zhon.hanzi``. The
    column is overwritten on ``df`` itself.

    Args:
        df: DataFrame holding the text column (mutated).
        text_field: name of the column to clean.

    Returns:
        The same ``df`` object.
    """
    column = df[text_field]
    punct_runs = r'[{}]+'.format(punctuation)

    def strip_punct(elem):
        return re.sub(punct_runs, '', elem)

    df[text_field] = column.apply(strip_punct)
    return df

# update

# Strip punctuation from the tweets, then collect them into a plain list
# for the vectorizer.
train_from_ja = clean_text(train_from_ja, "Tweet")
tweet_train_from_ja = list(train_from_ja["Tweet"])

# import tokenizer

import jieba

def tokenize_zh(text):
    """Segment a Chinese string with jieba and return the list produced by ``lcut``."""
    return jieba.lcut(text)

# assign labels

# Label columns: 'n' -> 0, 'p' -> 1
categories = ['Influenza','Diarrhea','Hayfever','Cough','Headache','Fever','Runnynose','Cold']
# Cast explicitly: relying on replace() to downcast object columns to integers
# is deprecated in pandas, and an object-dtype y is not a valid multilabel target.
y = train_from_ja[categories].replace({'n':0, 'p':+1}).astype("int64")
y = y.values

# BOW

from sklearn.feature_extraction.text import CountVectorizer
# Number, filler and punctuation tokens that should not become vocabulary entries.
stop_words = ['1','100','3','32','36','38','39','40',
              'ok','了',' ',',','.','...']
vectorizer = CountVectorizer(tokenizer=tokenize_zh, stop_words=stop_words)
X = vectorizer.fit_transform(tweet_train_from_ja).toarray()

# count number of sentences replaced
change_num = 0

# generate new file

import csv

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

src_file_name1 = "NTCIR-13_MedWeb_zh_training.csv"  # original-language data used as replacement
tran_file_name2 = "NTCIR-13_MedWeb_zh_from_ja_amazon_training.csv"  # source file path
gen_file_name = "error0.05_update_zh_from_ja_training.csv"  # target file path

# A row is swapped for its original when its mean absolute error reaches this value.
ERROR_THRESHOLD = 0.05

# Read both CSVs once so rows can be fetched by position, instead of rescanning
# the files from the top for every sample. The handles are closed right away.
with open(src_file_name1, 'r', encoding='utf-8', newline='') as f1:
    rows = list(csv.reader(f1))
with open(tran_file_name2, 'r', encoding='utf-8', newline='') as f2:
    rows_tran = list(csv.reader(f2))

# Sample i of X/y is looked up as CSV row i + 1 (row 0 is the header), so both
# files must contain at least a header plus one row per sample.
needed_rows = len(y) + 1
if len(rows) < needed_rows or len(rows_tran) < needed_rows:
    raise ValueError(
        "Both CSV files need a header plus {} data rows; got {} and {} rows.".format(
            len(y), len(rows), len(rows_tran)))

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(
        LogisticRegression(C=10, solver='lbfgs', penalty='l2', max_iter=3000, class_weight='balanced'),
        n_jobs=-1)),
])

# NOTE(review): newer scikit-learn versions may reject random_state when shuffle
# is left False. Adding shuffle=True would change the folds, so confirm intent first.
mskf = MultilabelStratifiedKFold(n_splits=5, random_state=0)

# Write as UTF-8 explicitly: the rows contain Chinese text and the platform
# default encoding may not be able to represent it. The with-block closes the
# file even if training fails part-way through.
with open(gen_file_name, 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerow(["ID","Tweet","Influenza","Diarrhea","Hayfever","Cough","Headache","Fever","Runnynose","Cold"])

    for train_index, test_index in mskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        LogReg_pipeline.fit(X_train, y_train)
        prediction = LogReg_pipeline.predict_proba(X_test)
        print("One Batch Complete >>>")

        row, col = y_test.shape

        for r in range(row):
            # Mean absolute difference between the labels and predicted probabilities.
            total_error = 0
            for c in range(col):
                total_error += abs(y_test[r, c] - prediction[r, c])
            ava_error = total_error / col

            # replace or keep sentence (rows are emitted in fold order)
            if ava_error >= ERROR_THRESHOLD:
                change_num += 1
                write.writerow(rows[test_index[r] + 1])
            else:
                write.writerow(rows_tran[test_index[r] + 1])

print("\nThere are total {} sentences replaced by thier original data.".format(change_num))

One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>
One Batch Complete >>>

There are total 658 sentences replaced by thier original data.
