In [25]:
# Japanese

In [1]:
# Train on English tweets translated into Japanese; evaluate on the Japanese test set

import pandas as pd
import numpy as np

# Training tweets (the "ja_from_en" file: Japanese text derived from the English corpus).
train_from_en = pd.read_csv(
    "NTCIR-13_MedWeb_ja_from_en_amazon_training.csv",
)

# Test tweets. pandas' default NA markers are switched off, so only the literal
# string 'NaN' is parsed as a missing value; every other cell is kept as-is.
test = pd.read_csv(
    "NTCIR-13_MedWeb_ja_test.csv",
    keep_default_na=False,
    na_values='NaN',
)

# Remove punctuation from the tweet text
import re
from zhon.hanzi import punctuation

def clean_text(df, text_field):
    """Strip the characters in ``zhon.hanzi.punctuation`` from one text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. The column is overwritten in place, so the
        caller's frame is modified.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Only characters listed in ``zhon.hanzi.punctuation`` are removed; other
    punctuation is left untouched. Cells that are not strings (for example
    ``NaN`` from a missing value) are passed through unchanged instead of
    raising ``TypeError`` inside ``re.sub``.
    """
    # Compile the character class once rather than re-formatting it per row.
    punct_run = re.compile(r'[{}]+'.format(punctuation))

    def _strip(elem):
        # Leave non-string cells alone; the regex can only operate on str.
        return punct_run.sub('', elem) if isinstance(elem, str) else elem

    df[text_field] = df[text_field].apply(_strip)

    return df

# Strip punctuation from the "Tweet" column of both frames ...
train_from_en = clean_text(train_from_en, "Tweet")
test = clean_text(test, "Tweet")

# ... then pull the cleaned tweets out as plain Python lists for the vectorizer.
tweet_train_from_en = train_from_en["Tweet"].tolist()
tweet_test = test["Tweet"].tolist()

# Add Japanese tokenizer
import nagisa

def tokenize_jp(doc):
    """Tokenize a Japanese string with nagisa.

    Parameters
    ----------
    doc : str
        Text to segment.

    Returns
    -------
    The ``words`` attribute of nagisa's tagging result for ``doc``
    (the segmented tokens; part-of-speech tags are discarded).
    """
    return nagisa.tagging(doc).words

# Label columns: one per symptom, read from both the training and the test frame.
categories = ['Influenza','Diarrhea','Hayfever','Cough','Headache','Fever','Runnynose','Cold']

# Encode the string labels as integers: 'n' -> 0, 'p' -> 1
# (presumably negative / positive - confirm against the dataset description).
# The explicit cast keeps the targets numeric on every pandas version: pandas 2.2
# deprecates the implicit object -> int downcast that replace() used to perform,
# and object-dtype targets are rejected by scikit-learn classifiers.
# NOTE: astype raises if a column holds anything other than 'n' / 'p'.
y_train = train_from_en[categories].replace({'n': 0, 'p': 1}).astype("int64")
y_test = test[categories].replace({'n': 0, 'p': 1}).astype("int64")

# BOW
from sklearn.feature_extraction.text import CountVectorizer
# Tokens (digits and punctuation marks) that must not become vocabulary entries.
stop_words = ['!','0','1','2','3','4','6','8','9','?','、','。','〜','・','(',')',',','-','.','...','/']

# Learn the bag-of-words vocabulary on the training tweets only.
vectorizer = CountVectorizer(tokenizer=tokenize_jp, stop_words=stop_words)
X_train = vectorizer.fit_transform(tweet_train_from_en).toarray()

# Vocabulary terms ordered by their column index in X_train. This is the list
# get_feature_names() used to return; that method was deprecated in
# scikit-learn 1.0 and removed in 1.2, so it is rebuilt from vocabulary_ to
# work on every version.
feature = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Project the test tweets onto the training vocabulary so that X_test has the
# same columns as X_train. Stop words need not be repeated here: they are not
# part of the fixed vocabulary, so they are never counted.
vectorizer_test = CountVectorizer(tokenizer=tokenize_jp,vocabulary=vectorizer.vocabulary_)
X_test = vectorizer_test.transform(tweet_test).toarray()

print("The vocabulary contains {} unique tokens".format(len(feature)))

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier


# Single-step pipeline: a one-vs-rest wrapper around an L2-regularised,
# class-balanced logistic regression.
LogReg_pipeline = Pipeline(steps=[
    (
        'clf',
        OneVsRestClassifier(
            LogisticRegression(
                penalty='l2',
                C=10,
                solver='lbfgs',
                max_iter=2500,
                class_weight='balanced',
            ),
            n_jobs=-1,
        ),
    ),
])

# Fit and score one binary model per symptom; the same pipeline object is
# refit on each pass of the loop.
for category in categories:
    print('**Processing {} sysptoms...**'.format(category))

    # Train on the current symptom's labels, then predict the test set.
    LogReg_pipeline.fit(X_train, y_train[category].values)
    prediction = LogReg_pipeline.predict(X_test)

    # Report accuracy plus precision / recall / F1 for the positive class.
    print('Test accuracy is {}'.format(
        accuracy_score(y_test[category], prediction)))
    print('Test precision is {}'.format(
        precision_score(y_test[category], prediction, average='binary')))
    print('Test recall is {}'.format(
        recall_score(y_test[category], prediction, average='binary')))
    print('Test f1-score is {}\n'.format(
        f1_score(y_test[category], prediction, average='binary')))

The vocabulary contains 2016 unique tokens
**Processing Influenza sysptoms...**
Test accuracy is 0.9671875
Test precision is 0.6153846153846154
Test recall is 0.3333333333333333
Test f1-score is 0.43243243243243246

**Processing Diarrhea sysptoms...**
Test accuracy is 0.9734375
Test precision is 0.8133333333333334
Test recall is 0.953125
Test f1-score is 0.8776978417266187

**Processing Hayfever sysptoms...**
Test accuracy is 0.928125
Test precision is 0.0
Test recall is 0.0
Test f1-score is 0.0

**Processing Cough sysptoms...**


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9765625
Test precision is 0.9012345679012346
Test recall is 0.9125
Test f1-score is 0.9068322981366459

**Processing Headache sysptoms...**
Test accuracy is 0.975
Test precision is 0.8426966292134831
Test recall is 0.974025974025974
Test f1-score is 0.9036144578313253

**Processing Fever sysptoms...**
Test accuracy is 0.8984375
Test precision is 0.6891891891891891
Test recall is 0.5483870967741935
Test f1-score is 0.6107784431137725

**Processing Runnynose sysptoms...**
Test accuracy is 0.934375
Test precision is 0.8461538461538461
Test recall is 0.8048780487804879
Test f1-score is 0.8250000000000001

**Processing Cold sysptoms...**
Test accuracy is 0.9359375
Test precision is 0.7333333333333333
Test recall is 0.8555555555555555
Test f1-score is 0.7897435897435897



In [4]:
# Raise NumPy's summarisation threshold to infinity so that arrays are printed
# in full instead of being abbreviated with "...".
np.set_printoptions(threshold=np.inf) 

In [8]:
# Training tweets from the "ja_from_zh" file (Japanese text derived from the
# Chinese corpus): load the CSV and strip punctuation from the "Tweet" column.
train_from_zh = clean_text(
    pd.read_csv("NTCIR-13_MedWeb_ja_from_zh_amazon_training.csv"),
    "Tweet",
)

# Cleaned tweets as a plain Python list for the vectorizer.
tweet_train_from_zh = train_from_zh["Tweet"].tolist()

from sklearn.feature_extraction.text import CountVectorizer
# Tokens (digits and punctuation marks) that must not become vocabulary entries.
stop_words = ['!','0','1','2','3','4','6','8','9','?','、','。','〜','・','(',')',',','-','.','...','/']

# Learn the bag-of-words vocabulary on the zh-derived training tweets.
# NOTE: this rebinds vectorizer / X_train / feature / X_test, replacing the
# values computed for the en-derived corpus earlier in the notebook.
vectorizer = CountVectorizer(tokenizer=tokenize_jp, stop_words=stop_words)
X_train = vectorizer.fit_transform(tweet_train_from_zh).toarray()

# Vocabulary terms ordered by their column index in X_train. This is the list
# get_feature_names() used to return; that method was deprecated in
# scikit-learn 1.0 and removed in 1.2, so it is rebuilt from vocabulary_ to
# work on every version.
feature = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Project the test tweets onto this vocabulary so that X_test has the same
# columns as X_train. Stop words are not in the fixed vocabulary, so they are
# never counted.
vectorizer_test = CountVectorizer(tokenizer=tokenize_jp,vocabulary=vectorizer.vocabulary_)
X_test = vectorizer_test.transform(tweet_test).toarray()

print("The vocabulary contains {} unique tokens".format(len(feature)))

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier


# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(C=10, solver='lbfgs', penalty='l2', max_iter=2500, class_weight='balanced'), n_jobs=-1)),
            ])

# Take the training labels from the same frame the features were built from.
# The global y_train was derived from train_from_en; pairing it with
# zh-derived features is only correct if both CSVs list the tweets in exactly
# the same order, which nothing here checks.
# Assumes the zh-derived CSV carries the same label columns - TODO confirm.
# 'n' -> 0, 'p' -> 1; the explicit cast keeps the targets integer-typed.
y_train_zh = train_from_zh[categories].replace({'n': 0, 'p': 1}).astype("int64")

# Fit and score one binary model per symptom; the pipeline is refit each pass.
for category in categories:
    print('**Processing {} sysptoms...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(X_train, y_train_zh[category].values)
    
    # Accuracy plus precision / recall / F1 for the positive class on the test set
    prediction = LogReg_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(y_test[category], prediction)))
    print('Test precision is {}'.format(precision_score(y_test[category], prediction, average='binary')))
    print('Test recall is {}'.format(recall_score(y_test[category], prediction, average='binary')))
    print('Test f1-score is {}\n'.format(f1_score(y_test[category], prediction, average='binary')))

The vocabulary contains 2052 unique tokens
**Processing Influenza sysptoms...**
Test accuracy is 0.95625
Test precision is 0.4
Test recall is 0.3333333333333333
Test f1-score is 0.3636363636363636

**Processing Diarrhea sysptoms...**
Test accuracy is 0.9703125
Test precision is 0.8
Test recall is 0.9375
Test f1-score is 0.8633093525179856

**Processing Hayfever sysptoms...**
Test accuracy is 0.978125
Test precision is 0.7758620689655172
Test recall is 0.9782608695652174
Test f1-score is 0.8653846153846154

**Processing Cough sysptoms...**
Test accuracy is 0.9828125
Test precision is 0.9156626506024096
Test recall is 0.95
Test f1-score is 0.9325153374233127

**Processing Headache sysptoms...**
Test accuracy is 0.971875
Test precision is 0.8241758241758241
Test recall is 0.974025974025974
Test f1-score is 0.8928571428571428

**Processing Fever sysptoms...**
Test accuracy is 0.871875
Test precision is 0.5555555555555556
Test recall is 0.5913978494623656
Test f1-score is 0.5729166666666666