Skip to content

open-korean-text/open-korean-text-4clj

Repository files navigation

open-korean-text-4clj

Clojars Project Build Status

A Open Korean Text Processor wrapper for Clojure.

Dependencies

Get Started

Dependencies

Leiningen dependency in project.clj (from Clojars):

[open-korean-text-4clj "0.3"]

Maven dependency information in pom.xml:

<dependency>
  <groupId>open-korean-text-4clj</groupId>
  <artifactId>open-korean-text-4clj</artifactId>
  <version>0.3</version>
</dependency>

Usages

normalize

(normalize "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
;=> "한국어를 처리하는 예시입니다ㅋㅋㅋ"

tokenize

  • default operation
(tokenize "한국어를 처리하는 예시입니닼ㅋㅋ")
;=> [{:text "한국어", :pos :Noun, :offset 0, :length 3, :unknown false}
;    {:text "를", :pos :Josa, :offset 3, :length 1, :unknown false}
;    {:text "처리", :pos :Noun, :offset 5, :length 2, :unknown false}
;    {:text "하는", :pos :Verb, :offset 7, :length 2, :unknown false}
;    {:text "예시", :pos :Noun, :offset 10, :length 2, :unknown false}
;    {:text "입니", :pos :Adjective, :offset 12, :length 2, :unknown false}
;    {:text "닼", :pos :Noun, :offset 14, :length 1, :unknown true}
;    {:text "ㅋㅋ", :pos :KoreanParticle, :offset 15, :length 2, :unknown false}]
;   nil
  • with normalization
(tokenize "한국어를 처리하는 예시입니닼ㅋㅋ" :norm true)
;=> [{:text "한국어", :pos :Noun, :offset 0, :length 3, :unknown false}
;    {:text "를", :pos :Josa, :offset 3, :length 1, :unknown false}
;    {:text "처리", :pos :Noun, :offset 5, :length 2, :unknown false}
;    {:text "하는", :pos :Verb, :offset 7, :length 2, :unknown false}
;    {:text "예시", :pos :Noun, :offset 10, :length 2, :unknown false}
;    {:text "입니다", :pos :Adjective, :offset 12, :length 3, :unknown false}
;    {:text "ㅋㅋ", :pos :KoreanParticle, :offset 15, :length 2, :unknown false}]
;   nil
  • with normalization & stemming
(tokenize "한국어를 처리하는 예시입니닼ㅋㅋ" :norm true :stem true)
;=> [{:text "한국어", :pos :Noun, :offset 0, :length 3, :unknown false}
;    {:text "를", :pos :Josa, :offset 3, :length 1, :unknown false}
;    {:text "처리", :pos :Noun, :offset 5, :length 2, :unknown false}
;    {:text "하다", :pos :Verb, :offset 7, :length 2, :unknown false}
;    {:text "예시", :pos :Noun, :offset 10, :length 2, :unknown false}
;    {:text "이다", :pos :Adjective, :offset 12, :length 3, :unknown false}
;    {:text "ㅋㅋ", :pos :KoreanParticle, :offset 15, :length 2, :unknown false}]
;   nil
  • as-strs (return texts only)
(tokenize "한국어를 처리하는 예시입니닼ㅋㅋ" :as-strs true)
;=> ["한국어" "를" "처리" "하는" "예시" "입니" "닼" "ㅋㅋ"]

(tokenize "한국어를 처리하는 예시입니닼ㅋㅋ" :as-strs true :norm true)
;=> ["한국어" "를" "처리" "하는" "예시" "입니다" "ㅋㅋ"]

(tokenize "한국어를 처리하는 예시입니닼ㅋㅋ" :as-strs true :norm true :stem true)
;=> ["한국어" "를" "처리" "하다" "예시" "이다" "ㅋㅋ"]

tokenize-top-n

(tokenize-top-n "대선 후보" 3)
;=> ([[{:text "대선", :pos :Noun, :offset 0, :length 2, :unknown false}]
;     [{:text "대", :pos :Modifier, :offset 0, :length 1, :unknown false}
;      {:text "선", :pos :Noun, :offset 1, :length 1, :unknown false}]
;     [{:text "대", :pos :Verb, :offset 0, :length 1, :unknown false}
;      {:text "선", :pos :Noun, :offset 1, :length 1, :unknown false}]]
;    [[{:text "후보", :pos :Noun, :offset 3, :length 2, :unknown false}]
;     [{:text "후보", :pos :Noun, :offset 3, :length 2, :unknown true}]
;     [{:text "후", :pos :Noun, :offset 3, :length 1, :unknown false}
;      {:text "보", :pos :Verb, :offset 4, :length 1, :unknown false}]])
;   nil

detokenize

(detokenize ["연세", "대학교", "보건", "대학원","","오신","","","환영","합니다", "!"])
;=> "연세대학교 보건 대학원에 오신것을 환영합니다!"

extract-phrases

(extract-phrases "한국어를 처리하는 예시입니다 ㅋㅋ")
;=> [{:text "한국어", :offset 0, :length 3}
;    {:text "처리", :offset 5, :length 2}
;    {:text "처리하는 예시", :offset 5, :length 7}
;    {:text "예시", :offset 10, :length 2}]
;   nil

(extract-phrases "한국어를 처리하는 예시입니다 ㅋㅋ" :as-strs true)
;=> ["한국어" "처리" "처리하는 예시" "예시"]

split-sentences

(split-sentences "가을이다! 남자는 가을을 탄다...... 그렇지? 루루야! 버버리코트 사러 가자!!!!")
;=> [{:text "가을이다!", :start 0, :end 5}
;    {:text "남자는 가을을 탄다......", :start 6, :end 22}
;    {:text "그렇지?", :start 23, :end 27}
;    {:text "루루야!", :start 28, :end 32}
;    {:text "버버리코트 사러 가자!!!!", :start 33, :end 48}]
;   nil

add-nouns-to-dictionary

(tokenize "불방망이")
;=> [{:text "불", :pos :Noun, :offset 0, :length 1, :unknown false} 
;    {:text "방망이", :pos :Noun, :offset 1, :length 3, :unknown false}]

(add-nouns-to-dictionary ["불방망이"])
;=> nil

(tokenize "불방망이")
;=> [{:text "불방망이", :pos :Noun, :offset 0, :length 4, :unknown false}]

add-words-to-dictionary

  • added in 0.2.3
(tokenize "그라믄")
;=> [{:text "그", :pos :Noun, :offset 0, :length 1, :unknown false} {:text "라", :pos :Josa, :offset 1, :length 1, :unknown false} {:text "믄", :pos :Modifier, :offset 2, :length 1, :unknown false}]

(add-words-to-dictionary KoreanPosJava/Conjunction ["그라믄"])
;=> nil

(tokenize "그라믄")
;=> [{:text "그라믄", :pos :Conjunction, :offset 0, :length 3, :unknown false}]

License

Copyright © 2018 Seonho Kim

Distributed under the Eclipse Public License either version 1.0 or any later version.