Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support persian characters #4

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions cases_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package garabic

//removeHarakatTestCases contains all test cases for TestRemoveHarakat function
// removeHarakatTestCases contains all test cases for TestRemoveHarakat function
var removeHarakatTestCases = []struct {
description string
input string
Expand Down Expand Up @@ -34,7 +34,7 @@ var removeHarakatTestCases = []struct {
},
}

//normalizeTestCases contains all test cases for TestNormalize function
// normalizeTestCases contains all test cases for TestNormalize function
var normalizeTestCases = []struct {
description string
input string
Expand Down Expand Up @@ -97,7 +97,7 @@ var normalizeTestCases = []struct {
},
}

//spellNumberTestCases contains all test cases for reading a number in arabic
// spellNumberTestCases contains all test cases for reading a number in arabic
var spellNumberTestCases = []struct {
input int
expected string
Expand Down Expand Up @@ -208,7 +208,7 @@ var spellNumberTestCases = []struct {
},
}

//tashkeelTestCases contains all test cases for adding tashkeel to arabic text
// tashkeelTestCases contains all test cases for adding tashkeel to arabic text
var tashkeelTestCases = []struct {
description string
input string
Expand All @@ -226,12 +226,17 @@ var tashkeelTestCases = []struct {
},
}

//shapingTestCases contains all test cases for shaping arabic text
// shapingTestCases contains all test cases for shaping arabic text
var shapingTestCases = []struct {
description string
input string
expected string
}{
{
"Shaping lam alef",
"السلام علیکم",
"ﻢﮑﻴﻠﻋ ﻡﻼﺴﻟا",
},
{
"Shaping 1 word without tashkeel",
"بالعربي",
Expand All @@ -252,15 +257,14 @@ var shapingTestCases = []struct {
"قِفا نَبكِ مِن ذِكرى حَبيبٍ وَمَنزِلِ **** بِسِقطِ اللِوى بَينَ الدَخولِ فَحَومَلِ",
"ِﻞَﻣﻮَﺤَﻓ ِﻝﻮﺧَﺪﻟا َﻦﻴَﺑ ﻯﻮِﻠﻟا ِﻂﻘِﺴِﺑ **** ِﻝِﺰﻨَﻣَو ٍﺐﻴﺒَﺣ ﻯﺮﻛِذ ﻦِﻣ ِﻚﺒَﻧ ﺎﻔِﻗ",
},

{
"Shaping 1 word without tashkeel",
"المصفوفة (Multidimentional Array) هي",
"ﻲﻫ (Multidimentional Array) ﺔﻓﻮﻔﺼﻤﻟا",
},
}

//arabicLetterTestCases
// arabicLetterTestCases
var arabicLetterTestCases = []struct {
description string
input rune
Expand Down
107 changes: 83 additions & 24 deletions garabic.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//Package garabic provides a set of functions for Arabic text processing in golang
// Package garabic provides a set of functions for Arabic text processing in golang
package garabic

import (
Expand All @@ -11,20 +11,20 @@ import (
"golang.org/x/text/transform"
)

//letterGroup represents the letter and bounding letters
// letterGroup bundles one letter together with its two bounding runes.
// The fields are, in order: the bounding back letter, the letter itself and
// the bounding front letter.
type letterGroup struct {
	backLetter, letter, frontLetter rune
}

//letterShape represents all shapes of arabic letters in a word
// letterShape lists every shape an arabic letter can take inside a word.
// Reference: https://web.stanford.edu/dept/lc/arabic/alphabet/incontextletters.html
type letterShape struct {
	Independent rune // shape named "independent" in the reference table
	Initial     rune // shape named "initial" in the reference table
	Medial      rune // shape named "medial" in the reference table
	Final       rune // shape named "final" in the reference table
}

//Map of different shapes of arabic alphabet
// Map of different shapes of arabic alphabet
var arabicAlphabetShapes = map[rune]letterShape{
// Letter (ﺃ)
'\u0623': {Independent: '\uFE83', Initial: '\u0623', Medial: '\uFE84', Final: '\uFE84'},
Expand All @@ -42,6 +42,8 @@ var arabicAlphabetShapes = map[rune]letterShape{
'\u0626': {Independent: '\uFE89', Initial: '\uFE8B', Medial: '\uFE8C', Final: '\uFE8A'},
// Letter (ﺏ)
'\u0628': {Independent: '\uFE8F', Initial: '\uFE91', Medial: '\uFE92', Final: '\uFE90'},
// Letter (پ)
'\u067e': {Independent: '\uFB56', Initial: '\uFB58', Medial: '\uFB59', Final: '\uFB57'},
// Letter (ﺕ)
'\u062A': {Independent: '\uFE95', Initial: '\uFE97', Medial: '\uFE98', Final: '\uFE96'},
// Letter (ﺓ)
Expand All @@ -50,6 +52,8 @@ var arabicAlphabetShapes = map[rune]letterShape{
'\u062B': {Independent: '\uFE99', Initial: '\uFE9B', Medial: '\uFE9C', Final: '\uFE9A'},
// Letter (ﺝ)
'\u062C': {Independent: '\uFE9D', Initial: '\uFE9F', Medial: '\uFEA0', Final: '\uFE9E'},
// Letter (چ)
'\u0686': {Independent: '\uFB7A', Initial: '\uFB7C', Medial: '\uFB7D', Final: '\uFB7B'},
// Letter (ﺡ)
'\u062D': {Independent: '\uFEA1', Initial: '\uFEA3', Medial: '\uFEA4', Final: '\uFEA2'},
// Letter (ﺥ)
Expand All @@ -62,6 +66,8 @@ var arabicAlphabetShapes = map[rune]letterShape{
'\u0631': {Independent: '\uFEAD', Initial: '\u0631', Medial: '\uFEAE', Final: '\uFEAE'},
// Letter (ﺯ)
'\u0632': {Independent: '\uFEAF', Initial: '\u0632', Medial: '\uFEB0', Final: '\uFEB0'},
// Letter (ژ)
'\u0698': {Independent: '\uFB8A', Initial: '\uFB8A', Medial: '\uFB8B', Final: '\uFB8B'},
// Letter (ﺱ)
'\u0633': {Independent: '\uFEB1', Initial: '\uFEB3', Medial: '\uFEB4', Final: '\uFEB2'},
// Letter (ﺵ)
Expand All @@ -84,6 +90,10 @@ var arabicAlphabetShapes = map[rune]letterShape{
'\u0642': {Independent: '\uFED5', Initial: '\uFED7', Medial: '\uFED8', Final: '\uFED6'},
// Letter (ﻙ)
'\u0643': {Independent: '\uFED9', Initial: '\uFEDB', Medial: '\uFEDC', Final: '\uFEDA'},
// Letter (ک)
'\u06a9': {Independent: '\ufb8e', Initial: '\ufb90', Medial: '\ufb91', Final: '\ufb8f'},
// Letter (گ)
'\u06AF': {Independent: '\ufb92', Initial: '\ufb94', Medial: '\ufb95', Final: '\ufb93'},
// Letter (ﻝ)
'\u0644': {Independent: '\uFEDD', Initial: '\uFEDF', Medial: '\uFEE0', Final: '\uFEDE'},
// Letter (ﻡ)
Expand All @@ -98,6 +108,8 @@ var arabicAlphabetShapes = map[rune]letterShape{
'\u064A': {Independent: '\uFEF1', Initial: '\uFEF3', Medial: '\uFEF4', Final: '\uFEF2'},
// Letter (ﻯ)
'\u0649': {Independent: '\uFEEF', Initial: '\u0649', Medial: '\uFEF0', Final: '\uFEF0'},
// Letter (ی)
'\u06cc': {Independent: '\ufeef', Initial: '\ufef3', Medial: '\ufef4', Final: '\ufef0'},
// Letter (ـ)
'\u0640': {Independent: '\u0640', Initial: '\u0640', Medial: '\u0640', Final: '\u0640'},
// Letter (ﻻ)
Expand Down Expand Up @@ -151,6 +163,8 @@ const (
AlefHamzaBelow = '\u0625'
//Yae => ي
Yae = '\u064A'
//Yeh => ی
Yeh = '\u06cc'
//DotlessYae => ى
DotlessYae = '\u0649'
//TehMarbuta => ة
Expand All @@ -161,7 +175,7 @@ const (
AlefWaslah = '\u0671'
)

//Number groups in Arabic
// Number groups in Arabic
var _zeroToNine = []string{
"صفر", "واحد", "اثنان", "ثلاثة", "أربعة",
"خمسة", "ستة", "سبعة", "ثمانية", "تسعة",
Expand All @@ -183,7 +197,7 @@ var _scaleNumbers = []string{
"", "ألف", "مليون", "مليار",
}

//RemoveHarakat will remove harakat from arabic text
// RemoveHarakat will remove harakat from arabic text
func RemoveHarakat(input string) string {
input = normalizeTransform(input)
runes := bytes.Runes([]byte(input))
Expand All @@ -198,7 +212,7 @@ func RemoveHarakat(input string) string {
return string(runes)
}

//Normalize will prepare an arabic text to search and index
// Normalize will prepare an arabic text to search and index
func Normalize(input string) string {
input = normalizeTransform(input)
runes := bytes.Runes([]byte(input))
Expand Down Expand Up @@ -226,7 +240,7 @@ func normalizeTransform(input string) string {
return input
}

//deleteRune will delete a rune from the slice while keeping the order of runes
// deleteRune will delete a rune from the slice while keeping the order of runes
func deleteRune(runes []rune, i int) []rune {
if i >= len(runes) {
return runes
Expand Down Expand Up @@ -291,7 +305,6 @@ func SpellNumber(input int) string {
}
case 1:
stringOfNum = append(stringOfNum, _elevenToNineteen[zeros])
break
default:
if zeros > 0 {
word := fmt.Sprintf("و %s و %s", _zeroToNine[zeros], _tens[tens])
Expand All @@ -303,7 +316,6 @@ func SpellNumber(input int) string {
stringOfNum = append(stringOfNum, _tens[tens])
}
}
break
}

// Scale position
Expand Down Expand Up @@ -340,7 +352,7 @@ func contains(s []string, str string) bool {
return false
}

//Shape will reconstruct arabic text to be connected correctly
// Shape will reconstruct arabic text to be connected correctly
func Shape(input string) string {
var langSections []string
var continousLangAr string
Expand All @@ -362,11 +374,11 @@ func Shape(input string) string {
}
}
if len(continousLangLt) > 0 {
fmt.Println(continousLangLt)
// fmt.Println(continousLangLt)
langSections = append(langSections, strings.TrimSpace(continousLangLt))
}
if len(continousLangAr) > 0 {
fmt.Printf("\"%s\"\n", continousLangAr)
// fmt.Printf("\"%s\"\n", continousLangAr)
langSections = append(langSections, strings.TrimSpace(continousLangAr))
}

Expand All @@ -388,7 +400,28 @@ func Shape(input string) string {
return strings.Join(shapedSentence, " ")
}

//shapeWord will reconstruct an arabic word to be connected correctly
// fixLamAlef resolves the lam + alef pair inside a letterGroup.
//
// When group.letter is lam (U+0644) and group.frontLetter is one of the four
// alef variants, the matching lam-alef ligature rune is returned. When
// group.letter is one of those alef variants and group.backLetter is lam, 0 is
// returned (adjustLetter passes that 0 on and shapeWord writes nothing for
// it). In every other case group.letter is returned unchanged.
func fixLamAlef(group letterGroup) rune {
	const lam = '\u0644'

	current := group.letter
	if current == lam {
		switch group.frontLetter {
		case '\u0622': // alef with madda above
			return '\uFEF5'
		case '\u0623': // alef with hamza above
			return '\uFEF7'
		case '\u0625': // alef with hamza below
			return '\uFEF9'
		case '\u0627': // plain alef
			return '\uFEFB'
		}
		// lam followed by anything else stays a plain lam.
		return current
	}

	alefVariant := current == '\u0622' || current == '\u0623' ||
		current == '\u0625' || current == '\u0627'
	if alefVariant && group.backLetter == lam {
		return 0
	}
	return current
}

// shapeWord will reconstruct an arabic word to be connected correctly
func shapeWord(input string) string {
if !IsArabic(input) {
return input
Expand All @@ -398,6 +431,7 @@ func shapeWord(input string) string {

//Convert input into runes
inputRunes := []rune(RemoveHarakat(input))
countIgnored := 0
for i := range inputRunes {
//Get Bounding back and front letters
var backLetter, frontLetter rune
Expand All @@ -410,14 +444,18 @@ func shapeWord(input string) string {
//Fix the letter based on bounding letters
if _, ok := arabicAlphabetShapes[inputRunes[i]]; ok {
adjustedLetter := adjustLetter(letterGroup{backLetter, inputRunes[i], frontLetter})
shapedInput.WriteRune(adjustedLetter)
if adjustedLetter != 0 {
shapedInput.WriteRune(adjustedLetter)
} else {
countIgnored++
}
} else {
shapedInput.WriteRune(inputRunes[i])
}
}

//In case no Tashkeel detected, same size of runes
if len([]rune(shapedInput.String())) == len([]rune(input)) {
if len([]rune(shapedInput.String())) == len([]rune(input))-countIgnored {
return reverse(shapedInput.String())
}

Expand All @@ -439,7 +477,7 @@ func shapeWord(input string) string {

}

//reverse the arabic string for RTL support in rendering
// reverse the arabic string for RTL support in rendering
func reverse(s string) string {
runes := []rune(s)
for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
Expand All @@ -448,8 +486,13 @@ func reverse(s string) string {
return string(runes)
}

//adjustLetter will adjust the arabic letter depending on its position
// adjustLetter will adjust the arabic letter depending on its position
func adjustLetter(g letterGroup) rune {
g.letter = fixLamAlef(g)

if g.letter == 0 {
return 0
}

switch {
//Inbetween 2 letters
Expand All @@ -475,9 +518,9 @@ func adjustLetter(g letterGroup) rune {
}
}

//Check if the letter is always .Initial
// Check if the letter is always .Initial
func isAlwaysInitial(letter rune) bool {
alwaysInitial := [13]rune{'\u0627', '\u0623', '\u0622', '\u0625', '\u0649', '\u0621', '\u0624', '\u0629', '\u062f', '\u0630', '\u0631', '\u0632', '\u0648'}
alwaysInitial := [14]rune{'\u0627', '\u0623', '\u0622', '\u0625', '\u0649', '\u0621', '\u0624', '\u0629', '\u062f', '\u0630', '\u0631', '\u0632', '\u0648', '\u0698'}
for _, item := range alwaysInitial {
if item == letter {
return true
Expand All @@ -486,12 +529,12 @@ func isAlwaysInitial(letter rune) bool {
return false
}

//IsArabicLetter checks if the letter is arabic
// IsArabicLetter reports whether ch falls inside the inclusive code point
// range U+0600..U+06FF.
func IsArabicLetter(ch rune) bool {
	const (
		rangeStart rune = 0x0600
		rangeEnd   rune = 0x06FF
	)
	if ch < rangeStart {
		return false
	}
	return ch <= rangeEnd
}

//IsArabic checks if the input string contains arabic unicode only
// IsArabic checks if the input string contains arabic unicode only
func IsArabic(input string) bool {

var isArabic = true
Expand All @@ -503,7 +546,7 @@ func IsArabic(input string) bool {
return isArabic
}

//ToArabicDigits will convert english numbers to arabic numbers in text
// ToArabicDigits will convert english numbers to arabic numbers in text
func ToArabicDigits(input string) string {
return strings.NewReplacer(
"0", "٠",
Expand All @@ -519,7 +562,23 @@ func ToArabicDigits(input string) string {
).Replace(input)
}

//ToEnglishDigits will convert arabic numbers to english numbers in text
// ToPersianDigits will convert english numbers to persian numbers in text.
//
// Every ASCII digit '0'..'9' in input is replaced by the corresponding
// Persian digit U+06F0..U+06F9; all other characters are returned untouched.
// The targets are spelled as escapes because most Persian digits look
// identical to the Arabic-Indic digits U+0660..U+0669 used by ToArabicDigits
// while being different code points, so a pasted glyph is easy to get wrong.
func ToPersianDigits(input string) string {
	return strings.NewReplacer(
		"0", "\u06F0", // ۰
		"1", "\u06F1", // ۱
		"2", "\u06F2", // ۲
		"3", "\u06F3", // ۳
		"4", "\u06F4", // ۴
		"5", "\u06F5", // ۵
		"6", "\u06F6", // ۶
		"7", "\u06F7", // ۷
		"8", "\u06F8", // ۸
		"9", "\u06F9", // ۹
	).Replace(input)
}

// ToEnglishDigits will convert arabic numbers to english numbers in text
func ToEnglishDigits(input string) string {
return strings.NewReplacer(
"٠", "0",
Expand Down
Loading