/
helper.py
145 lines (108 loc) · 3.94 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
License
-------
The MIT License (MIT)
Copyright (c) 2017 Tashkel Project
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
Created on Sat Dec 16 22:46:28 2017
@author: Ahmad Barqawi
"""
import os
import glob
import string
import re
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize
#convert using chr(harakat[0])
harakat = [1614,1615,1616,1618,1617,1611,1612,1613]
connector = 1617
def save_binary(data, file, folder):
    """Pickle *data* into ``<folder>/<file>.pickle`` (highest protocol)."""
    path = os.path.join(folder, file + '.pickle')
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
def load_binary(file, folder):
    """Unpickle and return the object stored at ``<folder>/<file>.pickle``."""
    path = os.path.join(folder, file + '.pickle')
    with open(path, 'rb') as handle:
        return pickle.load(handle)
def get_sentences(data):
    """Split raw text into sentences.

    The text is first broken on newlines and on Latin/Arabic commas, then
    each non-empty chunk is passed through NLTK's sentence tokenizer.
    Returns a flat list of non-empty sentences.
    """
    sentences = []
    for chunk in re.split("[\n,،]+", data):
        if not chunk:
            continue
        for sentence in sent_tokenize(chunk.strip()):
            if sentence:
                sentences.append(sentence)
    return sentences
def clear_punctuations(text):
    """Return *text* with every ASCII punctuation character removed."""
    # str.translate does the per-character filtering in one C-level pass.
    return text.translate(str.maketrans("", "", string.punctuation))
def clear_english_and_numbers(text):
    """Replace each Latin letter, ASCII digit, or Arabic-Indic digit with a space."""
    return re.sub(r"[a-zA-Z0-9٠-٩]", " ", text)
def is_tashkel(text):
    """Return True when *text* contains at least one haraka (diacritic) character."""
    for ch in text:
        if ord(ch) in harakat:
            return True
    return False
def clear_tashkel(text):
    """Return *text* with every haraka (diacritic) character stripped out."""
    kept = [ch for ch in text if ord(ch) not in harakat]
    return "".join(kept)
def get_harakat():
    """Return a regex alternation ("x|y|...") matching any single haraka."""
    return "|".join(chr(code) for code in harakat)
def get_taskel(sentence):
    """Extract the haraka (diacritic) string attached to each base letter.

    Walks *sentence* right-to-left, collecting consecutive diacritic
    characters; when a non-diacritic letter is reached, the collected
    diacritics are recorded for it.  A letter with no diacritic gets the
    tatweel placeholder "ـ".  A shadda (the ``connector`` code point) may
    combine with exactly one other haraka.

    Returns a list of per-letter diacritic strings in left-to-right order.
    """
    shadda = chr(connector)
    collected = []  # per-letter harakat, gathered right-to-left
    current_haraka = ""
    for ch in reversed(sentence):
        if ord(ch) in harakat:
            # Accept the diacritic when nothing is collected yet, or when it
            # forms a valid shadda+haraka pair; extra diacritics are dropped.
            # BUG FIX: original used `current_haraka is ""` — identity
            # comparison with a literal is implementation-dependent; use `==`.
            if (current_haraka == "") or \
               (ord(ch) == connector and shadda not in current_haraka) or \
               (current_haraka == shadda):
                current_haraka += ch
        else:
            if current_haraka == "":
                current_haraka = "ـ"  # placeholder: letter carries no haraka
            collected.append(current_haraka)
            current_haraka = ""
    # Reverse once at the end instead of insert(0, ...) per letter (O(n) vs O(n^2)).
    collected.reverse()
    return collected
def combine_text_with_harakat(input_sent, output_sent):
    """Interleave each character of *input_sent* with its predicted haraka.

    input_sent  : undiacritized sentence (one base character per position).
    output_sent : iterable of predicted diacritic strings aligned with
                  input_sent; model placeholder tokens are stripped first.

    Returns the diacritized sentence as a single string.
    """
    harakat_stack = Stack()
    # Align each haraka with its character.  A shadda (connector) predicted
    # on a space is merged into the previous character's haraka instead.
    for character, haraka in zip(input_sent, output_sent):
        haraka = haraka.replace("<UNK>","").replace("<PAD>","").replace("ـ","")
        # BUG FIX: original tested `ord(haraka) == connector`, which raises
        # TypeError whenever the cleaned haraka has length != 1; direct string
        # comparison is equivalent for the single-character case and safe.
        if character == " " and haraka == chr(connector):
            # NOTE(review): assumes a character precedes this space — an empty
            # stack here would raise IndexError; confirm with callers.
            combine = harakat_stack.pop()
            combine += haraka
            harakat_stack.push(combine)
        else:
            harakat_stack.push(haraka)
    # Pad so every input character has a (possibly empty) haraka mate.
    input_length = len(input_sent)
    output_length = harakat_stack.size()
    for _ in range(input_length - output_length):
        harakat_stack.push("")
    # Combine with text; join() avoids quadratic += concatenation.
    pieces = []
    for character, haraka in zip(input_sent, harakat_stack.to_array()):
        pieces.append(character + haraka)
    return "".join(pieces)
class Stack:
    """Minimal LIFO stack backed by a Python list."""
    def __init__(self):
        self.stack = []
    def isEmpty(self):
        """True when the stack holds no items."""
        return not self.stack
    def push(self, item):
        """Place *item* on top of the stack."""
        self.stack.append(item)
    def pop(self):
        """Remove and return the top item (IndexError when empty)."""
        return self.stack.pop()
    def peek(self):
        """Return the top item without removing it, or None when empty."""
        return self.stack[-1] if self.stack else None
    def size(self):
        """Number of items currently stored."""
        return len(self.stack)
    def to_array(self):
        """Expose the backing list, bottom-to-top order."""
        return self.stack