In [20]:
import csv
import os
import pandas as pd 

In [10]:
class BuildCustomData:
    """Rule-based sentence normalizer and PHONE/O token labeller.

    The pipeline used by ``get_items`` / ``input_sentence_processing`` is:
    ``sentence_process`` (split letters/digits/symbols and glue numeric
    fragments together) -> ``phone_process_sentence`` (separate two numbers
    sharing one token) -> ``get_label_sentence`` (one label per token) ->
    ``clean_phone`` (trim junk around tokens labelled ``PHONE``).
    """

    def __init__(self):
        # Every comparison against this value is strict (">"), so a token
        # needs MORE than this many digits/characters to count as a phone.
        self.MIN_LENGTH_PHONE = 8

    def contains_letters(self, word):
        """Return True if at least one character of ``word`` is alphabetic."""
        return any(char.isalpha() for char in word)

    def contains_numbers(self, word):
        """Return True if at least one character of ``word`` is a digit."""
        return any(char.isdigit() for char in word)

    def count_numbers(self, word):
        """Return how many characters of ``word`` are digits."""
        return sum(1 for char in word if char.isdigit())

    def word_process(self, word):
        """Insert spaces at letter/digit/symbol boundaries inside ``word``.

        Tokens made of digits and symbols only (no letters) are returned
        unchanged apart from stripping. For every other token the characters
        are scanned left to right and a space is inserted whenever the
        character class changes; the punctuation marks ``: , ، .`` are dropped
        when they follow letters or other symbols.

        Returns the rewritten token with surrounding whitespace stripped.
        """
        # Digits mixed only with symbols: leave as-is so phone fragments stay intact.
        if not self.contains_letters(word) and self.contains_numbers(word):
            return word.strip()

        dropped_punctuation = [":", ",", "،", "."]

        if word != "" and word[0].isdigit():
            last_type = "digit"
        elif word != "" and word[0].isalpha():
            last_type = "alpha"
        else:
            last_type = "other"

        new_word = ""
        for char in word:
            if (char.isdigit() and last_type == "digit") or (char.isalpha() and last_type == "alpha"):
                # Same class as the previous character: extend the current run.
                new_word += char
            elif char.isdigit() and last_type != "digit":
                new_word += " " + char
                last_type = "digit"
            elif char.isalpha() and last_type != "alpha":
                new_word += " " + char
                last_type = "alpha"
            elif last_type == "other" and (char not in dropped_punctuation):
                new_word += char
            elif last_type == "alpha" and (char not in dropped_punctuation):
                new_word += " " + char
                last_type = "other"
            elif last_type == "digit":
                # A symbol right after digits is kept attached, punctuation included.
                new_word += char
                last_type = "other"
        return new_word.strip()

    def sentence_process(self, sentence):
        """Normalize ``sentence`` and merge adjacent numeric fragments.

        Each space-separated word goes through ``word_process``. When the
        sentence is rebuilt, a token without letters is glued (no space) to
        the previous token unless that previous token contains letters, or
        both tokens already hold more than ``MIN_LENGTH_PHONE`` digits.

        Returns the rebuilt sentence, stripped.
        """
        new_words = []
        for word in sentence.split(' '):
            if word != "":
                new_word = self.word_process(word)
                if new_word != '':
                    new_words.extend(new_word.split(' '))

        new_sentence = ""
        for word in new_words:
            if len(new_sentence) == 0:
                new_sentence += word
                continue

            if self.contains_letters(word):
                new_sentence += " " + word
                continue

            # word is a number or symbols such as + ( ) - . ,
            previous = new_sentence.split(' ')[-1]
            if self.contains_letters(previous):
                new_sentence += " " + word
            elif (self.count_numbers(previous) > self.MIN_LENGTH_PHONE
                  and self.count_numbers(word) > self.MIN_LENGTH_PHONE):
                # Two tokens that are each long enough stay separate numbers.
                new_sentence += " " + word
            else:
                new_sentence += word

        return new_sentence.strip()

    def phone_process_sentence(self, sentence):
        """Split tokens that hold several phone numbers joined by one separator.

        For each token containing digits and longer than ``MIN_LENGTH_PHONE``,
        every non-digit character is tried as a separator. A separator
        qualifies when it cuts the token into at least two non-empty parts
        that each hold more than ``MIN_LENGTH_PHONE`` digits. The qualifying
        separator whose shortest part is longest is surrounded with spaces.

        Returns the rewritten sentence, stripped.
        """
        new_words = []
        for word in sentence.split(' '):
            new_word = word
            if self.contains_numbers(word) and len(word) > self.MIN_LENGTH_PHONE:
                # First-occurrence order (not set order) so that ties between
                # separators are resolved the same way on every run.
                separators = list(dict.fromkeys(char for char in word if not char.isdigit()))
                separators_min_len_elems = {}
                for sep in separators:
                    parts = [elem for elem in word.split(sep) if elem.strip() != ""]
                    digit_counts = [self.count_numbers(elem) for elem in parts]
                    if len(digit_counts) > 1 and all(n > self.MIN_LENGTH_PHONE for n in digit_counts):
                        separators_min_len_elems[sep] = min(digit_counts)

                if len(separators_min_len_elems) > 0:
                    best_sep = max(separators_min_len_elems, key=separators_min_len_elems.get)
                    new_word = f" {best_sep} ".join(
                        [elem for elem in word.split(best_sep) if elem.strip() != ""]
                    )

            new_words.append(new_word)
        return " ".join(new_words).strip()

    def get_label_sentence(self, sentence):
        """Return one label (``'PHONE'`` or ``'O'``) per space-separated token.

        A token is ``'PHONE'`` when it has digits and no letters, holds more
        than ``MIN_LENGTH_PHONE`` digits, does not contain exactly two ``/``
        (date-like), and uses more than two distinct digits.
        """
        labels = []
        for word in sentence.split(' '):
            label = 'O'
            if self.contains_numbers(word) and not self.contains_letters(word):
                if self.count_numbers(word) > self.MIN_LENGTH_PHONE:
                    distinct_digits = {c for c in word if c.isdigit()}
                    # not a date & more than 2 different digits
                    if word.count('/') != 2 and len(distinct_digits) > 2:
                        label = 'PHONE'
            labels.append(label)
        return labels

    def clean_phone(self, phone_number):
        """Trim leading/trailing junk around a phone-number token.

        The result starts at the first character that is a digit or one of
        ``+ ( -`` and ends right after the last character that is a digit or
        ``)``. When one of the two bounds is not found its index stays ``-1``
        and the plain slice ``phone_number[start:end]`` is still returned.
        """
        start_index = next(
            (i for i, char in enumerate(phone_number)
             if char.isdigit() or char in ["+", "(", "-"]),
            -1,
        )
        end_index = next(
            (i + 1 for i in range(len(phone_number) - 1, -1, -1)
             if phone_number[i].isdigit() or phone_number[i] == ")"),
            -1,
        )
        return phone_number[start_index:end_index]

    def get_items(self, row, column_sentence_name = "phrases"):
        """Turn one input row into a list of ``{sentence_id, word, label}`` dicts.

        ``row`` must support ``row['id']`` and ``row[column_sentence_name]``.
        Tokens labelled ``PHONE`` are passed through ``clean_phone``.
        """
        sentence_id = row['id']
        sentence = row[column_sentence_name]
        process_sentence = self.sentence_process(sentence)
        process_sentence = self.phone_process_sentence(process_sentence)

        # Tokenize once and label exactly those tokens. Labelling the raw
        # string (split on ' ') while zipping against split() misaligned the
        # two lists whenever a whitespace-only token (tab, newline) survived.
        sentence_split = process_sentence.split()
        label_sentence = self.get_label_sentence(" ".join(sentence_split))

        items = []
        for word, label in zip(sentence_split, label_sentence):
            if label == "PHONE":
                word = self.clean_phone(word)
            items.append({
                'sentence_id': sentence_id,
                'word': word.strip(),
                'label': label
            })

        return items

    def filter_items(self, items):
        """Keep only items whose ``word`` has at least one digit or letter."""
        return [item for item in items if self.contains_numbers(item["word"]) or self.contains_letters(item["word"])]

    def update_label(self, file_name, new_label, row_index):
        """Overwrite the ``label`` of row ``row_index`` in the CSV ``file_name``.

        The file is read with pandas, modified and written back in place.
        When ``new_label`` is ``"PHONE"`` the row's ``word`` is also passed
        through ``clean_phone``. An out-of-range ``row_index`` prints a
        message and leaves the file untouched.
        """
        df = pd.read_csv(file_name)
        if row_index < 0 or row_index >= len(df):
            print("L'indice de ligne est invalide.")
            # Stop here: assigning through .loc with an unknown index would
            # append a new row and rewrite the file with it.
            return
        df.loc[row_index, 'label'] = new_label
        if new_label == "PHONE":
            word = df.loc[row_index, 'word']
            if isinstance(word, str):  # skip missing (NaN) cells
                df.loc[row_index, 'word'] = self.clean_phone(word)
        df.to_csv(file_name, index=False)
        print("La valeur de 'label' a été mise à jour avec succès.")

    def input_sentence_processing(self, sentence):
        """Normalize a raw sentence into a single cleaned string.

        Runs the same normalization as ``get_items``, applies ``clean_phone``
        to tokens holding more than ``MIN_LENGTH_PHONE`` digits, and drops
        tokens that contain neither digits nor letters.
        """
        process_sentence = self.sentence_process(sentence)
        process_sentence = self.phone_process_sentence(process_sentence)
        new_words = []
        for word in process_sentence.split():
            if self.count_numbers(word) > self.MIN_LENGTH_PHONE:
                word = self.clean_phone(word)
            if self.contains_numbers(word) or self.contains_letters(word):
                new_words.append(word)

        return " ".join(new_words)
        

In [11]:
# Hand-written sample sentences (Arabic and French) containing phone-like
# numbers in awkward formats; used by the walkthrough cells that follow.
sentences = [
    "للمزيد من المعلومات، الفاكس 356351257/363813833",
    "اتصل بنا على الرقم +212 5 44 55 66 77",
    "رقم الهاتف : +33 1 23 .45 67 89",
    "Pour contacter notre service clientèle, veuillez composer le 07 00-51 45 27 , hg yt",
    "Tele0566 44 33 22 ,",
    "Pour des conseils, appelez au)+ ( 226) 33-44-55-66-",
    "للحصول على مساعدة، اتصل بـ(069) 01 23 45."
]

In [12]:
# Create the processor and select the sample at index 5 for the step-by-step cells.
buildCustomData = BuildCustomData()
sentence = sentences[5]

In [13]:
# Step 1: normalize the sample sentence; the bare `res` displays the result.
res = buildCustomData.sentence_process(sentence)
res

'Pour des conseils appelez au )+(226)33-44-55-66-'

In [14]:
# Standalone check of clean_phone on a token wrapped in junk characters:
# print the raw token, then the trimmed one.
phone = "#a)^)+(226)33-44-55-66-'*"
print(phone)
print(buildCustomData.clean_phone(phone))

#a)^)+(226)33-44-55-66-'*
+(226)33-44-55-66


In [15]:
# Step 2: feed the previous result back in; `res` is rebound, so this cell
# depends on the sentence_process cell having been run just before it.
res = buildCustomData.phone_process_sentence(res)
res

'Pour des conseils appelez au )+(226)33-44-55-66-'

In [16]:
# Step 3: label each token. NOTE: `res` changes from a string to a list of
# labels here, so re-running this cell alone would pass a list to the method.
res = buildCustomData.get_label_sentence(res)
res

['O', 'O', 'O', 'O', 'O', 'PHONE']

In [17]:
# Run the full pipeline on the sample through a dict that mimics a DataFrame
# row (keys "id" and "phrases").
# NOTE(review): `text` is not read by any cell visible in this notebook.
text=""
d ={"id":1,"phrases":sentence}
items = buildCustomData.get_items(d)
items

[{'sentence_id': 1, 'word': 'Pour', 'label': 'O'},
 {'sentence_id': 1, 'word': 'des', 'label': 'O'},
 {'sentence_id': 1, 'word': 'conseils', 'label': 'O'},
 {'sentence_id': 1, 'word': 'appelez', 'label': 'O'},
 {'sentence_id': 1, 'word': 'au', 'label': 'O'},
 {'sentence_id': 1, 'word': '+(226)33-44-55-66', 'label': 'PHONE'}]

In [18]:
# Drop items whose word has neither a digit nor a letter (result is only displayed).
buildCustomData.filter_items(items)

[{'sentence_id': 1, 'word': 'Pour', 'label': 'O'},
 {'sentence_id': 1, 'word': 'des', 'label': 'O'},
 {'sentence_id': 1, 'word': 'conseils', 'label': 'O'},
 {'sentence_id': 1, 'word': 'appelez', 'label': 'O'},
 {'sentence_id': 1, 'word': 'au', 'label': 'O'},
 {'sentence_id': 1, 'word': '+(226)33-44-55-66', 'label': 'PHONE'}]

In [19]:
# Same pipeline, but returning one cleaned string instead of labelled items.
buildCustomData.input_sentence_processing(sentence)

'Pour des conseils appelez au +(226)33-44-55-66'

In [73]:
# Preview the items produced for the first row only (the loop breaks after one pass).
# NOTE(review): `df` is not defined above this cell; it is loaded by a
# pd.read_csv cell further down, so this fails on a fresh Restart & Run All.
for idx, row in df.iterrows():
    items = buildCustomData.get_items(row)
    print(items)
    break

[{'sentence_id': 1, 'word': 'رقم', 'label': 'O'}, {'sentence_id': 1, 'word': 'الهاتف', 'label': 'O'}, {'sentence_id': 1, 'word': '0522270033', 'label': 'PHONE'}]


In [75]:
class ItemStorage:
    """Append-only CSV writer for ``{sentence_id, word, label}`` records.

    The target file is ``<file_path>.csv``, opened in append mode with the
    ``utf-8-sig`` encoding. A header row is written only when the file is
    empty at open time. The handle stays open until ``close_file`` is called
    or the instance is used as a context manager::

        with ItemStorage("items") as storage:
            storage.insert_items(items)
    """

    def __init__(self, file_path):
        self.file_path = f"{file_path}.csv"
        self.fieldnames = ["sentence_id", "word", "label"]
        self.file = open(self.file_path, 'a', newline='', encoding='utf-8-sig')
        try:
            self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames)

            # Position 0 right after opening in append mode means the file is
            # empty, so it still needs its header row.
            if self.file.tell() == 0:
                self.writer.writeheader()
        except Exception:
            # Do not leak the handle if setup fails half-way.
            self.file.close()
            raise

    def __enter__(self):
        """Return the storage itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on exit; exceptions from the block are not suppressed."""
        self.close_file()
        return False

    def insert_item(self, item):
        """Append one record; keys of ``item`` outside ``fieldnames`` are ignored.

        Raises ``KeyError`` if ``item`` lacks one of the expected keys.
        """
        data = {name: item[name] for name in self.fieldnames}
        self.writer.writerow(data)

    def insert_items(self, items):
        """Append every record of ``items`` in order."""
        for item in items:
            self.insert_item(item)

    def close_file(self):
        """Flush and close the underlying file; safe to call more than once."""
        if not self.file.closed:
            self.file.close()

In [21]:
######################################################## main ###########################################################
#########################################################################################################################

# Label every sentence of `df` and append the resulting items to items_3.csv.
# NOTE(review): `df` must already be loaded with "id" and "phrases" columns;
# no cell above this one defines it.
itemStorage = ItemStorage("items_3")
buildCustomData = BuildCustomData()

try:
    for idx, row in df.iterrows():
        items = buildCustomData.get_items(row)
        items = buildCustomData.filter_items(items)
        itemStorage.insert_items(items)
finally:
    # Release the CSV handle even if a row fails, so buffered rows are
    # flushed and the file is not left open in the kernel.
    itemStorage.close_file()

In [62]:
# Load the web-scraped sentences (columns shown in the output: id, sentences).
df = pd.read_csv("sentences_from_web_2.csv")
df

Unnamed: 0,id,sentences
0,1,Nous Contacter - La Vie éco Accueil Au Royaume...
1,2,هيأة التحرير – كيفاش آخر ما كاينديرها في بالكر...
2,3,Contact - L'Observateur Contact - L'observateu...
3,4,للإشهار – كيفاش آخر ما كاينديرها في بالكرياضةم...
4,5,Website feedback | HIT RADIO Skip to main cont...
...,...,...
256,257,East Bay Times account password? Reset it here...
257,258,is published. Submit a letter to the Mercury N...
258,259,– Digital 408-271-3747 rkeith@bayareanewsgroup...
259,260,Judith Contra Costa County editor 925-779-7178...


In [63]:
# Label every web sentence (text column "sentences") and append the items to
# items_sentences_from_web_2_1.csv.
itemStorage = ItemStorage("items_sentences_from_web_2_1")
buildCustomData = BuildCustomData()

try:
    for idx, row in df.iterrows():
        items = buildCustomData.get_items(row, column_sentence_name="sentences")
        items = buildCustomData.filter_items(items)
        itemStorage.insert_items(items)
finally:
    # Close the file even when a row raises, so nothing stays buffered or locked.
    itemStorage.close_file()

In [70]:
# Manually collected row numbers whose label must be corrected in the items CSV.
# The cells that consume these lists subtract 1 before indexing, so the
# numbers are presumably 1-based positions — TODO confirm how they were read off.
to_O = [1851, 1858, 1865, 1872, 2188, 5392, 5533, 8242, 12723, 12726, 19551,
        22812, 26106, 31070, 32751, 36871, 37873, 38202, 38401, 38419, 38433,
        38451, 38475, 38499, 38523, 38543, 38572, 38580, 38594, 38614, 38639, 38661, 38690,
        39224, 39819, 46327, 46612, 49296, 50708, 51175, 51678, 53970, 58263, 59551]

to_PHONE = [98]

In [None]:
# NOTE(review): never executed (no execution count) and not read by any
# visible cell; holds the same value as to_PHONE.
prob = [98, ]
        

In [None]:
# Reset the listed rows to label "O". Each update_label call re-reads and
# rewrites the whole CSV, so this loop does one full file round-trip per index.
buildCustomData = BuildCustomData()
for index in to_O:
    buildCustomData.update_label("items_sentences_from_web_2_1.csv", "O", index -1)

In [71]:
# Promote the listed rows to "PHONE"; update_label also trims the stored word
# with clean_phone for this label.
for index in to_PHONE:
    buildCustomData.update_label("items_sentences_from_web_2_1.csv", "PHONE", index -1)

La valeur de 'label' a été mise à jour avec succès.


In [76]:
# Load the second dataset (columns shown in the output: id, phrases).
# NOTE: this rebinds `df`, replacing the web-sentences frame loaded earlier.
df = pd.read_csv("sentences.csv")
df

Unnamed: 0,id,phrases
0,1,رقم الهاتف: 0522270033
1,2,للاتصال بنا هاتفياً، يمكنكم الاتصال على الرقم:...
2,3,Tél : 0522221859 - Email :contact@geomedia.ma
3,4,Appelez nous au 0522221883 pour plus d’informa...
4,5,"Pour de l’assistance, appelez-nous au 0522213322"
...,...,...
995,996,"Pour obtenir des détails, appelez le (+212)706..."
996,997,إذا كانت لديك أسئلة، اتصل على (+212)707890123
997,998,"Pour toute assistance, appelez le (+212)708901234"
998,999,للحصول على دعم سريع، اتصل على (+212)709012345


In [77]:
# Append the items of the second dataset to the same CSV as the web sentences,
# renumbering sentences so their ids continue after the rows already stored.
itemStorage = ItemStorage("items_sentences_from_web_2_1")
buildCustomData = BuildCustomData()

# NOTE(review): hard-coded starting id — presumably one past the last
# sentence_id already in the file; confirm before re-running.
sentence_id = 262

try:
    for idx, row in df.iterrows():
        items = buildCustomData.get_items(row)
        items = buildCustomData.filter_items(items)
        for item in items :
            item['sentence_id'] = sentence_id
            itemStorage.insert_item(item)
        # The id advances once per input row, even when no item was kept.
        sentence_id += 1
finally:
    # Close the file even when a row raises, so nothing stays buffered or locked.
    itemStorage.close_file()

In [21]:
# Reload the combined items file for inspection (rebinds `df` once more).
df = pd.read_csv("items_sentences_from_web_2_1.csv")
df

Unnamed: 0,sentence_id,word,label
0,1,Nous,O
1,1,Contacter,O
2,1,La,O
3,1,Vie,O
4,1,éco,O
...,...,...,...
71188,1261,un,O
71189,1261,conseiller,O
71190,1261,appelez,O
71191,1261,le,O


In [25]:
# Class balance of the final dataset: number of tokens per label.
df["label"].value_counts()

label
O        69700
PHONE     1493
Name: count, dtype: int64