In [77]:
# pip install spacy-langdetect

In [76]:
# pip install streamlit

In [73]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import re
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
import unicodedata
from sklearn.preprocessing import LabelEncoder
import sklearn
import pickle
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import load_model

# Load the stop-word list: the file content is split on whitespace, so every
# whitespace-separated token becomes one stop word.
# The context manager closes the handle as soon as the content is read.
# The encoding is explicit so decoding does not depend on the locale default
# (assumes the list is stored as UTF-8 -- confirm against the data file).
with open("/content/list.txt", encoding="utf-8") as file1:
    line = file1.read()
stop_words = line.split()

#Text Cleaning Final
def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace.

    Underscores count as word characters for ``\\w`` and are therefore kept.
    """
    non_word_run = re.compile(r'[^\w\s]+')
    return non_word_run.sub('', text)

def remove_english_and_numbers(text):
    """Delete ASCII letters, digits and literal ``+`` signs from ``text``.

    The ``+`` inside the character class is matched literally, so plus signs
    are removed together with the letters and digits.
    """
    latin_or_digit_run = re.compile(r'[a-zA-Z\d+]+')
    return latin_or_digit_run.sub('', text)

def remove_escape_characters(text):
    """Remove newline characters and underscores from ``text``.

    Both are deleted outright (not replaced by a space), so the text on
    either side of a removed character is joined together.
    """
    for pattern in (r'\n', r'\_'):
        text = re.sub(pattern, '', text)
    return text

# spaCy pipeline with the language detector attached. It is built on first
# use and then shared by later calls, because loading a model and attaching
# the detector for every input is needlessly expensive.
_LANGUAGE_PIPELINE = None


def _get_language_pipeline():
    """Return the shared language-detection pipeline, building it on first use."""
    global _LANGUAGE_PIPELINE
    if _LANGUAGE_PIPELINE is None:
        # NOTE(review): the 'en' shortcut and passing a component instance to
        # add_pipe are spaCy v2-style calls; spaCy v3 expects a full package
        # name and a registered factory name -- confirm the installed version.
        nlp = spacy.load('en')
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        _LANGUAGE_PIPELINE = nlp
    return _LANGUAGE_PIPELINE


def remove_non_arabic_non_ascii(text):
    """Keep text detected as Arabic; strip everything else down to ASCII.

    Args:
        text: The string to inspect.

    Returns:
        ``text`` unchanged when the detector reports language ``'ar'``.
        Otherwise the text is NFKD-normalized, reduced to its ASCII
        characters, and then passed through ``remove_punctuation`` and
        ``remove_english_and_numbers``.
    """
    doc = _get_language_pipeline()(text)
    detect_language = doc._.language
    if detect_language['language'] == 'ar':
        return text

    # NFKD splits accented characters into base + combining mark; encoding to
    # ASCII with 'ignore' then drops everything that is not plain ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = remove_punctuation(text)
    text = remove_english_and_numbers(text)
    return text

def text2words(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens

def remove_stopwords(words, stop_words):
    """Return the items of ``words`` that are not in ``stop_words``.

    Order and duplicates of the kept items are preserved.
    """
    kept = []
    for token in words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def normalize_text(text):
    """Run the full cleaning pipeline on ``text`` and return the cleaned string.

    The cleaning steps are applied in order: punctuation removal, removal of
    ASCII letters/digits, removal of newlines/underscores, and the
    language-based non-Arabic filter. The result is split into words, the
    module-level ``stop_words`` are dropped, and the remaining words are
    joined with single spaces.
    """
    cleaning_steps = (
        remove_punctuation,
        remove_english_and_numbers,
        remove_escape_characters,
        remove_non_arabic_non_ascii,
    )
    for step in cleaning_steps:
        text = step(text)

    kept_words = remove_stopwords(text2words(text), stop_words)
    return ' '.join(kept_words)

# Dialect code -> Arabic display label shown to the user.
_DIALECT_LABELS = {
    'IQ': 'اللهجة العراقية',
    'LY': 'اللهجة الليبية',
    'QA': 'اللهجة القطرية',
    'PL': 'اللهجة الفلسطينية',
    'SY': 'اللهجة السورية',
    'TN': 'اللهجة التونسية',
    'JO': 'اللهجة الاردنية',
    'MA': 'اللهجة المغربية',
    'SA': 'اللهجة السعودية',
    'YE': 'اللهجة اليمنية',
    'DZ': 'اللهجة الجزائرية',
    'EG': 'اللهجة المصرية',
    'LB': 'اللهجة اللبنانية',
    'KW': 'اللهجة الكويتية',
    'OM': 'اللهجة العمانية',
    'SD': 'اللهجة السودانية',
    'AE': 'اللهجة الاماراتية',
}

# Label returned for every code that is not listed above.
_DEFAULT_DIALECT_LABEL = 'اللهجة البحرينية'


def detect_dialect(my_pred):
    """Translate a predicted dialect code into its Arabic display label.

    Args:
        my_pred: Dialect code such as ``'EG'`` or ``'SA'``.

    Returns:
        The Arabic label for the code. Any code without an entry in the
        table falls back to the Bahraini label, as the original if/elif
        chain did in its final ``else`` branch.
    """
    return _DIALECT_LABELS.get(my_pred, _DEFAULT_DIALECT_LABEL)

# Load the cleaned dataset; only its "dialect" column is used below.
data = pd.read_csv("/content/cleaned_539.csv")
y = data["dialect"]
target_names = list(y.unique())

# Fit the label encoder on the dialect column so encoded class ids can be
# mapped back to dialect codes with le.inverse_transform.
# NOTE(review): this presumes the saved models were trained with the same
# label encoding -- confirm against the training notebook.
le = LabelEncoder()
y = le.fit_transform(y)

# Load the trained models and their vectorizers.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load artifact files that come from a trusted source.
with open("/content/nb_model.pkl", "rb") as handle:
    nb_model = pickle.load(handle)
with open("/content/transform.pkl", "rb") as handle:
    cv = pickle.load(handle)
dl_model = tf.keras.models.load_model('/content/dl_model.h5', compile=False)
with open('/content/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

def main():
    """Show the sidebar page selector and render the page that was chosen."""
    # Sidebar option label -> function that renders the page.
    pages = {
        'مرحبا': welcome,
        'تحديد اللهجة': dialect,
    }
    selected_box = st.sidebar.selectbox('اختر من الاتي', tuple(pages))

    render_page = pages.get(selected_box)
    if render_page is not None:
        render_page()

def welcome():
    """Render the landing page: a title followed by the banner image."""
    title_text = 'اهلا بك في برنامج تحديد اللهجات العربية'
    banner_path = '/content/download.jpg'

    st.title(title_text)
    st.image(banner_path, use_column_width=True)

def dialect():
    """Render the dialect-identification page and show a prediction on demand.

    The page shows a text input, a model selector and a predict button. When
    the button is pressed, the normalized input is classified with the
    selected model and the Arabic dialect label is displayed.
    """
    prompt = "برجاء ادخال الكلام باللغة العربية لتحديد اللهجة"
    st.subheader(prompt)
    text = normalize_text(st.text_input(''))  # cleaned user input
    dat = [text]  # one-element batch: the vectorizers take a list of texts
    option = st.selectbox('برجاء اختيار الموديل',
                          ('Machine learning', 'Deep learning'))

    # Run inference only when the user asks for it, instead of on every
    # rerun of the script.
    if not st.button('توقع'):
        return

    # Nothing is left to classify when the input is empty or was removed
    # entirely by normalization; repeat the prompt instead of predicting.
    if not text:
        st.warning(prompt)
        return

    if option == 'Machine learning':
        # Vectorize the batch and predict with the pickled classifier.
        vect = cv.transform(dat).toarray()
        encoded = nb_model.predict(vect)
    else:
        # Pass the list, not the bare string: iterating a string would make
        # the tokenizer treat every character as a separate document.
        features = tokenizer.texts_to_matrix(dat, mode='tfidf')
        encoded = np.argmax(dl_model.predict(features), axis=1)

    # Map the encoded class id back to its dialect code, then to the label.
    my_pred = le.inverse_transform(encoded)[0]
    st.text(detect_dialect(my_pred))
# Start the app when this file is executed as a script.
if __name__ == "__main__":
    main()

Overwriting app.py


In [75]:
# !streamlit run app.py & npx localtunnel --port 8501