#### Deep learning training
   1. [Reading the data (phase 2)](#phase2)
   2. [convert the dialgect to numbers](#convert)
   3. [Split the data to trainging and test data](#Split)
   4. [tokenize the tweets and convert the labels to numbers](#tokenize)
   5. [loading mazajak word embedding and training the model](#loading_mazajak)
   6. [loading AraVec word embedding and training the model](#loading_AraVec)
   7. [pretrained word embedding (mazajak)](#pretrained_word_embedding_mazajak)
   8. [pretrained word embedding (AraVec)](#pretrained_word_embedding_AraVec)
   9. [LSTM from scratch](#LSTM) 
   10. [Embeddding layer without LSTM from scratch](#Embeddding)
   11. [Fine tuning pretrained word embedding(AraVec)](#Fine_tuning_pretrained_word_embedding_AraVec)
   12. [Fine tuning pretrained word embedding(mazajak)](#Fine_tuning_pretrained_word_embedding_mazajak)
   13. [loading DL model](#loading)
   14. [testing DL models](#testing_DL)
   
#### Machine learning training  
   1. [Cross validation to choose the best model](#Cross)
   2. [Linear SVM](#SVM)
   3. [testing ML models](#testing_ML)
   4. [ML vs DL](#vs)

In [29]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU,Dense, Dropout, SpatialDropout1D, GlobalAveragePooling1D, LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.pipeline import Pipeline
from joblib import dump
import tensorflow
import gensim
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from dicts import preprocess, country_codes

### reading preprocessed data<a id='phase2'></a>

In [7]:
dataset = pd.read_csv("preprocessed_data.csv")

In [8]:
# removing one bad data point where the entire tweet is english
dataset = dataset.drop(dataset['tweet'][dataset['pure_tweet'].isnull()].index)

### converting the labels to numbers<a id='convert'></a>

In [9]:
dataset['dialect_number'] = dataset['dialect'].factorize()[0]
outputs = dict(zip(dataset['dialect_number'], dataset['dialect']))

### splitting the data to train, validation and test <a id='Split'></a>

In [10]:
X_train, X_other, y_train, y_other = train_test_split(dataset, dataset['dialect_number'],test_size = 0.2, random_state =0)
X_val, X_test, y_val, y_test = train_test_split(X_other, y_other,test_size = 0.5, random_state =0)

### tokenize the tweets and convert the labels to numbers<a id='tokenize'></a>

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['pure_tweet'])
encoded_docs = tokenizer.texts_to_sequences(X_train['pure_tweet'])
y= to_categorical(y_train,num_classes=18)
padded_sequence = pad_sequences(encoded_docs, maxlen=60, padding='post')
y_val_categorical= to_categorical(y_val,num_classes=18)
vocab_size = len(tokenizer.word_index)+1

In [13]:
val_tweets = tokenizer.texts_to_sequences(X_val['pure_tweet'])
val_padded_sequence = pad_sequences(val_tweets, maxlen=60)

In [7]:
vocab_size

167970

### Mazajak Pretrained word embedding<a id='loading_mazajak'></a>

In [8]:
embeddings_Mazajak = gensim.models.KeyedVectors.load_word2vec_format('cbow_100.bin',binary=True,unicode_errors='ignore')

In [9]:
embedding_matrix_Mazajak = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = embeddings_Mazajak[word]
        if embedding_vector is not None:
            embedding_matrix_Mazajak[i] = embedding_vector
    except:
        continue

### AraVec word embedding<a id='loading_AraVec'></a>

In [10]:
embeddings_AraVec = gensim.models.Word2Vec.load('full_uni_cbow_100_twitter.mdl')

In [11]:
embedding_matrix_AraVec = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = embeddings_AraVec[word]
        if embedding_vector is not None:
            embedding_matrix_AraVec[i] = embedding_vector
    except:
        continue

#### Callbacks(early stopping)

In [32]:
es = EarlyStopping(monitor='val_loss', verbose=1, patience=2, min_delta= .1)

### experiement 1<a id='pretrained_word_embedding_mazajak'></a>
#### LSTM with fixed pretrained word embedding (mazajak)

In [20]:
embedding_vector_length = 300
model_mazajak_LSTM = Sequential()
model_mazajak_LSTM.add(Embedding(vocab_size, embedding_vector_length, weights=[embedding_matrix_Mazajak], trainable=False))
model_mazajak_LSTM.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5)) 
model_mazajak_LSTM.add(Dropout(0.2))
model_mazajak_LSTM.add(Dense(18, activation='softmax'))
model_mazajak_LSTM.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) #binary_crossentropy

In [22]:
mc = ModelCheckpoint('best_model_mazajak_LSTM.h5', monitor='val_accuracy', verbose=1, save_best_only=True)
history_mazajak_LSTM = model_mazajak_LSTM.fit(padded_sequence,y,validation_data=(val_padded_sequence, y_val_categorical),
                                              epochs=10, batch_size=32, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.28055, saving model to best_model_mazajak_LSTM.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.28055 to 0.31399, saving model to best_model_mazajak_LSTM.h5
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.31399
Epoch 00003: early stopping


### experiement 2<a id='pretrained_word_embedding_AraVec'></a>
#### LSTM with fixed pretrained word embedding (AraVec)

In [24]:
embedding_vector_length = 100
model_AraVec_LSTM = Sequential()
model_AraVec_LSTM.add(Embedding(vocab_size, embedding_vector_length, weights=[embedding_matrix_AraVec], trainable=False))
model_AraVec_LSTM.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5)) 
model_AraVec_LSTM.add(Dropout(0.2))
model_AraVec_LSTM.add(Dense(18, activation='softmax'))
model_AraVec_LSTM.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) #binary_crossentropy

In [25]:
mc = ModelCheckpoint('best_model_AraVec_LSTM.h5', monitor='val_accuracy', verbose=1, save_best_only=True)
history_AraVec_LSTM = model_AraVec_LSTM.fit(padded_sequence,y, validation_data=(val_padded_sequence, y_val_categorical),
                                            epochs=10, batch_size=32, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.12543, saving model to best_model_AraVec_LSTM.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.12543
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.12543
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.12543
Epoch 00004: early stopping


### experiment 3  <a id='LSTM'></a>
LSTM from scratch

In [28]:
embedding_vector_length = 100
model_scratch = Sequential()
model_scratch.add(Embedding(vocab_size, embedding_vector_length))
model_scratch.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5)) 
model_scratch.add(Dropout(0.2))
model_scratch.add(Dense(18, activation='softmax'))
model_scratch.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [29]:
mc = ModelCheckpoint('best_model_from_scratch.h5', monitor='val_accuracy', verbose=1, save_best_only=True)
history_scratch = model_scratch.fit(padded_sequence,y, validation_data=(val_padded_sequence, y_val_categorical)
                                    , epochs=10, batch_size=32, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.12551, saving model to best_model_from_scratch.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.12551 to 0.17730, saving model to best_model_from_scratch.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.17730 to 0.37381, saving model to best_model_from_scratch.h5
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.37381 to 0.43311, saving model to best_model_from_scratch.h5
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.43311 to 0.45439, saving model to best_model_from_scratch.h5
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.45439
Epoch 7/10

Epoch 00007: val_accuracy improved from 0.45439 to 0.45589, saving model to best_model_from_scratch.h5
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.45589
Epoch 00008: early stopping


### experiment 4 <a id='Embeddding'></a>
Embeddding layer without LSTM from scratch

In [33]:
embedding_vector_length = 100
model_embeddding = Sequential()
model_embeddding.add(Embedding(vocab_size, embedding_vector_length))
model_embeddding.add(GlobalAveragePooling1D())

model_embeddding.add(Dropout(0.2))
model_embeddding.add(Dense(18, activation='softmax'))
model_embeddding.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) #binary_crossentropy

In [34]:
mc = ModelCheckpoint('best_model_from_Embeddding.h5', monitor='val_accuracy', verbose=1, save_best_only=True)
history_embeddding = model_embeddding.fit(padded_sequence,y, validation_data=(val_padded_sequence, y_val_categorical)
                                          , epochs=10, batch_size=32, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.47204, saving model to best_model_from_Embeddding.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.47204 to 0.51063, saving model to best_model_from_Embeddding.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.51063 to 0.51986, saving model to best_model_from_Embeddding.h5
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.51986 to 0.52383, saving model to best_model_from_Embeddding.h5
Epoch 00004: early stopping


### experiment 5 <a id='Fine_tuning_pretrained_word_embedding_AraVec'></a>
Fine tuning pretrained word embedding(AraVec)

In [35]:
embedding_vector_length = 100
model_finetune_AraVec = Sequential()
model_finetune_AraVec.add(Embedding(vocab_size, embedding_vector_length, weights=[embedding_matrix_AraVec]))
model_finetune_AraVec.add(GlobalAveragePooling1D())
model_finetune_AraVec.add(Dropout(0.2))
model_finetune_AraVec.add(Dense(18, activation='softmax'))
model_finetune_AraVec.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [36]:
mc = ModelCheckpoint('best_model_finetune_AraVec.h5', monitor='val_accuracy', verbose=1, save_best_only=True)
history_finetune_AraVec = model_finetune_AraVec.fit(padded_sequence,y, validation_data=(val_padded_sequence, y_val_categorical),
                                               epochs=10, batch_size=32, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.46809, saving model to best_model_finetune_AraVec.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.46809 to 0.50670, saving model to best_model_finetune_AraVec.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.50670 to 0.52106, saving model to best_model_finetune_AraVec.h5
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.52106 to 0.52407, saving model to best_model_finetune_AraVec.h5
Epoch 00004: early stopping


### experiment 6 <a id='Fine_tuning_pretrained_word_embedding_mazajak'></a>
Fine tuning pretrained word embedding(mazajak)

In [37]:
embedding_vector_length = 300
model_finetune_mazajak = Sequential()
model_finetune_mazajak.add(Embedding(vocab_size, embedding_vector_length, weights=[embedding_matrix_Mazajak]))
model_finetune_mazajak.add(GlobalAveragePooling1D())
model_finetune_mazajak.add(Dropout(0.2))
model_finetune_mazajak.add(Dense(18, activation='softmax'))
model_finetune_mazajak.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [38]:
mc = ModelCheckpoint('best_model_finetune_mazajak.h5', monitor='val_accuracy', verbose=1, save_best_only=True)
history_finetune_mazajak = model_finetune_mazajak.fit(padded_sequence,y, validation_data=(val_padded_sequence, y_val_categorical),
                                               epochs=10, batch_size=32, callbacks=[es, mc])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.48608, saving model to best_model_finetune_mazajak.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.48608 to 0.51327, saving model to best_model_finetune_mazajak.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.51327 to 0.51990, saving model to best_model_finetune_mazajak.h5
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.51990 to 0.52665, saving model to best_model_finetune_mazajak.h5
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.52665
Epoch 00005: early stopping


### Load models<a id='loading'></a>

In [2]:
best_model_from_Embeddding = tensorflow.keras.models.load_model("best_model_from_Embeddding.h5")
best_model_from_scratch = tensorflow.keras.models.load_model("best_model_from_scratch.h5")

best_model_mazajak_LSTM = tensorflow.keras.models.load_model("best_model_mazajak_LSTM.h5")
best_model_AraVec_LSTM = tensorflow.keras.models.load_model("best_model_AraVec_LSTM.h5")

best_model_finetune_mazajak = tensorflow.keras.models.load_model("best_model_finetune_mazajak.h5")
best_model_finetune_AraVec = tensorflow.keras.models.load_model("best_model_finetune_AraVec.h5")
deep_learning_models = {'best_model_from_Embeddding':best_model_from_Embeddding,
                       'best_model_from_scratch':best_model_from_scratch,
                       'best_model_finetune_AraVec':best_model_finetune_AraVec,
                       'best_model_finetune_mazajak':best_model_finetune_mazajak,
                       'best_model_mazajak_LSTM':best_model_mazajak_LSTM,
                       'best_model_AraVec_LSTM':best_model_AraVec_LSTM}

### Testing DL models<a id='testing_DL'></a>

#### tokenize and pad test data

In [46]:
test_tweets = tokenizer.texts_to_sequences(X_test['pure_tweet'])
test_padded_sequence = pad_sequences(test_tweets, maxlen=60)

### on validation data

In [14]:
for model_name, model in deep_learning_models.items():    
    f = model.predict(val_padded_sequence)
    print(model_name,' accuracy: ',np.mean(list(map(np.argmax,f))==y_val), '|| F1-score: ', f1_score(y_val, np.argmax(f,axis=1), average='macro'))

best_model_from_Embeddding  accuracy:  0.5238323876036666 || F1-score:  0.4941769939744494
best_model_from_scratch  accuracy:  0.45589262330859887 || F1-score:  0.3992942075421435
best_model_finetune_AraVec  accuracy:  0.524072457442165 || F1-score:  0.49355732044822276
best_model_finetune_mazajak  accuracy:  0.5266477520733305 || F1-score:  0.4977199961273737
best_model_mazajak_LSTM  accuracy:  0.31398952422522913 || F1-score:  0.1816254750301616
best_model_AraVec_LSTM  accuracy:  0.1254255783500655 || F1-score:  0.01238302704356576


### on test data

In [48]:
test_pred = model_finetune_mazajak.predict(test_padded_sequence)
print('accuracy: ', np.mean(list(map(np.argmax,test_pred))==y_test),' ||F1 score: ', f1_score(y_test, np.argmax(test_pred,axis=1), average='macro'))

accuracy:  0.5288302051505893  ||F1 score:  0.5024157162774612


In [49]:
def test_dl(index, model, padding_max_len):
    tw = tokenizer.texts_to_sequences([X_test['pure_tweet'].iloc[index]])
    tw = pad_sequences(tw, maxlen=padding_max_len)
    prediction = np.argmax(model.predict(tw))
    print(X_test['tweet'].iloc[index])
    print("ground truth: ", X_test['dialect'].iloc[index])
    print("Predicted label: ", outputs[prediction], ' with propability: ', np.max(model.predict(tw)))

In [50]:
index = 1
test_dl(index= index, model= best_model_from_Embeddding, padding_max_len=60)

@razan_alshamali عشان اصورك صور حلوة اليوم 😜😜😂😂😂
ground truth:  PL
Predicted label:  PL  with propability:  0.17330182


## Machine learning<a id='testing_DL'></a>

### Cross validation to choose the best model<a id='Cross'></a>

In [39]:
models = [
    LinearSVC(),
    MultinomialNB(),
    SGDClassifier(random_state=0),
]

In [40]:
CV = 3
cv_df = pd.DataFrame(index=range(CV * len(models)))

In [41]:
pipe = Pipeline([('count', CountVectorizer()),
                 ('tfid', TfidfTransformer())]).fit(X_train['pure_tweet'])
pipe_train = pipe.transform(X_train['pure_tweet'])

In [42]:
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, pipe.transform(X_train['pure_tweet']), y_train, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
        print(model_name, fold_idx, accuracy)
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
          ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc

LinearSVC 0 0.499795393907649
LinearSVC 1 0.49803985759299424
LinearSVC 2 0.5005278880386299
MultinomialNB 0 0.3594028775800828
MultinomialNB 1 0.35778532553095715
MultinomialNB 2 0.35985595613209476
SGDClassifier 0 0.46497962123320186
SGDClassifier 1 0.46164422801489546
SGDClassifier 2 0.46435323484879487


Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearSVC,0.499454,0.001279
MultinomialNB,0.359015,0.001089
SGDClassifier,0.463659,0.001773


### Linear SVM with tf-idf 2-grams<a id='SVM'></a>

In [15]:
two_gram_svm = Pipeline([
    ('tfidf',TfidfVectorizer(ngram_range=(1,2))),
    ('clf', LinearSVC()),
])
two_gram_svm.fit(X_train['pure_tweet'], y_train)
uni_gram_svm = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf', LinearSVC()),
])
uni_gram_svm.fit(X_train['pure_tweet'], y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

### testing ML on validation data<a id='testing_ML'></a>

In [16]:
# Accuracy
print("uni-gram")
print("accuracy: ", uni_gram_svm.score(X_val['pure_tweet'], y_val))
print("macro F1 score: ", f1_score(y_val, uni_gram_svm.predict(X_val['pure_tweet']), average='macro'))
print("2-gram")
print("accuracy: ", two_gram_svm.score(X_val['pure_tweet'], y_val))
print("macro F1 score: ", f1_score(y_val, two_gram_svm.predict(X_val['pure_tweet']), average='macro'))

uni-gram
accuracy:  0.512461807071148
macro F1 score:  0.47850441725394727
2-gram
accuracy:  0.538018332605849
macro F1 score:  0.5070792748737503


### testing the final model on test set

In [27]:
print("accuracy: ", two_gram_svm.score(X_test['pure_tweet'], y_test))
print("macro F1 score: ", f1_score(y_test, two_gram_svm.predict(X_test['pure_tweet']), average='macro'))

accuracy:  0.5388476647752073
macro F1 score:  0.5072461509724954


### saving the model

In [36]:
joblib.dump(two_gram_svm, filename= 'two_gram_svm.joblib')
loaded_model = joblib.load('two_gram_svm.joblib')

In [34]:
def test_ml(model, index, type_of_model= 'other'):
    if type_of_model =='train':
        filtered_value = X_train['pure_tweet'].iloc[index]
        print(X_train['tweet'].iloc[index])
        print("prediction: ", model.predict([filtered_value])[0], " ", outputs[model.predict([filtered_value])[0]])
        print("Ground truth: ", X_train['dialect_number'].iloc[index], " ", X_train['dialect'].iloc[index], "\n")
    elif type_of_model =='test':
        filtered_value = X_test['pure_tweet'].iloc[index]
        print(X_test['tweet'].iloc[index])
        print("prediction: ",model.predict([filtered_value])[0], " ", outputs[model.predict([filtered_value])[0]])
        print("Ground truth: ", X_train['dialect_number'].iloc[index], " ", X_train['dialect'].iloc[index], "\n")

In [37]:
index = 11
test_ml(model= two_gram_svm, index= index, type_of_model= 'test')

من اجمل الاغاني اللي سمعتها هالايام 

🎹🎼🎵🎙
أنا كل ما نويت أنسى
لك الذكرى ترجعني
وترى للحين أنا أحبك
أشوفك بين حين وحين
فراقك آه يا فراقك
كسر قلبي وعذبني
وأنا نذر علي
أبقى أحبك لين يوم الدين
.
#انا_احبك 
#حسين_الجسمي 
@7sainaljassmi
@aL9aNe3 https://t.co/jEYV6GDosC
prediction:  17   BH
Ground truth:  17   BH 



## ML vs DL<a id='vs'></a>

In [17]:
def ml_vs_dl(index, ml_model, dl_model):
    filtered_value = X_test['pure_tweet'].iloc[index]
    print(X_test['tweet'].iloc[index])
    print("Ground truth: ", X_train['dialect'].iloc[index])
    #print(model.predict([filtered_value]))
    print("Ml predection: ", outputs[ml_model.predict([filtered_value])[0]])
    tw = tokenizer.texts_to_sequences([filtered_value])
    tw = pad_sequences(tw, maxlen=60)
    prediction = np.argmax(dl_model.predict(tw))
    print("dl predection: ", outputs[prediction])

In [18]:
index= 1
ml_vs_dl(index= index, ml_model= two_gram_svm, dl_model= best_model_from_Embeddding)

@razan_alshamali عشان اصورك صور حلوة اليوم 😜😜😂😂😂
Ground truth:  BH
Ml predection:  OM
dl predection:  PL


## ML vs DL for free text

In [25]:
def predict_free_text(ml_model, dl_model, text):
    processed_text = preprocess(text)
    print("machine learning: ", outputs[ml_model.predict([processed_text])[0]],'||', country_codes[outputs[ml_model.predict([processed_text])[0]]])
    tw = tokenizer.texts_to_sequences([processed_text])
    tw = pad_sequences(tw, maxlen=60)
    prediction = np.argmax(dl_model.predict(tw))
    print("deep learning: ",outputs[prediction],'||', country_codes[outputs[prediction]])

'from Data_pre_processing import preprocess\ndef predict_free_text(ml_model, dl_model, text):\n    processed_text = preprocess(text)\n    print("machine learning: ", outputs[ml_model.predict([processed_text])[0]])\n    tw = tokenizer.texts_to_sequences([processed_text])\n    tw = pad_sequences(tw, maxlen=60)\n    prediction = np.argmax(dl_model.predict(tw))\n    print("deep learning: ",outputs[prediction])'

In [26]:
# enter any text you want
predict_free_text(ml_model = two_gram_svm, dl_model= best_model_finetune_mazajak, text= "انا معرفش اسمك ايه")