**Importing libraries**

In [None]:
import numpy as np
import pandas as pd
import os
import re
import string
import pickle
from string import digits

**Reading datasets from Google Drive**

Shared drive link: https://drive.google.com/drive/folders/1P396v2CjxNOZbUV69mpZIgLM-qzz59Dn?usp=sharing

The files will appear under "Shared with me". Right-click the folder and choose "Add shortcut to Drive" so that it is reachable from your own Drive.

Note that the files are not shared on GitHub because the newly created dataset is larger than 25 MB.

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime, then make the project folder the
# working directory so the relative file names used below resolve inside it.
mount_point = '/content/drive'
project_dir = '/content/drive/MyDrive/AIM tech'

drive.mount(mount_point)
os.chdir(project_dir)

Mounted at /content/drive


# Read cleaned dataset

In [None]:
# Load the cleaned dataset from the current working directory. Rows are split
# on '\n' only, as requested through `lineterminator`.
cleaned_csv = 'data_cleaned.csv'
data = pd.read_csv(cleaned_csv, lineterminator='\n')

# Last expression: display the loaded frame.
data

Unnamed: 0.1,Unnamed: 0,id,dialect,text
0,0,1175358310087892992,IQ,"['نهايه', 'ينتفض', 'يغير']"
1,1,1175416117793349632,IQ,"['يعني', 'محسوب', 'علي', 'البشر', 'حيونه', 'وح..."
2,2,1175450108898565888,IQ,"['مبين', 'كلامه', 'خليجي']"
3,3,1175471073770573824,IQ,"['يسلملي', 'مرورك', 'وروحك', 'الحلوه']"
4,4,1175496913145217024,IQ,"['وين', 'الغيبه', 'اخ', 'محمد']"
...,...,...,...,...
458192,458192,1019484980282580992,BH,"['مبسوطين', 'منك', 'الي', 'باسطا']"
458193,458193,1021083283709407232,BH,"['ه', 'ماينده', 'ابش', 'يختي']"
458194,458194,1017477537889431552,BH,"['شو', 'عملنا', 'حنا', 'تهربي', 'منا', 'احنا',..."
458195,458195,1022430374696239232,BH,"['ه', 'يبارك', 'فيها', 'وبالعافيه']"


# Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Model inputs are the `text` column; targets are the `dialect` labels.
X = data['text']
y = data['dialect']

# TF-IDF over unigrams and bigrams with L2-normalised rows. Terms found in more
# than 30% of the documents (max_df) or in fewer than 200 documents (min_df)
# are dropped from the vocabulary.
vectorizer = TfidfVectorizer(max_df=0.3, min_df=200, ngram_range=(1, 2), norm="l2")

# Learn the vocabulary and IDF weights. `fit` returns the vectorizer itself, so
# at this point X_vec is the fitted vectorizer, not a feature matrix.
X_vec = vectorizer.fit(X.astype('str'))

# Save the fitted vectorizer. The context manager guarantees the file is flushed
# and closed; the previous bare `open(...)` left the handle dangling.
with open('tfidf_vec.pkl', 'wb') as vec_file:
    pickle.dump(X_vec, vec_file)

X_vec

TfidfVectorizer(max_df=0.3, min_df=200, ngram_range=(1, 2))

In [None]:
# Build the TF-IDF feature matrix with the vectorizer that was already fitted
# (and pickled) in the previous cell. Using `transform` instead of
# `fit_transform` avoids a second fit pass over the corpus and guarantees the
# features come from exactly the vectorizer that was saved to disk.
X_vec = vectorizer.transform(X.astype('str'))

In [None]:
from sklearn import preprocessing

# Encode the dialect labels as integers.
# The encoder is fitted on the raw `dialect` column rather than on `y`, so the
# cell is idempotent: the old `y = le.fit_transform(y)` refitted the encoder on
# already-encoded integers when re-run, losing the label names in `le.classes_`.
le = preprocessing.LabelEncoder()
y = le.fit_transform(data['dialect'])
y

array([4, 4, 4, ..., 1, 1, 1])

Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out 25% of the rows for testing; the fixed random_state
# keeps the partition the same from run to run.
split_parts = train_test_split(
    X_vec,
    y,
    test_size=0.25,
    random_state=1,
    shuffle=True,
)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, features) shape of each feature matrix.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

(343647, 3413)
(114550, 3413)


# Machine Learning Model

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

# Classical baseline: a support vector classifier with default hyper-parameters.
# Alternatives that were left disabled in this cell: RandomForestClassifier,
# LinearSVC(random_state=1), LogisticRegression(solver="sag", random_state=1),
# XGBClassifier(random_state=1) and an MLPClassifier with three hidden layers
# of 12 units.
clf = SVC()

# Train on the training split and predict the held-out split in one chain
# (`fit` returns the estimator itself).
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Summarize results: per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

Save model

In [None]:
# Save the fitted classifier.
# The context manager closes (and therefore flushes) the file even if pickling
# fails; the previous bare `open(...)` never closed the handle.
import pickle

with open('ml_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Deep Learning Model

Building the model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from numpy.random import seed

# Seed NumPy and TensorFlow so weight initialisation is reproducible.
seed(42)
tf.random.set_seed(101)

# Take the layer sizes from the data instead of hard-coding them. The old
# `input_dim=3195` did not match the width of the TF-IDF matrix printed by the
# split cell, so any vocabulary change made training fail with a shape mismatch.
n_features = X_train.shape[1]
n_classes = len(le.classes_)

# Small feed-forward classifier: two Dense(32) -> BatchNorm -> ReLU stages
# followed by a softmax over the classes. (Dropout(0.5) after each stage was
# present but disabled in the original cell.)
model = keras.Sequential()
model.add(keras.Input(shape=(n_features,)))

model.add(layers.Dense(32))
model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))

model.add(layers.Dense(32))
model.add(layers.BatchNormalization())
model.add(layers.Activation('relu'))

model.add(layers.Dense(n_classes, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 32)                102272    
                                                                 
 batch_normalization_2 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 activation_2 (Activation)   (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                1056      
                                                                 
 batch_normalization_3 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 activation_3 (Activation)   (None, 32)               

Training the model

In [None]:
# One-hot encode the training labels for categorical cross-entropy.
# `tf.keras.utils.to_categorical` replaces `keras.utils.np_utils`, a private
# module that is not available in recent Keras releases, and keeps the notebook
# on the same tf.keras package the model was built with. The width is pinned to
# the model's output size so the targets always line up with the softmax layer,
# even if a class is missing from the training split.
dummy_y = tf.keras.utils.to_categorical(y_train, num_classes=model.output_shape[-1])

# Re-seed so training is reproducible when this cell is re-run on its own.
seed(42)
tf.random.set_seed(101)

# Lower the Keras backend fuzz factor (epsilon) to 1e-12.
tf.keras.backend.set_epsilon(1e-12)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.fit(X_train.astype('float16'), dummy_y, batch_size=12, epochs=20)

Epoch 1/20


  "shape. This may consume a large amount of memory." % value)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

Evaluation

In [None]:
from sklearn.metrics import classification_report
from numpy import argmax

# Score the test split: the network returns one value per class for each row,
# and the index of the largest value is taken as the predicted label.
class_scores = model.predict(X_test.astype('float16'))
predicted_labels = argmax(class_scores, axis=1)

# Per-class precision / recall / F1 against the true labels.
print(classification_report(y_test, predicted_labels))

              precision    recall  f1-score   support

           0       0.27      0.22      0.25      2908
           1       0.20      0.12      0.15      2917
           2       0.31      0.16      0.21      1699
           3       0.47      0.77      0.58      6226
           4       0.34      0.28      0.31      1751
           5       0.30      0.11      0.16      3045
           6       0.27      0.44      0.34      4628
           7       0.45      0.48      0.46      3016
           8       0.26      0.48      0.34      3991
           9       0.44      0.36      0.40      1254
          10       0.21      0.08      0.11      2079
          11       0.32      0.37      0.34      4747
          12       0.33      0.24      0.28      3367
          13       0.27      0.25      0.26      2902
          14       0.56      0.28      0.37      1529
          15       0.34      0.08      0.13      1791
          16       0.34      0.11      0.17      1038
          17       0.20    

Save the model

In [None]:
# Save the trained network.
# The context manager closes (and therefore flushes) the file even if pickling
# fails; the previous bare `open(...)` never closed the handle.
# NOTE(review): the pickle format and file name are kept so existing loaders of
# 'DL_model.pkl' keep working; consider Keras' own `model.save(...)` if the
# artefact must be portable across TensorFlow versions.
import pickle

with open('DL_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

INFO:tensorflow:Assets written to: ram://5a56fc92-1924-4f4c-858a-89318dc7f401/assets
