# Arabic Question Classification 
### Implement Arabic question classification using Support Vector Machine and Naive Bayes models. The dataset consists of 1645 questions, split into 2 classes (color and yes/no).
 

### Import necessary libraries

In [1]:
import gensim
import nltk
from nltk import ngrams
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Load dataset

In [4]:
path_to_file = "Questions2.csv"

# Read the question dataset from the CSV file (decoded as UTF-8).
data = pd.read_csv(path_to_file, encoding='utf8')

# Labels are the 'Class' column; features are every remaining column.
Y = data['Class']
X = data.drop(columns=['Class'])

# 80% of the rows go to training, the rest to testing; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=2
)

### Load Word2Vec model 

In [72]:
# Path of the pre-trained word-embedding model file loaded below.
word2vec_model_path = 'full_grams_cbow_100_wiki.mdl'

# Load the word-embedding model used to turn tokens into vectors.
word2vec_model = gensim.models.Word2Vec.load(word2vec_model_path)

### Helper function that returns the embedding vector of a token using the "full_grams_cbow_100_wiki" model

In [73]:
def get_vec(n_model, dim, token):
    """Return the embedding vector for ``token``.

    Parameters
    ----------
    n_model : object
        Embedding model exposing a ``wv`` attribute that supports ``in``
        membership tests and ``[]`` lookup by string.
    dim : int
        Length of the zero vector used when the token (or none of its parts)
        is found in the model.
    token : str
        Token to look up. It may contain several space-separated words.

    Returns
    -------
    numpy.ndarray
        * If ``token`` is in the vocabulary: the vector stored by the model,
          returned as-is (it is not copied).
        * Otherwise: the average of the vectors of the space-separated parts
          of ``token`` that are in the vocabulary.
        * If no part is known: a vector of ``dim`` zeros.
    """
    vectors = n_model.wv

    # Fast path: the whole token is a vocabulary entry.
    if token in vectors:
        return vectors[token]

    # Fallback: average the known space-separated parts of the token.
    vec = np.zeros(dim)
    known_parts = 0
    for part in token.split(" "):
        if part in vectors:
            known_parts += 1
            vec += vectors[part]

    # Only divide when something was accumulated, so an entirely unknown
    # token yields the zero vector instead of a division by zero.
    if known_parts > 0:
        vec = vec / known_parts
    return vec

### Pre-processing steps:
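- Split each question into tokens
- Convert each token to a vector (using Arabic word embeddings)
- Concatenate the token vectors into a single question vector
- Zero-pad the question vector to a fixed length of 1000 values (10 tokens × 100 dimensions), which corresponds to the maximum question length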

In [74]:

def questions_to_features(questions, n_model, dim=100, max_tokens=10):
    """Turn a sequence of question strings into a 2-D feature matrix.

    Each question is tokenized with ``nltk.word_tokenize``, every token is
    mapped to a ``dim``-sized vector with ``get_vec``, and the token vectors
    are concatenated in order. The result is zero-padded on the right to
    ``dim * max_tokens`` values so that all rows have the same width.

    Parameters
    ----------
    questions : sequence of str
        Questions to encode.
    n_model : object
        Embedding model passed through to ``get_vec``.
    dim : int, optional
        Size of one token vector (default 100).
    max_tokens : int, optional
        Number of token slots per question (default 10). Questions with more
        tokens are truncated to the first ``max_tokens`` tokens instead of
        making the padding step fail with a negative pad width.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(questions), dim * max_tokens)``.
    """
    features = np.zeros((len(questions), dim * max_tokens))
    for row, question in enumerate(questions):
        tokens = nltk.word_tokenize(question)[:max_tokens]
        if not tokens:
            continue  # nothing to encode: the row stays all zeros
        flat = np.concatenate([get_vec(n_model, dim, tok) for tok in tokens])
        # Write the concatenated vectors at the start of the row; the
        # remaining positions keep their zero padding.
        features[row, :flat.size] = flat
    return features


EMBEDDING_DIM = 100        # size of one word vector in the loaded model
MAX_QUESTION_TOKENS = 10   # 10 tokens * 100 dims = 1000 features per question

# train features
question_train = questions_to_features(
    X_train['Question'].values, word2vec_model, EMBEDDING_DIM, MAX_QUESTION_TOKENS
)

# test features
question_test = questions_to_features(
    X_test['Question'].values, word2vec_model, EMBEDDING_DIM, MAX_QUESTION_TOKENS
)


## Training Models
### SVM (Support Vector Machine)

In [75]:
from sklearn import svm

# Support Vector Classifier; per the scikit-learn docs, gamma='auto' sets the
# kernel coefficient to 1 / n_features.
clf = svm.SVC(gamma='auto')
clf.fit(question_train, Y_train)

# Score on the held-out test questions, reported as a percentage.
svm_accuracy = clf.score(question_test, Y_test) * 100
print("Score (SVM): {:.2f}".format(svm_accuracy))

Score (SVM): 99.70


### Naive Bayes

In [69]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the same padded question vectors.
clf2 = GaussianNB()
clf2.fit(question_train, Y_train)

# Score on the held-out test questions, reported as a percentage.
nb_accuracy = clf2.score(question_test, Y_test) * 100
print("Score (Naive Bayes): {:.2f}".format(nb_accuracy))

Score (Naive Bayes): 100.00
