## Intent Classification and Prediction

In [2]:
import nltk
from nltk.stem.isri import ISRIStemmer
stemmer = ISRIStemmer()

import numpy as np
import tensorflow as tf
import pandas as pd
import random

  from ._conv import register_converters as _register_converters


In [3]:
import json
with open('intents.json', encoding="utf-8") as json_data:
    intents = json.load(json_data)

In [4]:
words = []
classes = []
documents = []
ignore_words = ['?']
for intent in intents['intents']:
    for pattern in intent['patterns']:
        w = nltk.word_tokenize(pattern)
        words.extend(w)
        documents.append((w, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))

classes = sorted(list(set(classes)))

print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)

34 documents
5 classes ['DateOfBirth', 'club', 'height', 'position', 'region']
21 unique stemmed words ['ادش', 'اعب', 'اي', 'ايم', 'اين', 'حفظ', 'دين', 'ركز', 'شو', 'طول', 'عاش', 'عمر', 'لعب', 'ما', 'من', 'ندي', 'هو', 'هي', 'ولد', 'وين', 'يلد']


In [5]:
training = []
output = []
output_empty = [0] * len(classes)

for doc in documents:
    bag = []
    pattern_words = doc[0]
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    training.append([bag, output_row])

random.shuffle(training)
training = np.array(training)

train_x = list(training[:,0])
train_y = list(training[:,1])

In [6]:
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)

In [7]:
print("Train X :\n",train_x[:5])
print("\nTrain Y :\n",train_y[:5])

Train X :
 [[0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0]]

Train Y :
 [[0 0 0 1 0]
 [1 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 0 1]
 [1 0 0 0 0]]


In [8]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(50),solver='sgd',learning_rate_init=0.01,max_iter=1000)


In [9]:
mlp.fit(train_x, train_y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=50, learning_rate='constant',
       learning_rate_init=0.01, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [10]:
def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words

def bow(sentence, words, show_details=False):
    sentence_words = clean_up_sentence(sentence)
    bag = [0]*len(words)  
    for s in sentence_words:
        for i,w in enumerate(words):
            if w == s: 
                bag[i] = 1

    return(np.array(bag))

In [11]:
context = {}

ERROR_THRESHOLD = 0.50
def classify(sentence):
    results = mlp.predict([bow(sentence, words)])[0]
    results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append((classes[r[0]], r[1]))
    return return_list

def response(sentence):
    results = classify(sentence)
    if results:
        while results:
            for i in intents['intents']:
                if i['tag'] == results[0][0]:
                    print(i['tag'])

            return results

In [12]:
res = [
    "شئد طول احمد الصالح ؟",
    "خالد حاج عثمان وين ولد ؟",
    "اشو مركز حسين جنيد و بأي نادي بيلعب ؟"
]
for r in res:
    print("------------")
    print(response(r))

------------
height
[('height', 1)]
------------
region
[('region', 1)]
------------
club
[('club', 1), ('position', 1)]


## Named Entity Recognition Classifier 

In [13]:
from pathlib import Path
from datetime import datetime as dt
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC

In [14]:
players = pd.read_excel('players.xlsx', encoding = "UTF-8")

In [15]:
players.head()

Unnamed: 0,إسم الاعب,الميلاد,المحافظة,الطول,مركز اللعب,النادي
0,إبراهيم عالمة,18-10-1991,حمص,185,حارس مرمى,الوحدة السوري
1,احمد مدنية,1-1-1990,الاذقية,177,حارس مرمى,الجيش السوري
2,خالد حاج عثمان,1-5-1987,حلب,185,حارس مرمى,ضمك السعودي
3,احمد الصالح,20-5-1998,القامشلي,183,مدافع,العهد اللبناني
4,حسين جويد,1-1-1993,حلب,178,مدافع,الزوراء العراقي


In [16]:
df = pd.read_csv('Football.txt', encoding = "UTF-8", sep=",", error_bad_lines=False)
df.head(10)

b'Skipping line 5410: expected 2 fields, saw 3\n'


Unnamed: 0,Word,Tag
0,كرة,O
1,القدم,O
2,رياضة,O
3,جماعية,O
4,،,O
5,يضم,O
6,فريق,O
7,كرة,O
8,القدم,O
9,11,O


In [17]:
for i in range(len(df.Tag)):
    df.Tag[i] = df.Tag[i].strip()

In [18]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size = 0.2)
train_arr = []
test_arr = []
train_lbl = []
test_lbl = []

In [19]:
train_arr=train['Word'].astype(str)
train_lbl=train['Tag'].astype(str)
test_arr=test['Word'].astype(str)
test_lbl=test['Tag'].astype(str)

In [20]:
print("Train labels :\n",train_lbl[15:20])
print("\nTrain Words :\n",train_arr[15:20])

Train labels :
 4056    O
3247    O
3259    O
2588    O
86      O
Name: Tag, dtype: object

Train Words :
 4056          ،
3247          و
3259    المنتخب
2588       أنها
86      الرياضة
Name: Word, dtype: object


In [21]:
vectorizer = CountVectorizer()
vectorizer.fit(train_arr)
train_mat = vectorizer.transform(train_arr)

In [22]:
tfidf = TfidfTransformer()
tfidf.fit(train_mat)
train_tfmat = tfidf.transform(train_mat)

In [23]:
test_mat = vectorizer.transform(test_arr)
test_tfmat = tfidf.transform(test_mat)

In [24]:
train_tfmat

<4536x1532 sparse matrix of type '<class 'numpy.float64'>'
	with 4027 stored elements in Compressed Sparse Row format>

In [25]:
lsvm=LinearSVC()
lsvm.fit(train_tfmat,train_lbl)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [26]:
y_pred_lsvm=lsvm.predict(test_tfmat)

In [27]:
test=['حلب']
test_str = vectorizer.transform(test)
test_tfstr = tfidf.transform(test_str)
test_tfstr.shape
lsvm.predict(test_tfstr.toarray())[0]

'B-reg'

In [28]:
from sklearn.metrics import  accuracy_score
from sklearn import metrics

In [29]:
print("accuracy:", metrics.accuracy_score(test_lbl, y_pred_lsvm))

accuracy: 0.9973568281938326


In [30]:
phrase="اشو مركز إبراهيم عالمة و بأي نادي بيلعب ؟"
arr=phrase.split()
print(arr)

import sys
y=[]
token=[]
for x in arr:
    x=[x]
    test_str = vectorizer.transform(x)
    test_tfstr = tfidf.transform(test_str)
    test_tfstr.shape
    token.append(x)
    y.append(lsvm.predict(test_tfstr.toarray())[0])

output =pd.DataFrame(list(zip(token,y)),columns=['token','entity_type'])

['اشو', 'مركز', 'إبراهيم', 'عالمة', 'و', 'بأي', 'نادي', 'بيلعب', '؟']


In [31]:
output

Unnamed: 0,token,entity_type
0,[اشو],O
1,[مركز],O
2,[إبراهيم],B-name
3,[عالمة],I-name
4,[و],O
5,[بأي],O
6,[نادي],O
7,[بيلعب],O
8,[؟],O


In [32]:
name = ''
org = ''
reg = ''
pos = ''
for i in range(len(output.loc[:,'entity_type'])):
    if output.loc[i,'entity_type'] == 'B-name' or output.loc[i,'entity_type']=='I-name':
        name = name + output.loc[i,'token'][0] + ' '
for i in range(len(output.loc[:,'entity_type'])):
    if output.loc[i,'entity_type'] == 'B-org' or output.loc[i,'entity_type']=='I-org':
        org = org + output.loc[i,'token'][0] + ' '
for i in range(len(output.loc[:,'entity_type'])):
    if output.loc[i,'entity_type'] == 'B-reg' or output.loc[i,'entity_type']=='I-reg':
        reg = reg + output.loc[i,'token'][0] + ' '
for i in range(len(output.loc[:,'entity_type'])):
    if output.loc[i,'entity_type'] == 'B-pos' or output.loc[i,'entity_type']=='I-pos':
        pos = pos + output.loc[i,'token'][0] + ' '

In [33]:
def Answer(players,intent):
    temp = players
    ans = []
    if name:
        temp = temp.set_index('إسم الاعب')
        ans.append(temp.loc[name.strip(),intent])
        return ans
    if org:
        heights = temp[temp['النادي'] == org.strip()].loc[:,intent]
        names = temp[temp['النادي'] == org.strip()].loc[:,'إسم الاعب']
        for n, h in zip(names, heights):
            ans.append(n + " " + str(h))
        return ans
    if reg:
        heights = temp[temp['المحافظة'] == reg.strip()].loc[:,intent]
        names = temp[temp['المحافظة'] == reg.strip()].loc[:,'إسم الاعب']
        for n, h in zip(names, heights):
            ans.append(n + " " + str(h))
        return ans
    if pos: 
        poss = pos.replace("ال", "")
        heights = temp[temp['مركز اللعب'] == poss.strip()].loc[:,intent]
        names = temp[temp['مركز اللعب'] == poss.strip()].loc[:,'إسم الاعب']
        for n, h in zip(names, heights):
            ans.append(n + " " + str(h))
        return ans

In [34]:
intentDict = {
    'height':'الطول',
    'region':'المحافظة',
    'club':'النادي',
    'DateOfBirth':'الميلاد',
    'position':'مركز اللعب'
}
ans = []
intent = response(phrase)
for i in range(len(intent)):
    ans.append(Answer(players,intentDict[intent[i][0]]))
print(ans)

club
[['الوحدة السوري'], ['حارس مرمى']]


## Sentence Generator with CFG rules

In [35]:
import nltk.parse.generate
import itertools
import sys
from nltk.grammar import Nonterminal

In [36]:
def generate(grammar, start=None, depth=None, n=None):
    """
    Generates an iterator of all sentences from a CFG.

    :param grammar: The Grammar used to generate sentences.
    :param start: The Nonterminal from which to start generate sentences.
    :param depth: The maximal depth of the generated tree.
    :param n: The maximum number of sentences to return.
    :return: An iterator of lists of terminal tokens.
    """
    if not start:
        start = grammar.start()
    if depth is None:
        depth = sys.maxsize

    iter = _generate_all(grammar, [start], depth)

    if n:
        iter = itertools.islice(iter, n)

    return iter



def _generate_all(grammar, items, depth):
    if items:
        try:
            for frag1 in _generate_one(grammar, items[0], depth):
                for frag2 in _generate_all(grammar, items[1:], depth):
                    yield frag1 + frag2
        except RuntimeError as _error:
            if _error.message == "maximum recursion depth exceeded":
                # Helpful error message while still showing the recursion stack.
                raise RuntimeError(
                    "The grammar has rule(s) that yield infinite recursion!!"
                )
            else:
                raise
    else:
        yield []


def _generate_one(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth - 1):
                    yield frag
        else:
            yield [item]

In [37]:
demo_grammar = """
  S -> NP
  NP -> MU KH
  MU -> 'مكان' 'ولادة' NS | 'طول' NS | 'نادي' NS | 'مركز' NS | 'تاريخ' 'ميلاد' NS
  NS -> 'اللاعب' 'فلان' 'الفلاني'
  KH ->  D
  D -> 'هو' 'تموضع'
"""
grammar = nltk.CFG.fromstring(demo_grammar)
sentence = "مكان ولادة اللاعب فلان الفلاني هو تموضع".split()
def parse(sent):
    #Returns nltk.Tree.Tree format output
    a = []  
    parser = nltk.ChartParser(grammar)
    for tree in parser.parse(sent):
        a.append(tree)
    return(a[0]) 
print(parse(sentence))
parse(sentence).draw()


(S (NP (MU مكان ولادة (NS اللاعب فلان الفلاني)) (KH (D هو تموضع))))


In [38]:
def generateSentences(N=10):
    from nltk.grammar import CFG
    responses = []
    print('Generating the sentences for demo grammar:')
    print(demo_grammar)
    grammar = CFG.fromstring(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=10), 1):
        responses.append(' '.join(sent))
        print('%3d. %s' % (n, ' '.join(sent)))
    return responses

In [39]:
responses = generateSentences()

Generating the sentences for demo grammar:

  S -> NP
  NP -> MU KH
  MU -> 'مكان' 'ولادة' NS | 'طول' NS | 'نادي' NS | 'مركز' NS | 'تاريخ' 'ميلاد' NS
  NS -> 'اللاعب' 'فلان' 'الفلاني'
  KH ->  D
  D -> 'هو' 'تموضع'

  1. مكان ولادة اللاعب فلان الفلاني هو تموضع
  2. طول اللاعب فلان الفلاني هو تموضع
  3. نادي اللاعب فلان الفلاني هو تموضع
  4. مركز اللاعب فلان الفلاني هو تموضع
  5. تاريخ ميلاد اللاعب فلان الفلاني هو تموضع


In [40]:
responsesDict = {
    'region' : responses[0],
    'height' : responses[1],
    'club' : responses[2],
    'position' : responses[3],
    'DateOfBirth' : responses[4]
}

In [41]:
for i in range(len(intent)):
    s = responsesDict[intent[i][0]]
    s = s.replace("تموضع",ans[i][0])
    s = s.replace("فلان الفلاني",name)
    print(s)


نادي اللاعب إبراهيم عالمة  هو الوحدة السوري
مركز اللاعب إبراهيم عالمة  هو حارس مرمى
