# Import Lib

In [1]:
from pathlib import Path
from datetime import datetime as dt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC
import pickle
from sklearn.metrics import  accuracy_score
from sklearn import metrics
import sys

# Load Data

In [2]:
path="ANERCorp.xlsx"

In [3]:
xlsx = pd.ExcelFile(path)

In [4]:
df=pd.read_excel(xlsx, header=None)

In [5]:
df

Unnamed: 0,0,1,2
0,فرانكفورت,B-LOC,
1,(د,O,
2,ب,O,
3,أ),O,
4,أعلن,O,
...,...,...,...
150281,الشخصية,O,
150282,للوليد,B-PERS,
150283,بن,I-PERS,
150284,طلال,I-PERS,


In [6]:
df = df.drop(2, 1)

In [7]:
df = df.rename(columns={0: 'text', 1: 'label'})

In [8]:
df

Unnamed: 0,text,label
0,فرانكفورت,B-LOC
1,(د,O
2,ب,O
3,أ),O
4,أعلن,O
...,...,...
150281,الشخصية,O
150282,للوليد,B-PERS
150283,بن,I-PERS
150284,طلال,I-PERS


In [9]:
train, test = train_test_split(df, test_size = 0.2)
train_arr = []
test_arr = []
train_lbl = []
test_lbl = []

In [10]:
train_arr=train['text'].astype(str)
train_lbl=train['label'].astype(str)
test_arr=test['text'].astype(str)
test_lbl=test['label'].astype(str)

# Data Preprocessing

In [11]:
vectorizer = CountVectorizer()
vectorizer.fit(train_arr)
train_mat = vectorizer.transform(train_arr)

In [12]:
tfidf = TfidfTransformer()
tfidf.fit(train_mat)
train_tfmat = tfidf.transform(train_mat)


In [13]:
test_mat = vectorizer.transform(test_arr)
test_tfmat = tfidf.transform(test_mat)


In [14]:
del df
del test_arr
del train_arr

# Data Modeling

In [17]:
train_tfmat

<120228x28249 sparse matrix of type '<class 'numpy.float64'>'
	with 107836 stored elements in Compressed Sparse Row format>

In [15]:
lsvm=LinearSVC()
lsvm.fit(train_tfmat,train_lbl)


LinearSVC()

In [16]:
y_pred_lsvm=lsvm.predict(test_tfmat)

## Save Model

In [49]:
filename = 'ner_model.sav'
pickle.dump((lsvm,vectorizer,tfidf), open(filename, 'wb'))

## Load Model

In [53]:
lsvm,vectorizer,tfidf = pickle.load(open(filename, 'rb'))

# Evaluation Score

In [19]:
print("accuracy:", metrics.accuracy_score(test_lbl, y_pred_lsvm))

accuracy: 0.9365227227360436


# Testing 

In [20]:
test=['ألمانيا']
test_str = vectorizer.transform(test)
test_tfstr = tfidf.transform(test_str)
test_tfstr.shape
lsvm.predict(test_tfstr.toarray())[0]

'B-LOC'

In [26]:
phrase="ذهب احمد الي الفصل"
arr=phrase.split()

In [27]:
arr

['ذهب', 'احمد', 'الي', 'الفصل']

In [54]:
y=[]
token=[]
for x in arr:
    x=[x]
    test_str = vectorizer.transform(x)
    test_tfstr = tfidf.transform(test_str)
    test_tfstr.shape
    token.append(x)
    y.append(lsvm.predict(test_tfstr.toarray())[0])

In [55]:
df=pd.DataFrame(list(zip(token,y)),columns=['token','entity_type'])

In [56]:
df

Unnamed: 0,token,entity_type
0,[ذهب],B-MISC
1,[احمد],B-PERS
2,[الي],O
3,[الفصل],O


In [47]:
res = {}
for key in range(len(token)):
    res[token[key][0]] = y[key]
        

In [48]:
res

{'ذهب': 'B-MISC', 'احمد': 'B-PERS', 'الي': 'O', 'الفصل': 'O'}