In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import  confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

# X-Vectore 

In [8]:
def report_classifer(y_test,predicts,target_names):
    print("-----------------------------")
    print("Report Classifier")
    cm=confusion_matrix(y_test,predicts,labels=target_names)
    print(cm)
    print(classification_report(y_test, predicts, target_names=target_names))
    print("Report Classifier")
    print("-----------------------------")

In [9]:
def read_data_set(csv_vectore,number_feature=513):
    vdf=pd.read_csv(csv_vectore,header=None)
    columns=[ "f{}".format(i) for i in range(1,number_feature)]
    columns.append('class')
    vdf.columns=columns
    features=vdf.iloc[:,0:-1]
    dialects=vdf.iloc[:,-1]
    return features,dialects

In [10]:
features,dialects=read_data_set('kurdish_dialect_vectors/final_kurdish_dialect_xvectors.csv')



In [11]:

print(features.shape)
print(dialects.shape)

(16384, 512)
(16384,)


In [12]:
from sklearn.svm import SVC

#clf = svm.SVC(kernel='linear', C=1, random_state=42)

# def SVM_classifier(features,label):
#     model=SVC(gamma='auto',random_state=42)
#     model.fit(features,label)
#     return model

In [34]:
metrics=['precision_macro', 'recall_macro','f1_macro','accuracy']


In [44]:
clf = SVC(gamma='auto',random_state=42)
scores_csv = cross_validate(clf, features, dialects, cv=5,verbose=1,scoring=metrics)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.9min finished


In [45]:
scores_csv

{'fit_time': array([21.72644949, 27.56094241, 25.33736396, 27.90252423, 26.96728992]),
 'score_time': array([8.25180578, 9.53459048, 9.04793429, 9.07903266, 9.20807791]),
 'test_precision_macro': array([0.75629476, 0.92402513, 0.84106935, 0.91252354, 0.92649355]),
 'test_recall_macro': array([0.77174641, 0.88604648, 0.79784286, 0.90938627, 0.92513816]),
 'test_f1_macro': array([0.75039506, 0.89693276, 0.80596344, 0.90883399, 0.92552472]),
 'test_accuracy': array([0.7549588 , 0.91211474, 0.83307904, 0.90357034, 0.92246642])}

In [46]:
print("Precision on CSV: ", round(scores_csv["test_precision_macro"].mean(),2))
print("Recall on CSV : ", round(scores_csv["test_recall_macro"].mean(),2))
print("F1_score  on CSV: ", round(scores_csv["test_f1_macro"].mean(),2))
print("Acuracy  on CSV: ", round(scores_csv["test_accuracy"].mean(),2))

Precision on CSV:  0.87
Recall on CSV :  0.86
F1_score  on CSV:  0.86
Acuracy  on CSV:  0.87


In [48]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_csv["test_precision_macro"].mean(), scores_csv["test_precision_macro"].std()))


0.87 accuracy with a standard deviation of 0.07


# Evaluation On Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier

In [49]:
dt = DecisionTreeClassifier(criterion='entropy',random_state=42)
scores_dt = cross_validate(dt, features, dialects, cv=5,verbose=1,scoring=metrics)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.6min finished


In [52]:
scores_dt

{'fit_time': array([19.04834366, 18.94447446, 19.00083542, 19.93650889, 18.37455583]),
 'score_time': array([0.04562736, 0.04521155, 0.04608893, 0.04596949, 0.04543066]),
 'test_precision_macro': array([0.55347717, 0.63650537, 0.63393178, 0.70664886, 0.70429903]),
 'test_recall_macro': array([0.56215631, 0.61529657, 0.6118164 , 0.69584829, 0.70414667]),
 'test_f1_macro': array([0.55395708, 0.61896913, 0.61715293, 0.69558588, 0.70361734]),
 'test_accuracy': array([0.5627098 , 0.64418676, 0.6396094 , 0.69209643, 0.7014652 ])}

In [53]:
print("Precision on DT: ", round(scores_dt["test_precision_macro"].mean(),2))
print("Recall on DT: ", round(scores_dt["test_recall_macro"].mean(),2))
print("F1_score on DT: ", round(scores_dt["test_f1_macro"].mean(),2))
print("Acuracy on DT: ", round(scores_dt["test_accuracy"].mean(),2))

Precision on DT:  0.65
Recall on DT:  0.64
F1_score on DT:  0.64
Acuracy on DT:  0.65


0.65

------------------------------

In [51]:
k_fold_split=5
labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=k_fold_split, random_state=42, shuffle=True)



In [45]:


model=SVC(gamma='auto',random_state=42)

fold_no = 0
results={}
scores=[0]*k_fold_split 
for train_index, test_index in skf.split(features, dialects):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = dialects.iloc[train_index], dialects.iloc[test_index]
    
    print("Fold: ",str(fold_no))
    
    model.fit(X_train,y_train)
    predicts = model.predict(X_test)
    
    scores[fold_no]= accuracy_score(y_test,predicts) 
    print('Accuracy: ',scores[fold_no])
    report_classifer(y_test,predicts,labels)
    
    fold_no = fold_no + 1

Fold:  0
Accuracy:  0.9893134715025906
-----------------------------
Report Classifier
[[335   0   0   0   0]
 [  1 692   0   0   0]
 [  0   0 716   0   0]
 [  0   0   1 664   7]
 [  0   2   6  16 648]]
              precision    recall  f1-score   support

     hawrami       1.00      1.00      1.00       335
     kalhori       1.00      1.00      1.00       693
      zazaki       0.99      0.96      0.98       672
      sorani       0.98      0.99      0.98       672
    kurmanji       0.99      1.00      1.00       716

    accuracy                           0.99      3088
   macro avg       0.99      0.99      0.99      3088
weighted avg       0.99      0.99      0.99      3088

Report Classifier
-----------------------------
Fold:  1
Accuracy:  0.9896373056994818
-----------------------------
Report Classifier
[[330   0   0   1   4]
 [  0 693   0   0   0]
 [  0   0 716   0   0]
 [  0   0   0 660  12]
 [  0   1   3  11 657]]
              precision    recall  f1-score   support

  

In [46]:
import numpy as np
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.array(scores).mean(), np.array(scores).std()))

0.99 accuracy with a standard deviation of 0.00


In [48]:
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

# Decicio Tree Classifier

In [49]:
from sklearn.tree import DecisionTreeClassifier

In [53]:

clf = DecisionTreeClassifier(criterion='entropy',random_state=42)
scores = cross_val_score(clf, features, dialects, cv=5)

print("%0.2f accuracy with a standard deviation of %0.2f" % (np.array(scores).mean(), np.array(scores).std()))

0.66 accuracy with a standard deviation of 0.03


In [54]:
model = DecisionTreeClassifier(criterion='entropy',random_state=42)

fold_no = 0
results={}
scores=[0]*k_fold_split 
for train_index, test_index in skf.split(features, dialects):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = dialects.iloc[train_index], dialects.iloc[test_index]
    
    print("Fold: ",str(fold_no))
    
    model.fit(X_train,y_train)
    predicts = model.predict(X_test)
    
    scores[fold_no]= accuracy_score(y_test,predicts) 
    print('Accuracy: ',scores[fold_no])
    report_classifer(y_test,predicts,labels)
    
    fold_no = fold_no + 1

Fold:  0
Accuracy:  0.8539507772020726
-----------------------------
Report Classifier
[[288  14  12  11  10]
 [  6 630  26  12  19]
 [ 11  18 646  16  25]
 [ 11  20  16 548  77]
 [ 10  25  35  77 525]]
              precision    recall  f1-score   support

     hawrami       0.88      0.86      0.87       335
     kalhori       0.89      0.91      0.90       693
      zazaki       0.80      0.78      0.79       672
      sorani       0.83      0.82      0.82       672
    kurmanji       0.88      0.90      0.89       716

    accuracy                           0.85      3088
   macro avg       0.86      0.85      0.85      3088
weighted avg       0.85      0.85      0.85      3088

Report Classifier
-----------------------------
Fold:  1
Accuracy:  0.8448834196891192
-----------------------------
Report Classifier
[[291  13   7   5  19]
 [ 11 616  24  26  16]
 [  7  30 637  17  25]
 [  7  21  13 543  88]
 [ 11  25  32  82 522]]
              precision    recall  f1-score   support

  

In [55]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.array(scores).mean(), np.array(scores).std()))

0.84 accuracy with a standard deviation of 0.01


-------------------

# The following result is not to the final data

In [None]:
model_xvectore_1=SVM_classifier(X_train,y_train)


acc=model_xvectore_1.score(X_test,y_test)
print("score: ",acc)
predicts=model_xvectore_1.predict(X_test)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
labels=[1, 2, 3,4,5]
cm=confusion_matrix(y_test,predicts,labels=labels)
print(cm)
report_classifer(y_test,predicts,labels)

In [31]:
model_xvectore_1

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

--------------------------

In [17]:
features_xvectore_test_set,dialects_xvectore_test_set=read_data_set("data/test_set_kurdish_dialect_xvector.csv")

In [18]:
features.shape

(1239, 512)

In [19]:

predicts=model_xvectore_1.predict(features_xvectore_test_set)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
#labels=[1, 2, 3,4,5]
# cm=confusion_matrix(y_test,predicts,labels=labels)
# print(cm)
report_classifer(dialects_xvectore_test_set,predicts,labels)

-----------------------------
Report Classifier
[[  6  10  27   6 176]
 [  6  95   8   8 122]
 [  4   5 195   2  64]
 [  9  37  15  51 138]
 [  2  55  56  17 125]]
              precision    recall  f1-score   support

     hawrami       0.22      0.03      0.05       225
     kalhori       0.47      0.40      0.43       239
      zazaki       0.20      0.49      0.28       255
      sorani       0.61      0.20      0.31       250
    kurmanji       0.65      0.72      0.68       270

    accuracy                           0.38      1239
   macro avg       0.43      0.37      0.35      1239
weighted avg       0.44      0.38      0.36      1239

Report Classifier
-----------------------------


In [51]:
import pickle

def save_model(model,model_name):
    with open(model_name,"wb") as f:
        f.write(pickle.dumps( model ))

In [63]:
save_model(model_xvectore_1,"kdi_svm_xvector_trained_on_only_train_set.pkl")

In [22]:
#train model on test set only

In [27]:
features2,dialects2=read_data_set("data/test_set_kurdish_dialect_xvector.csv")
X_train2, X_test2, y_train2, y_test2 = train_test_split(features2,dialects2,test_size=0.2,shuffle=True,random_state=12,stratify=dialects2)


In [28]:
features1.shape

(1239, 512)

In [29]:
model_xvectore_2=SVM_classifier(X_train2,y_train2)


acc=model_xvectore_2.score(X_test2,y_test2)
print("score: ",acc)
predicts=model_xvectore_2.predict(X_test2)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
#labels=[1, 2, 3,4,5]
# cm=confusion_matrix(y_test,predicts,labels=labels)
# print(cm)
report_classifer(y_test2,predicts,labels)

score:  0.9637096774193549
-----------------------------
Report Classifier
[[45  0  0  0  0]
 [ 0 44  1  2  1]
 [ 0  0 54  0  0]
 [ 1  0  0 49  0]
 [ 0  0  0  4 47]]
              precision    recall  f1-score   support

     hawrami       0.98      1.00      0.99        45
     kalhori       1.00      0.92      0.96        48
      zazaki       0.98      0.92      0.95        51
      sorani       0.89      0.98      0.93        50
    kurmanji       0.98      1.00      0.99        54

    accuracy                           0.96       248
   macro avg       0.97      0.96      0.96       248
weighted avg       0.97      0.96      0.96       248

Report Classifier
-----------------------------


In [30]:
model_xvectore_2

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [61]:
save_model(model_xvectore_2,"kdi_svm_xvector_trained_on_test_set.pkl")

--------------------------

In [56]:
#tran on combined train and test set

In [33]:
import numpy as np

In [39]:
features.shape

(21125, 512)

In [42]:
features_xvectore_test_set.shape

(1239, 512)

In [40]:
concated_features=np.concatenate ((features,features_xvectore_test_set),axis=0)

In [43]:
concated_dialects=np.concatenate ((dialects,dialects_xvectore_test_set),axis=0)

In [44]:
concated_features.shape

(22364, 512)

In [45]:
concated_dialects.shape

(22364,)

In [47]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(concated_features,concated_dialects,test_size=0.2,shuffle=True,random_state=12,stratify=concated_dialects)


In [57]:
model_xvectore_combined=SVM_classifier(X_train3,y_train3)


acc=model_xvectore_combined.score(X_test3,y_test3)
print("score: ",acc)
predicts=model_xvectore_combined.predict(X_test3)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
#labels=[1, 2, 3,4,5]
# cm=confusion_matrix(y_test,predicts,labels=labels)
# print(cm)
report_classifer(y_test3,predicts,labels)

score:  0.9827856025039123
-----------------------------
Report Classifier
[[ 381    3    1    0    2]
 [   0  989    1    3    2]
 [   0    0  998    0    5]
 [   4    3    0  851   35]
 [   1    1    8    8 1177]]
              precision    recall  f1-score   support

     hawrami       0.99      0.98      0.99       387
     kalhori       0.99      0.99      0.99       995
      zazaki       0.96      0.98      0.97      1195
      sorani       0.99      0.95      0.97       893
    kurmanji       0.99      1.00      0.99      1003

    accuracy                           0.98      4473
   macro avg       0.98      0.98      0.98      4473
weighted avg       0.98      0.98      0.98      4473

Report Classifier
-----------------------------


In [58]:
model_xvectore_combined

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [64]:
save_model(model_xvectore_combined,"kdi_svm_xvector_trained_on_combinded_dataset.pkl")