In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import  confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

# i-Vector

In [2]:
def report_classifer(y_test,predicts,target_names):
    print("-----------------------------")
    print("Report Classifier")
    cm=confusion_matrix(y_test,predicts,labels=target_names)
    print(cm)
    print(classification_report(y_test, predicts, target_names=target_names))
    print("Report Classifier")
    print("-----------------------------")

In [3]:
def read_data_set(csv_vectore,number_feature=401):
    vdf=pd.read_csv(csv_vectore,header=None)
    columns=[ "f{}".format(i) for i in range(1,number_feature)]
    columns.append('class')
    vdf.columns=columns
    features=vdf.iloc[:,0:-1]
    dialects=vdf.iloc[:,-1]
    return features,dialects

In [4]:
features,dialects=read_data_set("kurdish_dialect_vectors/i-vectors.csv")


In [5]:
metrics=['precision_macro', 'recall_macro','f1_macro','accuracy']

# Evaluation on SVM

In [7]:
from sklearn.svm import SVC

# def SVM_classifier(features,label):
#     model=SVC(gamma='auto')
#     model.fit(features,label)
#     return model

In [8]:
metrics=['precision_macro', 'recall_macro','f1_macro','accuracy']


In [9]:
clf = SVC(gamma='auto',random_state=42)
scores_csv = cross_validate(clf, features, dialects, cv=5,verbose=1,scoring=metrics)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.0min finished


In [10]:
scores_csv

{'fit_time': array([78.73017287, 92.51534653, 90.58196449, 93.01711178, 93.15384197]),
 'score_time': array([17.08285689, 18.6905272 , 17.77322221, 18.90628147, 18.34829855]),
 'test_precision_macro': array([0.75915324, 0.88422508, 0.8518159 , 0.93785604, 0.93075817]),
 'test_recall_macro': array([0.68435873, 0.81753112, 0.78314302, 0.93001267, 0.92212733]),
 'test_f1_macro': array([0.68672608, 0.82591334, 0.79534975, 0.93355808, 0.92595182]),
 'test_accuracy': array([0.68355203, 0.86176381, 0.81660055, 0.9337809 , 0.92612943])}

In [11]:
print("Precision on CSV: ", round(scores_csv["test_precision_macro"].mean(),2))
print("Recall on CSV : ", round(scores_csv["test_recall_macro"].mean(),2))
print("F1_score  on CSV: ", round(scores_csv["test_f1_macro"].mean(),2))
print("Acuracy  on CSV: ", round(scores_csv["test_accuracy"].mean(),2))

Precision on CSV:  0.87
Recall on CSV :  0.83
F1_score  on CSV:  0.83
Acuracy  on CSV:  0.84


In [12]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_csv["test_precision_macro"].mean(), scores_csv["test_precision_macro"].std()))


0.87 accuracy with a standard deviation of 0.06


# Evaluation On Decision Tree


In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
dt = DecisionTreeClassifier(criterion='entropy',random_state=42)
scores_dt = cross_validate(dt, features, dialects, cv=5,verbose=1,scoring=metrics)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [17]:
scores_dt

{'fit_time': array([14.48237181, 14.27434635, 15.73906684, 15.211339  , 15.19518948]),
 'score_time': array([0.04507136, 0.04196763, 0.04385638, 0.04385805, 0.0437808 ]),
 'test_precision_macro': array([0.55695788, 0.66442563, 0.60559676, 0.76176744, 0.76699534]),
 'test_recall_macro': array([0.55768373, 0.63350734, 0.59043466, 0.75761609, 0.75522425]),
 'test_f1_macro': array([0.55412858, 0.63716984, 0.59201302, 0.75800884, 0.75872793]),
 'test_accuracy': array([0.55660665, 0.66859933, 0.61641745, 0.75434849, 0.75030525])}

In [19]:
print("Precision on DT: ", round(scores_dt["test_precision_macro"].mean(),2))
print("Recall on DT: ", round(scores_dt["test_recall_macro"].mean(),2))
print("F1_score on DT: ", round(scores_dt["test_f1_macro"].mean(),2))
print("Acuracy on DT: ", round(scores_dt["test_accuracy"].mean(),2))

Precision on DT:  0.67
Recall on DT:  0.66
F1_score on DT:  0.66
Acuracy on DT:  0.67


-------------------

In [None]:
k_fold_split=5
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=k_fold_split, random_state=42, shuffle=True)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]

In [10]:
model=SVC(gamma='auto',random_state=42)

fold_no = 0
results={}
scores=[0]*k_fold_split 
for train_index, test_index in skf.split(features, dialects):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = dialects.iloc[train_index], dialects.iloc[test_index]
    
    print("Fold: ",str(fold_no))
    
    model.fit(X_train,y_train)
    predicts = model.predict(X_test)
    
    scores[fold_no]= accuracy_score(y_test,predicts) 
    print('Accuracy: ',scores[fold_no])
    report_classifer(y_test,predicts,labels)
    
    fold_no = fold_no + 1

Fold:  0
Accuracy:  0.9802461139896373
-----------------------------
Report Classifier
[[323   1   3   7   1]
 [  0 689   4   0   0]
 [  0   0 715   1   0]
 [  0   0   5 660   7]
 [  1   2  16  13 640]]
              precision    recall  f1-score   support

     hawrami       1.00      0.96      0.98       335
     kalhori       1.00      0.99      0.99       693
      zazaki       0.99      0.95      0.97       672
      sorani       0.97      0.98      0.98       672
    kurmanji       0.96      1.00      0.98       716

    accuracy                           0.98      3088
   macro avg       0.98      0.98      0.98      3088
weighted avg       0.98      0.98      0.98      3088

Report Classifier
-----------------------------
Fold:  1
Accuracy:  0.979598445595855
-----------------------------
Report Classifier
[[325   2   5   2   1]
 [  0 688   3   1   1]
 [  0   0 715   0   1]
 [  0   0   6 657   9]
 [  0   1  20  11 640]]
              precision    recall  f1-score   support

   

In [11]:
import numpy as np
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.array(scores).mean(), np.array(scores).std()))

0.98 accuracy with a standard deviation of 0.00


# Decicio Tree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
clf = DecisionTreeClassifier(criterion='entropy',random_state=42)
scores = cross_val_score(clf, features, dialects, cv=5)

print("%0.2f accuracy with a standard deviation of %0.2f" % (np.array(scores).mean(), np.array(scores).std()))

0.70 accuracy with a standard deviation of 0.03


In [14]:
model = DecisionTreeClassifier(criterion='entropy',random_state=42)

fold_no = 0
results={}
scores=[0]*k_fold_split 
for train_index, test_index in skf.split(features, dialects):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = dialects.iloc[train_index], dialects.iloc[test_index]
    
    print("Fold: ",str(fold_no))
    
    model.fit(X_train,y_train)
    predicts = model.predict(X_test)
    
    scores[fold_no]= accuracy_score(y_test,predicts) 
    print('Accuracy: ',scores[fold_no])
    report_classifer(y_test,predicts,labels)
    
    fold_no = fold_no + 1

Fold:  0
Accuracy:  0.8380829015544041
-----------------------------
Report Classifier
[[295  17   1   7  15]
 [ 10 625  22  27   9]
 [  5  15 650  27  19]
 [ 10  21  16 516 109]
 [ 20  14  28 108 502]]
              precision    recall  f1-score   support

     hawrami       0.87      0.88      0.87       335
     kalhori       0.90      0.90      0.90       693
      zazaki       0.77      0.75      0.76       672
      sorani       0.75      0.77      0.76       672
    kurmanji       0.91      0.91      0.91       716

    accuracy                           0.84      3088
   macro avg       0.84      0.84      0.84      3088
weighted avg       0.84      0.84      0.84      3088

Report Classifier
-----------------------------
Fold:  1
Accuracy:  0.8403497409326425
-----------------------------
Report Classifier
[[278  10  10  15  22]
 [ 13 627  14  20  19]
 [  6  19 653  17  21]
 [ 15  17  30 509 101]
 [ 13  11  27  93 528]]
              precision    recall  f1-score   support

  

In [15]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.array(scores).mean(), np.array(scores).std()))

0.85 accuracy with a standard deviation of 0.01


--------------------------------

# the following result is not to the final data

In [8]:
model_ivectore_1=SVM_classifier(X_train,y_train)


acc=model_ivectore_1.score(X_test,y_test)
print("score: ",acc)
predicts=model_ivectore_1.predict(X_test)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
#labels=[1, 2, 3,4,5]
# cm=confusion_matrix(y_test,predicts,labels=labels)
# print(cm)
report_classifer(y_test,predicts,labels)

score:  0.9666896393291983
-----------------------------
Report Classifier
[[ 333    0    3    1    6]
 [   0  976    2    1    6]
 [   0    5  953    0   21]
 [   0    2    5  801   39]
 [   2    9   25   18 1145]]
              precision    recall  f1-score   support

     hawrami       0.99      0.97      0.98       343
     kalhori       0.98      0.99      0.99       985
      zazaki       0.94      0.95      0.95      1199
      sorani       0.98      0.95      0.96       847
    kurmanji       0.96      0.97      0.97       979

    accuracy                           0.97      4353
   macro avg       0.97      0.97      0.97      4353
weighted avg       0.97      0.97      0.97      4353

Report Classifier
-----------------------------


In [10]:
model_ivectore_1

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

--------------------------

In [15]:
features_ivectore_test_set,dialects_ivectore_test_set=read_data_set("data/test_set_kurdish_dialects_ivectors.csv",401)

In [17]:
features_ivectore_test_set.shape

(1239, 400)

In [18]:

predicts=model_ivectore_1.predict(features_ivectore_test_set)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
#labels=[1, 2, 3,4,5]
# cm=confusion_matrix(y_test,predicts,labels=labels)
# print(cm)
report_classifer(dialects_ivectore_test_set,predicts,labels)

-----------------------------
Report Classifier
[[  4  20  50  46 105]
 [ 10  86  17  63  63]
 [  7  51 105  47  60]
 [  5  18  45  37 145]
 [  7  49  72  35  92]]
              precision    recall  f1-score   support

     hawrami       0.12      0.02      0.03       225
     kalhori       0.38      0.36      0.37       239
      zazaki       0.20      0.36      0.26       255
      sorani       0.16      0.15      0.15       250
    kurmanji       0.36      0.39      0.38       270

    accuracy                           0.26      1239
   macro avg       0.25      0.26      0.24      1239
weighted avg       0.25      0.26      0.24      1239

Report Classifier
-----------------------------


In [21]:
import pickle

def save_model(model,model_name):
    with open(model_name,"wb") as f:
        f.write(pickle.dumps( model ))

In [22]:
save_model(model_ivectore_1,"kdi_svm_ivector_trained_on_only_train_set.pkl")

In [22]:
#train model on test set only

In [29]:
features2,dialects2=read_data_set("data/test_set_kurdish_dialects_ivectors.csv")
X_train2, X_test2, y_train2, y_test2 = train_test_split(features2,dialects2,test_size=0.2,shuffle=True,random_state=12,stratify=dialects2)


In [30]:
features2.shape

(1239, 400)

In [31]:
model_ivectore_2=SVM_classifier(X_train2,y_train2)


acc=model_ivectore_2.score(X_test2,y_test2)
print("score: ",acc)
predicts=model_ivectore_2.predict(X_test2)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
#labels=[1, 2, 3,4,5]
# cm=confusion_matrix(y_test,predicts,labels=labels)
# print(cm)
report_classifer(y_test2,predicts,labels)

score:  0.6290322580645161
-----------------------------
Report Classifier
[[19  4 11  5  6]
 [ 3 27  5  9  4]
 [ 1  3 46  2  2]
 [ 6  4  8 30  2]
 [ 2  2  9  4 34]]
              precision    recall  f1-score   support

     hawrami       0.61      0.42      0.50        45
     kalhori       0.68      0.56      0.61        48
      zazaki       0.71      0.67      0.69        51
      sorani       0.60      0.60      0.60        50
    kurmanji       0.58      0.85      0.69        54

    accuracy                           0.63       248
   macro avg       0.64      0.62      0.62       248
weighted avg       0.64      0.63      0.62       248

Report Classifier
-----------------------------


In [32]:
model_ivectore_2

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [34]:
save_model(model_ivectore_2,"kdi_svm_ivector_trained_on_test_set.pkl")

--------------------------

In [56]:
#tran on combined train and test set

In [35]:
import numpy as np

In [40]:
features.shape

(21763, 400)

In [41]:
features_ivectore_test_set.shape

(1239, 400)

In [44]:
concated_features=np.concatenate ((features,features_ivectore_test_set),axis=0)

In [45]:
concated_dialects=np.concatenate ((dialects,dialects_ivectore_test_set),axis=0)

In [48]:
concated_features.shape

(23002, 400)

In [49]:
concated_dialects.shape

(23002,)

In [50]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(concated_features,concated_dialects,test_size=0.2,shuffle=True,random_state=12,stratify=concated_dialects)


In [51]:
model_ivectore_combined=SVM_classifier(X_train3,y_train3)


acc=model_ivectore_combined.score(X_test3,y_test3)
print("score: ",acc)
predicts=model_ivectore_combined.predict(X_test3)

labels=["hawrami", "kalhori", "zazaki","sorani","kurmanji"]
#labels=[1, 2, 3,4,5]
# cm=confusion_matrix(y_test,predicts,labels=labels)
# print(cm)
report_classifer(y_test3,predicts,labels)

score:  0.9363181916974571
-----------------------------
Report Classifier
[[ 332    2   22   10   22]
 [   0  997    6   16   14]
 [   1    9  992    9   22]
 [   1    7    9  824   56]
 [   2   22   38   25 1163]]
              precision    recall  f1-score   support

     hawrami       0.99      0.86      0.92       388
     kalhori       0.96      0.97      0.96      1033
      zazaki       0.91      0.93      0.92      1250
      sorani       0.93      0.92      0.93       897
    kurmanji       0.93      0.96      0.94      1033

    accuracy                           0.94      4601
   macro avg       0.94      0.93      0.93      4601
weighted avg       0.94      0.94      0.94      4601

Report Classifier
-----------------------------


In [53]:
model_ivectore_combined

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [55]:
save_model(model_ivectore_combined,"kdi_svm_ivector_trained_on_combinded_dataset.pkl")