In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import  confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

# i-Vector

In [2]:
def report_classifer(y_test,predicts,target_names):
    """Print a confusion matrix and a per-class report for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    predicts : array-like
        Predicted class labels, aligned element-wise with ``y_test``.
    target_names : list of str
        Class label values, in the order in which the matrix rows/columns and
        the report lines should appear. The same list is used both to select
        the labels and to name them in the report.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    banner = "-----------------------------"
    title = "Report Classifier"

    print(banner)
    print(title)
    # Matrix rows/columns follow the order given in ``target_names``.
    cm = confusion_matrix(y_test, predicts, labels=target_names)
    print(cm)
    # Pass ``labels`` explicitly as well: without it the report orders classes
    # by sorted label value and would attach ``target_names`` to the wrong rows
    # whenever the caller's order differs from the sorted order, disagreeing
    # with the confusion matrix printed just above.
    print(
        classification_report(
            y_test,
            predicts,
            labels=target_names,
            target_names=target_names,
        )
    )
    print(title)
    print(banner)

In [3]:
def read_data_set(csv_vectore,number_feature=401):
    """Load a header-less vector CSV and split it into features and labels.

    The file is read with ``header=None``; columns are then renamed to
    ``f1 .. f<k>`` followed by ``class``, and the last column is returned
    separately as the label series.

    Parameters
    ----------
    csv_vectore : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts.
    number_feature : int or None, default 401
        Total number of columns in the file, i.e. the number of feature
        columns plus one for the trailing class column (despite the name,
        the default of 401 means 400 features). Pass ``None`` to infer the
        width from the file itself.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        All columns but the last, and the last (``class``) column.

    Raises
    ------
    ValueError
        If the file does not have the expected number of columns.
    """
    vdf = pd.read_csv(csv_vectore, header=None)

    # Allow callers to skip the hard-coded width and trust the file.
    if number_feature is None:
        number_feature = vdf.shape[1]

    columns = ["f{}".format(i) for i in range(1, number_feature)]
    columns.append('class')

    # Fail with an explicit message instead of pandas' generic length-mismatch
    # error; the exception type (ValueError) is the same as before.
    if len(columns) != vdf.shape[1]:
        raise ValueError(
            "expected {} columns ({} features + 1 class), found {}".format(
                len(columns), len(columns) - 1, vdf.shape[1]
            )
        )

    vdf.columns = columns
    features = vdf.iloc[:, :-1]
    dialects = vdf.iloc[:, -1]
    return features, dialects

In [4]:
# Load the i-vector file. With read_data_set's default number_feature=401 the
# CSV is expected to hold 400 feature columns followed by one class column.
features,dialects=read_data_set("kurdish_dialect_vectors/i-vectors.csv")


In [5]:
# Scorer names passed to cross_validate; each one shows up in the returned
# dict under the key "test_<name>".
metrics=['precision_macro', 'recall_macro','f1_macro','accuracy']

## Evaluation on SVM

In [6]:
from sklearn.svm import SVC


In [7]:
# Support-vector classifier scored with 5-fold cross-validation on every
# scorer listed in `metrics`.
# NOTE(review): the recorded per-fold accuracies range from about 0.68 to 0.93;
# rows may be ordered in the file, so confirm that plain cv=5 folds are the
# intended evaluation protocol.
svc = SVC(
    gamma='auto',
    random_state=42,
)
scores_svc = cross_validate(
    estimator=svc,
    X=features,
    y=dialects,
    scoring=metrics,
    cv=5,
    verbose=1,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.2min finished


In [9]:
# Show the raw cross_validate result: per-fold fit/score times and test metrics.
scores_svc

{'fit_time': array([ 80.34038544,  93.49890471,  89.8349278 ,  95.4978776 ,
        100.26090145]),
 'score_time': array([17.8331244 , 17.90214086, 18.12675238, 18.59246302, 18.78577709]),
 'test_precision_macro': array([0.75915324, 0.88422508, 0.8518159 , 0.93785604, 0.93075817]),
 'test_recall_macro': array([0.68435873, 0.81753112, 0.78314302, 0.93001267, 0.92212733]),
 'test_f1_macro': array([0.68672608, 0.82591334, 0.79534975, 0.93355808, 0.92595182]),
 'test_accuracy': array([0.68355203, 0.86176381, 0.81660055, 0.9337809 , 0.92612943])}

In [10]:
# Mean of each metric over the cross-validation folds, rounded to 2 decimals.
# The labels name the classifier that was evaluated (SVC).
print("Precision on SVC: ", round(scores_svc["test_precision_macro"].mean(),2))
print("Recall on SVC : ", round(scores_svc["test_recall_macro"].mean(),2))
print("F1_score  on SVC: ", round(scores_svc["test_f1_macro"].mean(),2))
print("Accuracy  on SVC: ", round(scores_svc["test_accuracy"].mean(),2))

Precision on CSV:  0.87
Recall on CSV :  0.83
F1_score  on CSV:  0.83
Acuracy  on CSV:  0.84


## Evaluation on Decision Tree


In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Entropy-criterion decision tree, scored with the same 5-fold protocol and
# the same scorers (`metrics`) as the SVC so the two result sets line up.
dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
)
scores_dt = cross_validate(
    estimator=dt,
    X=features,
    y=dialects,
    scoring=metrics,
    cv=5,
    verbose=1,
)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [13]:
# Show the raw cross_validate result: per-fold fit/score times and test metrics.
scores_dt

{'fit_time': array([14.79413509, 15.07838273, 16.23881531, 15.71446395, 15.92835283]),
 'score_time': array([0.04606724, 0.04750133, 0.04614282, 0.04696321, 0.0469718 ]),
 'test_precision_macro': array([0.55695788, 0.66442563, 0.60559676, 0.76176744, 0.76699534]),
 'test_recall_macro': array([0.55768373, 0.63350734, 0.59043466, 0.75761609, 0.75522425]),
 'test_f1_macro': array([0.55412858, 0.63716984, 0.59201302, 0.75800884, 0.75872793]),
 'test_accuracy': array([0.55660665, 0.66859933, 0.61641745, 0.75434849, 0.75030525])}

In [14]:
# Mean of each metric over the cross-validation folds, rounded to 2 decimals.
print("Precision on DT: ", round(scores_dt["test_precision_macro"].mean(),2))
print("Recall on DT: ", round(scores_dt["test_recall_macro"].mean(),2))
print("F1_score on DT: ", round(scores_dt["test_f1_macro"].mean(),2))
print("Accuracy on DT: ", round(scores_dt["test_accuracy"].mean(),2))

Precision on DT:  0.67
Recall on DT:  0.66
F1_score on DT:  0.66
Acuracy on DT:  0.67


-------------------

# Confusion Matrix on SVM

In [15]:
from sklearn.model_selection import cross_val_predict

In [16]:
# Class labels, in the order used for the confusion-matrix rows/columns and
# the report lines produced by report_classifer.
labels=["hawrami", "kalhori", "kurmanji","sorani","zazaki"]

In [17]:
# Fresh SVC with the same settings as the scored one; collect one prediction
# per sample from 5-fold cross-validation for the confusion matrix.
svc = SVC(
    gamma='auto',
    random_state=42,
)
y_pred = cross_val_predict(
    estimator=svc,
    X=features,
    y=dialects,
    cv=5,
    verbose=1,
)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.8min finished


In [18]:
# Confusion matrix and per-class report for the predictions currently held in
# y_pred (the SVC ones when the notebook is run top to bottom).
report_classifer(dialects, y_pred,labels)

-----------------------------
Report Classifier
[[1292   85  144  118  237]
 [  29 3254   67   66  264]
 [  22   25 2814  320  422]
 [  27    9  322 2783  245]
 [  10   11  114   13 3691]]
              precision    recall  f1-score   support

     hawrami       0.94      0.69      0.79      1876
     kalhori       0.96      0.88      0.92      3680
    kurmanji       0.81      0.78      0.80      3603
      sorani       0.84      0.82      0.83      3386
      zazaki       0.76      0.96      0.85      3839

    accuracy                           0.84     16384
   macro avg       0.86      0.83      0.84     16384
weighted avg       0.85      0.84      0.84     16384

Report Classifier
-----------------------------


---------------------

# Confusion Matrix on DT

In [19]:
# Fresh decision tree with the same settings as the scored one; collect one
# prediction per sample from 5-fold cross-validation.
# Note: this rebinds y_pred, replacing the SVC predictions computed earlier,
# so re-running the SVC report cell after this one would report on the tree.
dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
)
y_pred = cross_val_predict(
    estimator=dt,
    X=features,
    y=dialects,
    cv=5,
    verbose=1,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


In [20]:
# Confusion matrix and per-class report for the predictions currently held in
# y_pred (the decision-tree ones when the notebook is run top to bottom).
report_classifer(dialects, y_pred,labels)

-----------------------------
Report Classifier
[[1094  129  235  187  231]
 [ 169 2594  274  265  378]
 [ 120  156 2175  801  351]
 [ 114  173  752 2119  228]
 [ 106  233  322  195 2983]]
              precision    recall  f1-score   support

     hawrami       0.68      0.58      0.63      1876
     kalhori       0.79      0.70      0.74      3680
    kurmanji       0.58      0.60      0.59      3603
      sorani       0.59      0.63      0.61      3386
      zazaki       0.72      0.78      0.74      3839

    accuracy                           0.67     16384
   macro avg       0.67      0.66      0.66     16384
weighted avg       0.67      0.67      0.67     16384

Report Classifier
-----------------------------
