## Effect of multilingual data on the model

This notebook contains the code necessary for evaluating the performance of the model trained only on the English dataset and the model trained on the Multilingual dataset. The performance difference between the two models is statistically evaluated to determine whether it is significant.

In [None]:
import os
import tensorflow as tf
import shutil
import numpy as np
from subprocess import call
import sys
sys.path.insert(0, './src/')
from src import util
import IPython.display as ipd
import json
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error as mae
import seaborn as sns
import matplotlib.pyplot as plt
import statistics
import random
import csv
import scipy.stats as stats

## important methods

In [None]:
def stats_calc(y_true, y_pred):
    """Summarise the prediction error of one model on one test set.

    Args:
        y_true: NumPy array of ground-truth values (``.astype`` is called on
            it, so a plain list is not accepted).
        y_pred: NumPy array of predicted values, aligned element-wise with
            ``y_true``.

    Returns:
        A 4-tuple ``(mae_, stdev, rel_mae, rel_err)``:
        the mean absolute error, the standard deviation of the absolute
        errors (``np.std`` default, i.e. ``ddof=0``), the mean of
        ``|y_pred - y_true| / (y_true + 1e-5)``, and the array of signed
        per-sample relative errors ``(y_pred - y_true) / (y_true + 1e-5)``.
    """
    mae_ = mae(y_true, y_pred)

    # Use the builtin ``float``: the ``np.float`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    abs_err = np.abs(y_true.astype(float) - y_pred.astype(float))
    stdev = np.std(abs_err)

    # The small offset keeps the division defined when a true value is 0.
    denominator = y_true + 0.00001
    rel_mae = np.mean(abs(y_pred - y_true) / denominator)
    rel_err = (y_pred - y_true) / denominator
    return mae_, stdev, rel_mae, rel_err


In [None]:
def plot_data_dist(data):
    """Show a density histogram of ``data`` with a fitted normal curve on top.

    The normal distribution is fitted to ``data`` with ``stats.norm.fit`` and
    its pdf is drawn over the range mean +/- 3 standard deviations.
    """
    mean, std = stats.norm.fit(data)

    # Density-normalised histogram so it shares a scale with the pdf.
    plt.hist(data, density=True)

    grid = np.linspace(mean - 3 * std, mean + 3 * std, num=100)
    fitted_pdf = stats.norm.pdf(grid, mean, std)
    plt.plot(grid, fitted_pdf)
    plt.show()

In [None]:
def ttest(x,y):
    """Run a paired t-test on ``x`` and ``y`` and report it as one-sided.

    Returns:
        ``(|t|, p / 2)`` where ``t`` and ``p`` are the statistic and the
        two-sided p-value of ``scipy.stats.ttest_rel(x, y)``.

    NOTE(review): the sign of ``t`` is discarded, so the halved p-value is
    only a valid one-sided p-value when the observed difference points in the
    hypothesised direction -- check the sign of the mean difference separately.
    """
    result = stats.ttest_rel(x, y)
    one_sided_p = result.pvalue / 2  # one sided t-test
    return np.abs(result.statistic), one_sided_p

## Baseline testset

In [None]:
# Load the prediction file that the next cell reads as the baseline model's
# English test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/predictions.json') as json_file:
    data = json.loads(json_file.read())


In [None]:
baseline_english_y_true = data['y_true']
baseline_english_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against PREDICTED labels (columns).
# Passing y_true for both arguments would always yield a perfect diagonal and
# hide every error of the model. Classes are fixed to 0..10.
cm_train = confusion_matrix(
    baseline_english_y_true,
    baseline_english_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix for Baseline Model on the English testset')

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

plt.savefig("confusion_baseline.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
baseline_english_y_true = np.asarray(baseline_english_y_true)
baseline_english_y_pred = np.asarray(baseline_english_y_pred)

(
    baseline_english_mae,
    baseline_english_stdev,
    baseline_english_relative_mae,
    baseline_english_rel_err,
) = stats_calc(baseline_english_y_true, baseline_english_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(baseline_english_mae),
    "MAE St.Dev.: {}".format(baseline_english_stdev),
    "Relative MAE: {}".format(baseline_english_relative_mae),
    sep="\n",
)

### Multilingual dataset

In [None]:
# Load the prediction file that the next cell reads as the multilingual
# model's English test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/MLS/predictions_experiment1.json') as json_file:
    data = json.loads(json_file.read())


In [None]:
multilingual_english_y_true = data['y_true']
multilingual_english_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against predicted labels (columns),
# with the classes fixed to 0..10.
cm_train = confusion_matrix(
    multilingual_english_y_true,
    multilingual_english_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix for MultiLingual Model on the English testset',
)

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

plt.savefig("confusion_multilingualmodel_english_dataset.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
multilingual_english_y_true = np.asarray(multilingual_english_y_true)
multilingual_english_y_pred = np.asarray(multilingual_english_y_pred)

(
    multilingual_english_mae,
    multilingual_english_stdev,
    multilingual_english_relative_mae,
    multilingual_english_rel_error,
) = stats_calc(multilingual_english_y_true, multilingual_english_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(multilingual_english_mae),
    "MAE St.Dev.: {}".format(multilingual_english_stdev),
    "Relative MAE: {}".format(multilingual_english_relative_mae),
    sep="\n",
)

## Statistics

In [None]:
# Visual normality check before the t-test: for each model, plot a density
# histogram of the signed relative errors with a fitted normal curve.
print("Distribution of relative errors for the Baseline model on the English Testset")
plot_data_dist(baseline_english_rel_err)
print("Distribution of relative errors for the Multilingual model on the English Testset")

plot_data_dist(multilingual_english_rel_error)


In [None]:
# Paired t-test on the per-sample relative errors of the two models; ttest
# returns |t| and the two-sided p-value halved.
# NOTE(review): pairing assumes both prediction files list the same samples
# in the same order -- confirm.
tval,pval = ttest(baseline_english_rel_err, multilingual_english_rel_error)
print(f"The p val : {pval} and t val {tval} for one sided (paired) t-test on the English dataset")

## Multilingual testset

### baseline

In [None]:
# Load the prediction file that the next cell reads as the baseline model's
# multilingual test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/MLS/baseline_predictions_experiment1_multilingualdataset.json') as json_file:
    data = json.loads(json_file.read())


In [None]:
baseline_multilingual_y_true = data['y_true']
baseline_multilingual_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against predicted labels (columns),
# with the classes fixed to 0..10.
cm_train = confusion_matrix(
    baseline_multilingual_y_true,
    baseline_multilingual_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix for Baseline Model on the Multilingual testset',
)

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

plt.savefig("confusion_baselinemodel_multilingual_dataset.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
baseline_multilingual_y_true = np.asarray(baseline_multilingual_y_true)
baseline_multilingual_y_pred = np.asarray(baseline_multilingual_y_pred)

(
    baseline_multilingual_mae,
    baseline_multilingual_stdev,
    baseline_multilingual_relative_mae,
    baseline_multilingual_rel_error,
) = stats_calc(baseline_multilingual_y_true, baseline_multilingual_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(baseline_multilingual_mae),
    "MAE St.Dev.: {}".format(baseline_multilingual_stdev),
    "Relative MAE: {}".format(baseline_multilingual_relative_mae),
    sep="\n",
)

### multilingual model

In [None]:
# Load the prediction file that the next cell reads as the multilingual
# model's multilingual test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/MLS/multilingual_predictions_experiment1_multilingualdataset.json') as json_file:
    data = json.loads(json_file.read())


In [None]:
multilingual_multilingual_y_true = data['y_true']
multilingual_multilingual_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against predicted labels (columns),
# with the classes fixed to 0..10.
cm_train = confusion_matrix(
    multilingual_multilingual_y_true,
    multilingual_multilingual_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix for MultiLingual Model on the Multilingual testset',
)

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

plt.savefig("confusion_multilingualmodel_multilingual_dataset.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
multilingual_multilingual_y_true = np.asarray(multilingual_multilingual_y_true)
multilingual_multilingual_y_pred = np.asarray(multilingual_multilingual_y_pred)

(
    multilingual_multilingual_mae,
    multilingual_multilingual_stdev,
    multilingual_multilingual_relative_mae,
    multilingual_multilingual_rel_error,
) = stats_calc(multilingual_multilingual_y_true, multilingual_multilingual_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(multilingual_multilingual_mae),
    "MAE St.Dev.: {}".format(multilingual_multilingual_stdev),
    "Relative MAE: {}".format(multilingual_multilingual_relative_mae),
    sep="\n",
)

## Statistics

In [None]:
# Visual normality check before the t-test: for each model, plot a density
# histogram of the signed relative errors with a fitted normal curve.
print("Distribution of relative errors for the Baseline model on the Multilingual Testset")
plot_data_dist(baseline_multilingual_rel_error)
print("Distribution of relative errors for the Multilingual model on the Multilingual Testset")

plot_data_dist(multilingual_multilingual_rel_error)


In [None]:
# Paired t-test on the per-sample relative errors of the two models; ttest
# returns |t| and the two-sided p-value halved.
# NOTE(review): pairing assumes both prediction files list the same samples
# in the same order -- confirm.
multilingual_tval,multilingual_pval = ttest(baseline_multilingual_rel_error, multilingual_multilingual_rel_error)
print(f"The p val : {multilingual_pval} and t val : {multilingual_tval} for one sided (paired) t-test on the Multilingual dataset")

## Unseen Data

In this section the performance of the baseline and multilingual model will be evaluated on the Spanish and Polish datasets.

### baseline model

In [None]:
# Load the prediction file that the next cell reads as the baseline model's
# Spanish test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/MLS/baseline_predictions_experiment2_spanishdataset.json') as json_file:
    data = json.loads(json_file.read())

In [None]:
baseline_spanish_y_true = data['y_true']
baseline_spanish_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against predicted labels (columns),
# with the classes fixed to 0..10.
cm_train = confusion_matrix(
    baseline_spanish_y_true,
    baseline_spanish_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix for Baseline Model on the Spanish testset')

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

# Dedicated file name for this model/dataset pair so the figure does not
# overwrite the multilingual-model/multilingual-dataset confusion matrix.
plt.savefig("confusion_baselinemodel_spanish_dataset.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
baseline_spanish_y_true = np.asarray(baseline_spanish_y_true)
baseline_spanish_y_pred = np.asarray(baseline_spanish_y_pred)

(
    baseline_spanish_mae,
    baseline_spanish_stdev,
    baseline_spanish_relative_mae,
    baseline_spanish_rel_error,
) = stats_calc(baseline_spanish_y_true, baseline_spanish_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(baseline_spanish_mae),
    "MAE St.Dev.: {}".format(baseline_spanish_stdev),
    "Relative MAE: {}".format(baseline_spanish_relative_mae),
    sep="\n",
)

### multilingual model

In [None]:
# Load the prediction file that the next cell reads as the multilingual
# model's Spanish test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/MLS/multilingual_predictions_experiment2_spanishdataset.json') as json_file:
    data = json.loads(json_file.read())

In [None]:
multilingual_spanish_y_true = data['y_true']
multilingual_spanish_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against predicted labels (columns),
# with the classes fixed to 0..10.
cm_train = confusion_matrix(
    multilingual_spanish_y_true,
    multilingual_spanish_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix for MultiLingual Model on the Spanish testset')

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

# Dedicated file name for this model/dataset pair so the figure does not
# overwrite the multilingual-model/multilingual-dataset confusion matrix.
plt.savefig("confusion_multilingualmodel_spanish_dataset.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
multilingual_spanish_y_true = np.asarray(multilingual_spanish_y_true)
multilingual_spanish_y_pred = np.asarray(multilingual_spanish_y_pred)

(
    multilingual_spanish_mae,
    multilingual_spanish_stdev,
    multilingual_spanish_relative_mae,
    multilingual_spanish_rel_error,
) = stats_calc(multilingual_spanish_y_true, multilingual_spanish_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(multilingual_spanish_mae),
    "MAE St.Dev.: {}".format(multilingual_spanish_stdev),
    "Relative MAE: {}".format(multilingual_spanish_relative_mae),
    sep="\n",
)

## polish dataset

### baseline model

In [None]:
# Load the prediction file that the next cell reads as the baseline model's
# Polish test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/MLS/baseline_predictions_experiment2_polishdataset.json') as json_file:
    data = json.loads(json_file.read())

In [None]:
baseline_polish_y_true = data['y_true']
baseline_polish_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against predicted labels (columns),
# with the classes fixed to 0..10.
cm_train = confusion_matrix(
    baseline_polish_y_true,
    baseline_polish_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix for Baseline Model on the Polish testset')

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

# Dedicated file name for this model/dataset pair so the figure does not
# overwrite the multilingual-model/multilingual-dataset confusion matrix.
plt.savefig("confusion_baselinemodel_polish_dataset.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
baseline_polish_y_true = np.asarray(baseline_polish_y_true)
baseline_polish_y_pred = np.asarray(baseline_polish_y_pred)

(
    baseline_polish_mae,
    baseline_polish_stdev,
    baseline_polish_relative_mae,
    baseline_polish_rel_error,
) = stats_calc(baseline_polish_y_true, baseline_polish_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(baseline_polish_mae),
    "MAE St.Dev.: {}".format(baseline_polish_stdev),
    "Relative MAE: {}".format(baseline_polish_relative_mae),
    sep="\n",
)

### multilingual model

In [None]:
# Load the prediction file that the next cell reads as the multilingual
# model's Polish test-set results (keys 'y_true' and 'y_pred').
with open('/vol/tensusers3/camghane/ASR/MLS/multilingual_predictions_experiment2_polishdataset.json') as json_file:
    data = json.loads(json_file.read())

In [None]:
multilingual_polish_y_true = data['y_true']
multilingual_polish_y_pred = data['y_pred']

# Confusion matrix of true labels (rows) against predicted labels (columns),
# with the classes fixed to 0..10.
cm_train = confusion_matrix(
    multilingual_polish_y_true,
    multilingual_polish_y_pred,
    labels=range(11),
)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Heatmap of the test-set confusion matrix with the raw counts annotated.
sns.heatmap(cm_train, annot=True, fmt='g', ax=ax, cmap="Blues")

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix for MultiLingual Model on the Polish testset')

# Move each y-limit outwards by half a cell.
# NOTE(review): presumably the workaround for matplotlib versions that clip
# the first/last heatmap row -- confirm it is still needed.
b, t = plt.ylim()
b, t = b + 0.5, t - 0.5
plt.ylim(b, t)

# Dedicated file name for this model/dataset pair so the figure does not
# overwrite the multilingual-model/multilingual-dataset confusion matrix.
plt.savefig("confusion_multilingualmodel_polish_dataset.png", bbox_inches='tight')
plt.show()

In [None]:
# stats_calc needs NumPy arrays (it calls .astype and does element-wise math).
multilingual_polish_y_true = np.asarray(multilingual_polish_y_true)
multilingual_polish_y_pred = np.asarray(multilingual_polish_y_pred)

(
    multilingual_polish_mae,
    multilingual_polish_stdev,
    multilingual_polish_relative_mae,
    multilingual_polish_rel_error,
) = stats_calc(multilingual_polish_y_true, multilingual_polish_y_pred)

# One metric per output line.
print(
    "MAE: {}".format(multilingual_polish_mae),
    "MAE St.Dev.: {}".format(multilingual_polish_stdev),
    "Relative MAE: {}".format(multilingual_polish_relative_mae),
    sep="\n",
)

## Statistics

In [None]:
# Visual normality check before the t-test: for each model, plot a density
# histogram of the signed relative errors with a fitted normal curve.
print("Distribution of relative errors for the Baseline model on the Spanish Testset")
plot_data_dist(baseline_spanish_rel_error)
print("Distribution of relative errors for the Multilingual model on the Spanish Testset")

plot_data_dist(multilingual_spanish_rel_error)


In [None]:
# Paired t-test on the per-sample relative errors of the two models; ttest
# returns |t| and the two-sided p-value halved.
# NOTE(review): pairing assumes both prediction files list the same samples
# in the same order -- confirm.
spanish_tval, spanish_pval = ttest(baseline_spanish_rel_error, multilingual_spanish_rel_error)
print(f"The p val : {spanish_pval} and t val : {spanish_tval} for one sided (paired) t-test on the Spanish dataset")

In [None]:
# Visual normality check before the t-test: for each model, plot a density
# histogram of the signed relative errors with a fitted normal curve.
print("Distribution of relative errors for the Baseline model on the Polish Testset")
plot_data_dist(baseline_polish_rel_error)
print("Distribution of relative errors for the Multilingual model on the Polish Testset")
plot_data_dist(multilingual_polish_rel_error)


In [None]:
# Paired t-test on the per-sample relative errors of the two models; ttest
# returns |t| and the two-sided p-value halved.
# NOTE(review): pairing assumes both prediction files list the same samples
# in the same order -- confirm.
polish_tval, polish_pval = ttest(baseline_polish_rel_error, multilingual_polish_rel_error)
print(f"The p val : {polish_pval} and t val : {polish_tval} for one sided (paired) t-test on the Polish dataset")