# Non-English and Mixed-Language Hypothesis

There appear to be a number of non-English and mixed-language tweets within our
dataset. In particular, these can be found amongst the tweets misclassified by
BERTweet. Since BERTweet's embedding is trained purely on English tweets, we
cannot expect it to correctly understand and classify such
non-English/mixed-language tweets without additional measures. We hypothesise
that translating such tweets into English using a service such as Google
Translate would increase the accuracy of BERTweet.

For this we use the Google Translate API: https://codelabs.developers.google.com/codelabs/cloud-translation-python3

In [None]:
# All results have been precomputed and are included in this repository,
# because some operations take a long time to run and require the user to set
# up Google Cloud Services. The code in this notebook can nevertheless be used
# to reproduce our results. When the following flag is set to True, the cells
# below skip the language-detection and translation API calls (and the writing
# of their results) and only read the precomputed data we provide.
USE_PRECOMPUTED = True

In [None]:
# Google Cloud project ID (IPython magic: sets the environment variable for
# this kernel); read back further down via environ.get("PROJECT_ID").
%env PROJECT_ID=cli-project
# Path to the key.json file containing the credentials.
# NOTE(review): presumably picked up by the Google client library when the
# client is created - confirm against the Google Cloud authentication docs.
%env GOOGLE_APPLICATION_CREDENTIALS=./key.json

import sys
import os
from os import environ

from google.cloud import translate

import pandas as pd
from IPython.display import display

sys.path.append("../../src")
from data_processing.loading import load_train_valid_data, load_test_data

# Resolve the Google Cloud project and create the Translation API client.
project_id = environ.get("PROJECT_ID", "")
if not project_id:
    # Raise explicitly instead of using `assert`: asserts are stripped under
    # `python -O` and give the user no hint about what to fix.
    raise RuntimeError(
        "PROJECT_ID is not set; set it to your Google Cloud project ID "
        "before running this notebook."
    )
# Resource name passed as `parent` to the API calls made with `client`.
parent = f"projects/{project_id}"
client = translate.TranslationServiceClient()

## Detect Languages

The first step in testing our hypothesis is to identify non-English or
mixed-language tweets. We do this to keep the translation workload to a minimum,
as we will only translate non-English and mixed-language tweets. We further use
this step to identify the language composition of our dataset.

In [None]:
# Test tweets, loaded from the "dataset" directory two levels up from the
# current working directory.
path_to_dataset = os.path.join(os.pardir, os.pardir, "dataset")
test = load_test_data(path_to_dataset)

# Random subset of the tweets misclassified by BERTweet.
valid_misclass = pd.read_csv("bertweet_valid_misclass.csv")

def detect_language(client, parent, text):
    """Detect the language of ``text`` and summarise the result as one string.

    Args:
        client: Object exposing ``detect_language(parent=..., content=...)``;
            in this notebook a ``TranslationServiceClient``.
        parent: Resource name forwarded to the API call as ``parent``.
        text: The text whose language should be detected.

    Returns:
        ``"<number of detected languages>,<first language code>,<confidence
        of the first language>"``, or ``",,"`` when the request fails or the
        response lists no language.
    """
    try:
        response = client.detect_language(parent=parent, content=text)
        languages = response.languages
        first = languages[0]
        summary = f"{len(languages)},{first.language_code},{first.confidence}"
    except Exception:
        # Best effort: a failed detection is recorded as empty fields so that
        # one bad tweet does not abort the whole run. `Exception` (rather than
        # a bare `except`) lets KeyboardInterrupt/SystemExit propagate, so a
        # long-running `.apply` can still be interrupted.
        return ",,"
    return summary

if not USE_PRECOMPUTED:
    # Detect languages for the misclassified validation tweets and the test
    # tweets, and save the results.

    # Create the output directory up front so that the slow API calls are not
    # wasted on a write that fails because the directory is missing.
    os.makedirs("language_detected", exist_ok=True)

    valid_misclass["detect_lang"] = valid_misclass["tweet"].apply(
        lambda tweet: detect_language(client, parent, tweet)
    )
    valid_misclass.to_csv(
        os.path.join("language_detected", "valid_misclass.csv")
    )

    test["detect_lang"] = test["tweet"].apply(
        lambda tweet: detect_language(client, parent, tweet)
    )
    # Written under the file name that the translation step reads back
    # ("test_lang_detected.csv"); writing "test.csv" here meant freshly
    # computed results were never picked up.
    test.to_csv(os.path.join("language_detected", "test_lang_detected.csv"))

## Translate Tweets

Now that languages have been detected, select all tweets found to be
non-English, i.e. completely non-English or mixed-language, and translate
those into English.

In [None]:
# Prepare the language detected tweets for translation and filter for the
# non-English ones.

# Misclassified validation tweets. The 'detect_lang' column holds
# comma-separated "<count>,<language>,<confidence>" strings; unpack it into
# three separate columns.
valid_misclass = pd.read_csv(
    os.path.join('language_detected', 'valid_misclass.csv'),
    index_col='id'
)

for position, column in enumerate(
    ('num_detect_langs', 'detected_lang', 'confidence')
):
    valid_misclass[column] = valid_misclass['detect_lang'].apply(
        lambda stat, position=position: stat.split(',')[position]
    )

# Drop the packed column and the 'Unnamed: 0' column (presumably the index
# that was written out when the CSV was saved).
valid_misclass = valid_misclass.drop(columns=['detect_lang', 'Unnamed: 0'])

# Keep every tweet whose first detected language is not English.
is_foreign = valid_misclass['detected_lang'] != 'en'
valid_foreign_lang = valid_misclass[is_foreign].copy()

# Test tweets. Split each comma-separated 'detect_lang' string once, then
# spread its three fields over separate columns.
test = pd.read_csv(
    os.path.join('language_detected', 'test_lang_detected.csv'),
    index_col='id'
)

detect_lang_fields = test['detect_lang'].apply(lambda stat: stat.split(','))
test['num_detect_langs'] = detect_lang_fields.apply(lambda fields: fields[0])
test['detected_lang'] = detect_lang_fields.apply(lambda fields: fields[1])
test['confidence'] = detect_lang_fields.apply(lambda fields: fields[2])

# The packed column is no longer needed.
test = test.drop(columns=['detect_lang'])

# Keep every tweet whose first detected language is not English.
test_foreign_lang = test.loc[test['detected_lang'] != 'en'].copy()

In [None]:
# Translate the relevant tweets and store them for reclassification.

def translate(text: str, target_language_code: str):
    """Request a translation of ``text`` into ``target_language_code``.

    Uses the module-level ``client`` and ``parent``.

    NOTE: defining this function rebinds the name ``translate``, which the
    setup cell imports as a module from ``google.cloud``. The client has
    already been created by then, but re-creating it requires re-running that
    import first.

    Args:
        text: The text to translate; sent as the only element of ``contents``.
        target_language_code: Language code of the desired output language.

    Returns:
        The ``translations`` field of the API response. This is a collection
        of translation objects, not a string (the previous ``-> str``
        annotation was incorrect); callers index it and read
        ``.translated_text``.
    """
    response = client.translate_text(
        contents=[text],
        target_language_code=target_language_code,
        parent=parent,
    )
    return response.translations

def translate_to_en(tweet: str) -> str:
    """Translate ``tweet`` into English and decode HTML entities in the result.

    Args:
        tweet: The text to translate.

    Returns:
        The translated text with HTML character references such as
        ``&#39;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&amp;`` replaced by the
        characters they stand for.
    """
    # Local import keeps this cell self-contained.
    import html

    raw_translation = translate(tweet, 'en')[0].translated_text
    # html.unescape decodes every character reference in one pass; the
    # previous hand-written replacements missed `&amp;` and all other
    # entities.
    return html.unescape(raw_translation)

if not USE_PRECOMPUTED:
    # Create the output directory before translating so that the slow API
    # calls are not wasted on a write that fails because it is missing.
    os.makedirs('translated', exist_ok=True)

    # Misclassified validation tweets.
    valid_foreign_lang['translation'] = valid_foreign_lang['tweet'].apply(
        translate_to_en
    )

    # Store the translation under the 'tweet' column name, next to the label,
    # so the file has the same layout as the untranslated input.
    translated_valid_misclass = valid_foreign_lang[
        ['translation', 'label']
    ].rename(columns={'translation': 'tweet'})

    translated_valid_misclass.to_csv(
        os.path.join('translated', 'valid_misclass.csv')
    )

    # Test tweets (no label column).
    test_foreign_lang['translation'] = test_foreign_lang['tweet'].apply(
        translate_to_en
    )

    translated_test = test_foreign_lang[['translation']].rename(
        columns={'translation': 'tweet'}
    )

    translated_test.to_csv(os.path.join('translated', 'test.csv'))

## Merging Re-classified Tweets

Using the CSV files generated in the last section, we can reclassify the
translated tweets to check whether accuracy improved. We now merge the
classifications of the translated tweets with the classifications of the
English tweets, which we left unchanged.

In [None]:
# Merging re-classified validation tweets.

# Labelled validation split; its columns are joined onto the predictions below.
path_to_dataset = os.path.join(os.pardir, os.pardir, 'dataset')
train, valid = load_train_valid_data(path_to_dataset)

# Predictions on the original tweets and on the translated subset, both
# indexed by 'Id'.
bertweet_valid_pred = pd.read_csv(
    os.path.join('predictions', 'bertweet_valid.csv'),
    index_col='Id'
)
bertweet_valid_translated_pred = pd.read_csv(
    os.path.join('predictions', 'bertweet_valid_translated.csv'),
    index_col='Id'
)

# Index-based join: rows without a translated prediction end up with NaN in
# the '_translated' column.
bertweet_combined_valid_pred = bertweet_valid_pred.join(
    bertweet_valid_translated_pred,
    lsuffix='_original',
    rsuffix='_translated'
)

# Take the translated prediction where there is one and fall back to the
# original prediction otherwise; cast back to int after the NaN-filling.
bertweet_combined_valid_pred['Prediction'] = (
    bertweet_combined_valid_pred['Prediction_translated']
    .fillna(bertweet_combined_valid_pred['Prediction_original'])
    .astype(int)
)

bertweet_merged_valid_pred = bertweet_combined_valid_pred.join(valid)

In [None]:
# Print summary statistics on accuracy before and after translation.
valid_size = len(valid)

# Count rows by summing boolean masks instead of materialising filtered frames.
num_bertweet_correct = int(
    (
        bertweet_merged_valid_pred['Prediction_original']
        == bertweet_merged_valid_pred['label']
    ).sum()
)

num_bertweet_translated_correct = int(
    (
        bertweet_merged_valid_pred['Prediction']
        == bertweet_merged_valid_pred['label']
    ).sum()
)

num_changed_pred = int(
    (
        bertweet_merged_valid_pred['Prediction_original']
        != bertweet_merged_valid_pred['Prediction']
    ).sum()
)

# All values are percentages of the validation set size. The last two lines
# used to carry a stray literal "0" in front of the interpolated value, which
# corrupted the output (e.g. "0-0.12" for a negative difference).
print(f'BERTweet Original Accuracy:\t\t\t{100 * num_bertweet_correct / valid_size}')
print(f'BERTweet Translated Accuracy:\t\t\t{100 * num_bertweet_translated_correct / valid_size}')
print(f'Accuracy Difference Original vs. Translated:\t{100 * (num_bertweet_translated_correct - num_bertweet_correct) / valid_size}')
print(f'Percentage of changed prediction:\t\t{100 * num_changed_pred / valid_size}')

In [None]:
# Merging re-classified test tweets.

# Predictions on the original test tweets and on the translated subset, both
# indexed by 'Id'.
bertweet_test_pred = pd.read_csv(
    os.path.join('predictions', 'bertweet_test.csv'),
    index_col='Id'
)
bertweet_test_translated_pred = pd.read_csv(
    os.path.join('predictions', 'bertweet_test_translated.csv'),
    index_col='Id'
)

# Index-based join: rows without a translated prediction end up with NaN in
# the '_translated' column.
bertweet_joint_pred = bertweet_test_pred.join(
    bertweet_test_translated_pred,
    lsuffix='_original',
    rsuffix='_translated'
)

# Take the translated prediction where there is one and fall back to the
# original prediction otherwise; cast back to int after the NaN-filling.
bertweet_joint_pred['Prediction'] = (
    bertweet_joint_pred['Prediction_translated']
    .fillna(bertweet_joint_pred['Prediction_original'])
    .astype(int)
)
bertweet_merged_pred = bertweet_joint_pred

# Final predictions without the two intermediate columns.
bertweet_new_pred = bertweet_merged_pred.drop(
    columns=['Prediction_original', 'Prediction_translated']
)
if not USE_PRECOMPUTED:
    bertweet_new_pred.to_csv(
        os.path.join('predictions', 'bertweet_test_merged.csv')
    )