# This is the driver file for the full pseudo-transcript, TF-IDF-based subset selection experiment

### Creating pseudo transcripts

In [1]:
import sys
import torch

In [2]:
# Show which Python interpreter this kernel runs; the `!$sys.executable ...` shell
# commands further down launch their scripts with this same interpreter.
print(sys.executable)

/home/mayank/.conda/envs/error/bin/python


In [3]:
# Use GPU index 3 when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:3")
else:
    device = torch.device("cpu")
print(device)

cuda:3


In [47]:
# Root of the local checkout; the paths used in the cells below are derived from it.
BASE_PATH = '/home/mayank/MTP/begin_again/Error-Driven-ASR-Personalization'
# Parent directory of CURR_DIR.
PARENT_DIR = f"{BASE_PATH}/entropy-testing"
# Working directory of this experiment: TSS.py, models/ and data/ are looked up under it.
CURR_DIR = f"{PARENT_DIR}/pseudo-transcript-entropy"

In [48]:
# Accent identifiers passed as --accent; every speaker name gets the "_english" suffix.
ACCENTS = [
    f"{speaker}_english"
    for speaker in (
        'assamese_female',
        'gujarati_female',
        'hindi_male',
        'kannada_male',
        'malayalam_male',
        'manipuri_female',
        'rajasthani_male',
        'tamil_male',
    )
]

# Values passed as --budget (kept as strings).
BUDGETS = ['100', '200', '400', '800']

# Values passed as --target.
TARGET = [10]

# Values passed as --fxn.
METHODS = ['FL2MI', 'GCMI', 'LogDMI']

# Values passed as --eta.
ETA = ['1.0']

# Values passed as --similarity.
SIM = ['euclidean']

# Values passed as --feature_type.
FEATURES = ['39']

# Run identifiers; they appear as "run_<id>" in the manifest directory path.
RUNS = ['1', '2', '3']

In [None]:
# Do the SMI generation

def doSMI(feature, similarity, eta, target, budget, method, accent):
    
    python_file = CURR_DIR + "/TSS.py"
    
    print("----------------------- TSS -----------------------")
    print(f"accent_{accent}, budget_{budget}, method_{method}")
    !$sys.executable $python_file --target $target --budget $budget --similarity $similarity --eta $eta --accent $accent --fxn $method --feature_type $feature
    

def generate_SMI_selections():
    """Call ``doSMI`` for every combination of the notebook-level sweep lists.

    The combinations are visited in the same order as nested loops over
    FEATURES, SIM, ETA, TARGET, BUDGETS, METHODS and ACCENTS (ACCENTS varies
    fastest).
    """
    # Lazy generator: each configuration is built only when its turn comes.
    configurations = (
        dict(
            feature=feature_type,
            similarity=similarity,
            eta=eta_value,
            target=target_size,
            budget=budget_size,
            method=smi_method,
            accent=accent_name,
        )
        for feature_type in FEATURES
        for similarity in SIM
        for eta_value in ETA
        for target_size in TARGET
        for budget_size in BUDGETS
        for smi_method in METHODS
        for accent_name in ACCENTS
    )
    for configuration in configurations:
        doSMI(**configuration)

# Launch the sweep: one TSS.py invocation per combination of the configuration lists.
generate_SMI_selections()

### Generate the transcripts and run grapheme-to-phoneme conversion
- We directly reuse the code from error-model here.

### Generate the transcripts

In [None]:
def infer_transcripts(feature, similarity, eta, target, budget, method, accent, run):
    python_env = sys.executable
    python_file = CURR_DIR + "/models/quartznet_asr/inference.py"
    data_base_dir = CURR_DIR + f"/data/{accent}/manifests/TSS_output/all/budget_{budget}/target_{target}/{method}/eta_{eta}/{similarity}/{feature}/run_{run}/"
    wav_dir=BASE_PATH + "/data/indicTTS_audio/indicTTS/{accent}/english/wav/"
    ckpt_base_dir=CURR_DIR + "/models/pretrained_checkpoints/"
    batch_size=32
    bash_file = CURR_DIR + "/models/quartznet_asr/scripts/infer_transcriptions_on_seed_set.sh"

    
    print("------ Generating Pseudo Transcripts -------")
    print(f"accent_{accent}, budget_{budget}, method_{method}, run_{run}")
    !bash $bash_file $python_env $python_file $data_base_dir $wav_dir $ckpt_base_dir $batch_size
    
def infer_transcripts_all(accents=("kannada_male_english",), methods=("FL2MI",), budgets=(800,)):
    """Run ``infer_transcripts`` for every requested configuration.

    Previously the accents, methods and budgets were hard-coded local lists
    that shadowed the notebook-level ``ACCENTS``/``METHODS``/``BUDGETS``. They
    are now parameters whose defaults reproduce the old hard-coded values, so
    calling the function without arguments behaves exactly as before.

    Args:
        accents: iterable of accent names to process.
        methods: iterable of selection-method names to process.
        budgets: iterable of budgets to process.

    The remaining dimensions (FEATURES, SIM, ETA, TARGET, RUNS) are read from
    the notebook-level configuration. Combinations are visited in the order
    feature, similarity, eta, target, budget, method, run, accent, with the
    accent varying fastest.
    """
    # Local import so this cell does not depend on another cell's imports.
    from itertools import product

    sweep = product(FEATURES, SIM, ETA, TARGET, budgets, methods, RUNS, accents)
    for feature, sim, eta, target, budget, method, run, accent in sweep:
        infer_transcripts(
            feature=feature,
            similarity=sim,
            eta=eta,
            target=target,
            budget=budget,
            method=method,
            accent=accent,
            run=run,
        )
    

In [None]:
# Run pseudo-transcript inference for the configurations iterated by
# infer_transcripts_all (one bash invocation per combination).
infer_transcripts_all()

#### Get the phoneme versions

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy example: two "documents" written as space-separated integer ids.
a = " ".join(str(token_id) for token_id in [1,1,2,4,5,6,125,54,456])
b = " ".join(str(token_id) for token_id in [4])
print(a)
print(b)
corpus = [b, a, a]

# `\w+` keeps single-character tokens; features are 1- to 3-grams and case is left untouched.
vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern='(?u)\\b\\w+\\b',
    ngram_range=(1,3),
)
X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names_out())
# print(len(vectorizer.get_feature_names_out()))

# Dense copy of the TF-IDF matrix (one row per entry of `corpus`).
data = X.toarray()
print(data)

#### Build the TF-IDF vectors

#### Make selections using some submodlib utility

In [None]:
from submodlib import FacilityLocationFunction

# Facility Location objective over the rows of `data`, in dense mode with the Euclidean metric.
objFL = FacilityLocationFunction(
    n=data.shape[0],
    data=data,
    mode="dense",
    metric="euclidean",
)

In [None]:
# Select 2 items with the NaiveGreedy optimizer; as the cell's last expression,
# the result is shown as the cell output.
objFL.maximize(budget=2, optimizer="NaiveGreedy")

In [52]:
python_file = CURR_DIR + "/models/error_model/select_tf_idf.py"
json_path = CURR_DIR + "/quartznet_outputs/infer_out.txt"
print("----------------------- TF- IDF based TSS -----------------------")
!$sys.executable $python_file --json_path $json_path

----------------------- TF- IDF based TSS -----------------------
Arguments:
	json_path : /home/mayank/MTP/begin_again/Error-Driven-ASR-Personalization/entropy-testing/pseudo-transcript-entropy/quartznet_outputs/infer_out.txt
	     seed : 42
loading data....
100%|████████████████████████████████████████| 721/721 [00:01<00:00, 401.11it/s]
<class 'list'>
i could not agree with ernest
100%|████████████████████████████████████████| 721/721 [00:03<00:00, 211.73it/s]
<class 'list'>
['<s>', 'AY', ' ', 'K', 'UH', 'D', ' ', 'N', 'AA', 'T', ' ', 'AH', 'G', 'R', 'IY', ' ', 'W', 'IH', 'DH', ' ', 'ER', 'N', 'AH', 'S', 'T', '</s>']
**********Sample phoneme data******
['<s>', 'AY', ' ', 'K', 'UH', 'D', ' ', 'N', 'AA', 'T', ' ', 'AH', 'G', 'R', 'IY', ' ', 'W', 'IH', 'DH', ' ', 'ER', 'N', 'AH', 'S', 'T', '</s>']
data_size: 721
**** Converting phonemes to ids*****
Sample phonemes converted to ids
2 9 44 23 36 12 44 26 4 34 44 6 18 31 21 44 39 20 13 44 15 26 6 32 34 3
TF-IDF vectorised data
[0. 0. 0. ...