#  English-to-BASH

#### Install the Hugging Face Transformers library, import required libraries, and load the data

In [None]:
!pip3 install -q git+https://github.com/huggingface/transformers.git
# !pip install -q tensorflow==2.1
!pip3 install torch==1.2.0 torchvision==0.4.0 -f
!pip3 install datasets transformers[sentencepiece] sacrebleu
!pip3 install sacremoses
!pip3 install datasets

In [None]:
import transformers

print(transformers.__version__)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!pip install datasets

In [None]:
# Importing libraries.
import pandas as pd
from sklearn.model_selection import train_test_split

import datasets

from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer

import numpy as np

#### Data Preprocessing

In [None]:
def load_csv(input, output):
  """Load two line-aligned text files into single-column data frames.

  Each non-empty line of a file becomes one row. The files are read directly
  instead of through ``pd.read_csv(..., sep="\\n")`` because recent pandas
  releases reject the line terminator as a separator, and because every line
  should be kept verbatim rather than interpreted as CSV.

  Args:
    input: path of the text file holding one natural-language description per line.
    output: path of the text file holding one bash command per line.

  Returns:
    Tuple ``(input_df, output_df)``; the frames have the single columns
    ``'input'`` and ``'output'`` respectively.
  """
  def _read_lines(path):
    # Iterate the file so only real line breaks split rows; empty lines are
    # skipped, matching read_csv's default of ignoring blank lines.
    with open(path, 'r', encoding='utf-8') as handle:
      stripped = (line.rstrip('\r\n') for line in handle)
      return [line for line in stripped if line]

  input_df = pd.DataFrame({'input': _read_lines(input)})
  output_df = pd.DataFrame({'output': _read_lines(output)})
  return input_df, output_df

In [None]:
# Load the descriptions and the bash commands; each file becomes a one-column frame.
# Note: `input` and `output` shadow Python builtins, but later cells read these names.
input, output = load_csv('balanced_user_input_NL2BASH.txt', 'balanced_commands_NL2BASH.txt')

# Place both columns side by side so row i pairs a description with its command.
overview = pd.concat([input, output], axis=1)
overview

##### View Data Distribution

In [None]:
def cleaning(dataframe, original):
  """Strip bracketed prefixes and double quotes from a string Series.

  Args:
    dataframe: pandas Series of strings (despite the parameter name).
    original: when truthy, return the cleaned Series as is. When falsy, also
      split every value on its first space into two columns.

  Returns:
    The cleaned Series if ``original`` is truthy; otherwise a DataFrame with
    the columns ``command_1`` (first word) and ``miscellaneous`` (the rest,
    missing when a value contains no space).
  """
  # Raw string: the pattern contains `\(`, `\[` etc., which are invalid escape
  # sequences in a normal string literal and trigger warnings on newer Pythons.
  cleaned = dataframe.str.replace(r"[\(\[].*?[\)\]] ", '', regex=True)
  # The double quote is a literal character, so no regex is needed.
  cleaned = cleaned.str.replace('"', '', regex=False)
  if not original:
    cleaned = cleaned.str.split(' ', n=1, expand=True)
    cleaned = cleaned.rename(columns={0: "command_1", 1: "miscellaneous"})
  return cleaned

In [None]:
df_distribution = cleaning(overview['output'], original=False)
df_distribution

In [None]:
# user_input_cleaned = cleaning(overview['input'], original=True)
# user_input_cleaned.to_csv('cleaned_user_input.txt', index=None, header=False)
# user_input_cleaned[5:]

##### Removing Bias

In [None]:
import logging

max_threshold = 1000

# `df_distribution` is not modified while rows are dropped from `overview`, so the
# command frequencies are computed once instead of twice per row.
command_counts = df_distribution["command_1"].value_counts()
# value_counts() sorts by frequency, so the first entry is the most common command.
top_count = command_counts.iloc[0] if len(command_counts) else 0

rows_to_drop = []
for i in range(len(overview)):
    try:
        logging.info("running line: %d", i)
        first_word = overview['output'][i].split()[0]
        delta = top_count - command_counts.loc[first_word]

        # Rows whose command is within `max_threshold` occurrences of the most
        # frequent command are removed.
        if delta < max_threshold:
            rows_to_drop.append(i)
    except Exception as e:
        # Best effort: a row that cannot be evaluated (missing label, empty command,
        # first word absent from the cleaned distribution) is kept and reported.
        # logging uses %-style lazy formatting; the earlier print-style call
        # `logging.warn(i, "error: ", ...)` raised a formatting error inside logging.
        logging.warning("line %d error: %s", i, e)

# Drop all selected rows in one call instead of once per row.
overview.drop(rows_to_drop, inplace=True)

In [None]:
overview

In [None]:
# overview["input"].to_csv('balanced_user_input.txt', index=False, header=False)
# overview["output"].to_csv('balanced_commands.txt', index=False, header=False)

In [None]:
df_distribution_post = cleaning(overview['output'], original=False)
df_distribution_post

In [None]:
import seaborn as sns

# Set style and figure size in a single call. `sns.set` is an alias of `sns.set_theme`,
# so a separate `sns.set(rc=...)` call re-applies the default style and silently
# discards the "whitegrid" style requested just before it.
sns.set_theme(style="whitegrid", rc={'figure.figsize': (30, 10)})

# Plot the five most frequent leading commands after bias removal.
sns.countplot(
    x=df_distribution_post["command_1"],
    order=df_distribution_post["command_1"].value_counts().iloc[:5].index,
)

In [None]:
import seaborn as sns

# Set style and figure size in a single call. `sns.set` is an alias of `sns.set_theme`,
# so a separate `sns.set(rc=...)` call re-applies the default style and silently
# discards the "whitegrid" style requested just before it.
sns.set_theme(style="whitegrid", rc={'figure.figsize': (30, 10)})

# Plot the 25 most frequent leading commands of the original distribution.
sns.countplot(
    x=df_distribution["command_1"],
    order=df_distribution["command_1"].value_counts().iloc[:25].index,
)

In [None]:
## NL2Bash Dataset Augmentation:
# Find the delta (rep_max - rep_indexed_command)
# Append delta rows, add linux BASH to respective row
# Use openai.API to generate delta user query

# ----- Uncomment when complete 
# for i in range(len(df_distribution["command_1"])):
# -----

def add_rows(number_rows, linux_bash):
  """Return a copy of the global `overview` with placeholder rows appended.

  Args:
    number_rows: how many rows to append; values below one append nothing.
    linux_bash: bash command stored in the 'output' column of every new row.

  Returns:
    A new DataFrame with a fresh 0..n-1 index: all rows of `overview` followed
    by `number_rows` rows whose 'input' is "test description". `overview`
    itself is not modified.
  """
  # int() also accepts the numpy integers produced by value_counts arithmetic.
  count = max(int(number_rows), 0)
  if count == 0:
    return overview.reset_index(drop=True)
  # Build every new row at once. The earlier loop called DataFrame.append (removed in
  # pandas 2.0) on `overview` each time, so it never accumulated more than one row.
  placeholder_rows = pd.DataFrame({
      'input': ["test description"] * count,
      'output': [linux_bash] * count,
  })
  return pd.concat([overview, placeholder_rows], ignore_index=True)

# Frequency of every leading command; value_counts() sorts by frequency, so the
# first entry (read positionally with .iloc) is the most common command.
command_counts = df_distribution["command_1"].value_counts()

for i in range(1):
  first_word = df_distribution["command_1"][i]
  # Number of extra examples this command needs to match the most frequent one.
  delta = command_counts.iloc[0] - command_counts.loc[first_word]
  print("\nfirst_word: ", first_word)
  print("linux bash: ", overview['output'][i])
  print("delta: ", delta)

  test_df = add_rows(delta, overview['output'][i])

test_df

In [None]:
df = overview
idx = list(df.index)
df

##### **Data Split**
*   Train: 10085 entries
*   Test: 1261 entries
*   Validation: 1261 entries

In [None]:
# Hold out 20% of the pairs, then halve that hold-out into validation and test sets.
# NOTE(review): the split reads `input`/`output` as loaded from disk, not the
# filtered `overview` frame; confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    input, output,
    train_size=0.8, test_size=0.2,
    shuffle=True, random_state=0,
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test,
    train_size=0.5, test_size=0.5,
    shuffle=True, random_state=0,
)

# Re-join each description column with its command column.
train, test, valid = [
    pd.concat([descriptions, commands], axis=1)
    for descriptions, commands in ((x_train, y_train), (x_test, y_test), (x_val, y_val))
]

In [None]:
# Convert every split to a `Dataset` and drop the `__index_level_0__` column
# (presumably the shuffled pandas index carried over by `from_pandas`; confirm).
train_dataset, test_dataset, valid_dataset = [
    Dataset.from_pandas(split_frame).remove_columns(["__index_level_0__"])
    for split_frame in (train, test, valid)
]

# Bundle the three splits under the keys used by the tokenisation and training cells.
raw_datasets = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "valid": valid_dataset,
})
raw_datasets

In [None]:
raw_datasets['train'][400]

In [None]:
raw_datasets['test'][30]

#### Fine-tuning: Load Model Checkpoint, Tokenizer, Metrics, Pretrained Weights

In [None]:
model_checkpoint = 't5-small'
# model_checkpoint = 't5-large'
# model_checkpoint = 'google/byt5-small'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of description/command pairs for seq2seq training.

    Args:
        examples: batch mapping with the keys 'input' (descriptions) and
            'output' (bash commands), as passed by `Dataset.map(batched=True)`.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the token ids of the targets. Inputs are truncated to
        `max_input_length` tokens and targets to `max_target_length`.
    """
    inputs = list(examples['input'])
    targets = list(examples['output'])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets through `text_target=`; the `as_target_tokenizer()`
    # context manager used previously is deprecated in Transformers.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [None]:
# sacrebleu is also installed by the setup cell at the top of the notebook.
!pip3 install sacrebleu
# Load the sacreBLEU metric used by `compute_metrics`.
# NOTE(review): `load_metric` is deprecated in `datasets` in favour of the separate
# `evaluate` package; confirm the installed `datasets` version still provides it.
metric = load_metric("sacrebleu")

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Batch size shared by training and evaluation.
batch_size = 16
# Keep only the part after the last "/" so hub-style names give a clean directory name.
model_name = model_checkpoint.split("/")[-1]
# The first positional argument is the output directory; with push_to_hub=True the
# trainer also uploads to the Hugging Face Hub.
# NOTE(review): newer Transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-English-to-BASH",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#### Initialize and Begin Training

In [None]:

def postprocess_text(preds, labels):
    """Trim whitespace from decoded predictions and references.

    Returns a tuple of (list of stripped predictions, list of one-element
    lists each holding a stripped reference).
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    # Each reference is wrapped in its own list.
    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id arrays; the
            predictions may also arrive as a tuple whose first item is the
            id array.

    Returns:
        Dict with "bleu" (the metric's score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks ignored positions and cannot be decoded. It is replaced in the
    # predictions as well as the labels, since padded generations can contain it too;
    # this also keeps those positions out of the `gen_len` count below.
    preds = np.where(preds != -100, preds, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Build the trainer: train on the "train" split and evaluate on the "valid" split;
# the "test" split is not passed to the trainer here.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

#### Testing

In [None]:
!pip3 install openai

In [None]:
import os
import openai
import pprint
import logging
import time

# Read the OpenAI credential from the environment instead of hard-coding it.
# Set OPENAI_API_KEY before starting the kernel; a missing variable raises KeyError.
# NOTE: the keys previously committed in this cell must be treated as leaked and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
def retrieve_text_pair(text):
    """Extract description/command pairs from a completion's raw text.

    Lines whose label (the text before the first ':') is exactly
    "Description" or "Command" are collected; everything after that first
    colon is kept verbatim, including further colons. Other lines are ignored.

    Args:
        text: raw multi-line completion text.

    Returns:
        Tuple ``(descriptions, commands)`` of two lists of equal length. The
        last complete pair is dropped (presumably because the final entry may
        be cut off by the token limit; confirm), so fewer than two pairs
        yield two empty lists.
    """
    descriptions = []
    commands = []

    for line in text.split("\n"):
        # Only the leading label decides the kind of line, so a command that
        # itself contains ":Description:" is not misclassified.
        label, _, remainder = line.strip().partition(":")
        if label == "Description":
            descriptions.append(remainder)
        elif label == "Command":
            commands.append(remainder)

    # Clamp at zero: a negative length would slice from the end and return
    # lists of different lengths when one kind of line is missing.
    valid_len = max(min(len(descriptions), len(commands)) - 1, 0)
    return descriptions[:valid_len], commands[:valid_len]

# text = """Description: when using vi-insert keymap bind command \\C-v} to key }\n\n
# Command: bind -m vi-insert \'"}" "\\C-v}"\'\n\n
# Description:  when using vi-insert keymap bind command \\C-v[ to key [\n\n
# Command: bind -m vi-insert \'"[" "\\C-v["\'\n\n"""


# text_pair = retrieve_text_pair(text)
# len(text_pair[0])

In [None]:
def write_to_file(text_pair, description_path, command_path):
    """Append description/command pairs to two line-aligned text files.

    Args:
        text_pair: pair ``(descriptions, commands)`` of string sequences; the
            pairs are zipped, so surplus items of the longer one are ignored.
        description_path: file that receives one stripped description per line.
        command_path: file that receives one stripped command per line.
    """
    # `with` closes both files even if a write fails; utf8 matches the
    # encoding the notebook uses when it reads these text files back.
    with open(description_path, 'a', encoding="utf8") as description_file, \
         open(command_path, 'a', encoding="utf8") as command_file:
        for d, c in zip(text_pair[0], text_pair[1]):
            description_file.write(d.strip() + '\n')
            command_file.write(c.strip() + '\n')

# write_to_file(text_pair, "output_description.txt", "output_command.txt")

In [None]:
def get_response(num, description, command):
    """Request `num` variations of a description/command pair from the OpenAI API.

    The example pair is embedded in a prompt and sent through
    `openai.Completion.create`; the text of the first returned choice is
    returned unchanged.
    """
    request_text = f"""Generate {num} similar bash commands and corresponding description:
    Description: {description}
    Command: {command}
    """

    # NOTE(review): `max_tokens` is close to the model's context size, so long
    # prompts may be rejected; confirm against the model's token limit.
    sampling_options = {
        "temperature": 0.0,
        "max_tokens": 4000,
        "top_p": 1.0,
        "frequency_penalty": 0.2,
        "presence_penalty": 0.2,
    }
    completion = openai.Completion.create(
        engine="text-davinci-002",
        prompt=request_text,
        **sampling_options,
    )
    first_choice = completion.choices[0]
    return first_choice["text"]

In [None]:
# Read the balanced descriptions and commands; `with` closes both handles.
with open('balanced_user_input.txt', 'r', encoding="utf8") as test_file, \
     open('balanced_commands.txt', 'r', encoding="utf8") as test_file_2:
    test_list = test_file.readlines()
    test_list_2 = test_file_2.readlines()

# The distribution does not change inside the loop, so count commands once.
# value_counts() sorts by frequency; the first entry is the most common command.
command_counts_post = df_distribution_post["command_1"].value_counts()
top_count_post = command_counts_post.iloc[0] if len(command_counts_post) else 0

for i, (des, com) in enumerate(zip(test_list, test_list_2)):
    try:
        print(i, des.strip(), com.strip())
        # How many fewer examples this command has than the most frequent one.
        delta = top_count_post - command_counts_post.loc[com.split()[0]]
        logging.info("running line: %d", i)
        if delta > 200:
            # Ask for `delta` new pairs and append them to the v3 output files.
            text = get_response(delta, des, com)
            text_pair = retrieve_text_pair(text)
            write_to_file(text_pair, "output_description_v3.txt", "output_command_v3.txt")
        else:
            continue
        # Pause between API requests.
        time.sleep(5)
    except Exception as e:
        # Best effort: report the failing line and carry on with the next one.
        # logging uses %-style lazy formatting; the earlier print-style call
        # raised a formatting error inside logging instead of showing the message.
        logging.warning("line %d error: %s", i, e)


# for i in (des, com) in enumerate(zip(test_list, test_list_2)):
#     print("i: ", i)
#     print("des: ", des)
#     print("com: ", com)

In [None]:
## NL2Bash Dataset Augmentation:
# Find the delta (rep_max - rep_indexed_command)
# Append delta rows, add linux BASH to respective row
# Use openai.API to generate delta user query

# ----- Uncomment when complete 
# for i in range(len(df_distribution["command_1"])):
# -----

# NOTE(review): `add_rows` is also defined in an earlier augmentation cell; this
# definition replaces it when the notebook runs top to bottom.
def add_rows(number_rows, linux_bash):
  """Return a copy of the global `overview` with placeholder rows appended.

  Args:
    number_rows: how many rows to append; values below one append nothing.
    linux_bash: bash command stored in the 'output' column of every new row.

  Returns:
    A new DataFrame with a fresh 0..n-1 index: all rows of `overview` followed
    by `number_rows` rows whose 'input' is "test description". `overview`
    itself is not modified.
  """
  # int() also accepts the numpy integers produced by value_counts arithmetic.
  count = max(int(number_rows), 0)
  if count == 0:
    return overview.reset_index(drop=True)
  # Build every new row at once. The earlier loop called DataFrame.append (removed in
  # pandas 2.0) on `overview` each time, so it never accumulated more than one row.
  placeholder_rows = pd.DataFrame({
      'input': ["test description"] * count,
      'output': [linux_bash] * count,
  })
  return pd.concat([overview, placeholder_rows], ignore_index=True)

# Frequency of every leading command; value_counts() sorts by frequency, so the
# first entry (read positionally with .iloc) is the most common command.
command_counts = df_distribution["command_1"].value_counts()

for i in range(1):
  first_word = df_distribution["command_1"][i]
  # Number of extra examples this command needs to match the most frequent one.
  delta = command_counts.iloc[0] - command_counts.loc[first_word]
  print("\nfirst_word: ", first_word)
  print("linux bash: ", overview['output'][i])
  print("delta: ", delta)

  test_df = add_rows(delta, overview['output'][i])

test_df

In [None]:
def read_input_and_process(start_index, input_description, input_command, num=5):
    """Generate and store augmented pairs for every line from `start_index` on.

    Args:
        start_index: zero-based line number to resume from; earlier lines are skipped.
        input_description: path of the file with one description per line.
        input_command: path of the file with one bash command per line.
        num: number of similar pairs requested per line. The earlier call
            omitted this required `get_response` argument, so every line failed
            with a swallowed TypeError. The default of 5 is an assumption;
            confirm the intended value.

    Side effects:
        Appends the parsed pairs to "output_description.txt" and
        "output_command.txt" and pauses five seconds after each request.
        Failures on a line are logged and do not stop the run.
    """
    # `with` closes both files even if reading fails.
    with open(input_description, 'r', encoding="utf8") as description_file, \
         open(input_command, 'r', encoding="utf8") as command_file:
        description_list = description_file.readlines()
        command_list = command_file.readlines()

    for i, (des, com) in enumerate(zip(description_list, command_list)):
        if i < start_index:
            continue
        try:
            print(i, des.strip(), com.strip())
            logging.info("running line: %d", i)
            text = get_response(num, des, com)
            text_pair = retrieve_text_pair(text)
            write_to_file(text_pair, "output_description.txt", "output_command.txt")
            time.sleep(5)
        except Exception as e:
            # logging uses %-style lazy formatting; the earlier print-style call
            # raised a formatting error inside logging instead of showing the message.
            logging.warning("line %d error: %s", i, e)


read_input_and_process(180, "user_input_small.txt", "commands_small.txt")