In [1]:
! pip install openai
import dataclasses
import logging
import math
import ast
import re
import os
import io
import sys
import time
import json
import tqdm
import copy
import pandas as pd

from typing import Optional, Sequence, Union
from cleantext import clean
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from openai import OpenAI



Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


### OpenAI API key
To use an OpenAI model, you'll need an OpenAI API key. Put your key in a `.env` file in the `~/pykoi` directory under the name `OPENAI_API_KEY`, e.g.
```
OPENAI_API_KEY=your_api_key
```

In the next cell, we load the key from the .env file.

In [2]:
# Load the variables defined in the .env file (see the OPENAI_API_KEY note above).
# The call is left as the last expression so its return value is displayed by the cell.
from dotenv import load_dotenv
load_dotenv()

True

## Load data 

In [3]:
# LLMs usually require input text encoded as UTF-8, so the encoding is stated explicitly.
RAW_QA_CSV = "raw_input/qd_immigration.csv"
qaa = pd.read_csv(RAW_QA_CSV, encoding="utf8")
qaa.head()

Unnamed: 0.1,Unnamed: 0,ID,Question,Answer,Vote Status,Timestamp
0,0,1,H-1B to EB-2 process . I would like to know If...,The employer can start the green card process ...,up,
1,1,2,Eligibility for and the process of EB-3 to EB-...,You can always go up and you can always go dow...,up,
2,2,3,How can I qualify for EB-1C/International Mana...,1. Please discuss your job description that is...,up,
3,3,4,Downgrading from EB-2 to EB-3 . My wife and I ...,I see no problem applying for EB-3 and then us...,up,
4,4,5,"How to get H-1B approved for three years, not ...",1. Typically the only way you can get three ye...,up,


#### Check the number of missing values

In [4]:
# Number of missing (NaN) entries in each column of the raw table.
missing_per_column = qaa.isna().sum()
print(missing_per_column)

Unnamed: 0      0
ID              0
Question        0
Answer          0
Vote Status     0
Timestamp      96
dtype: int64


## Data preprocessing

#### Stop words removal and stemming (Optional)

In [5]:
# Download the NLTK resources used by the optional preprocessing step:
# 'stopwords' backs stopwords.words('english'); 'punkt' is presumably what
# word_tokenize relies on (TODO confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/joseortiz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joseortiz/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Full English stop-word list shipped with NLTK.
stop = stopwords.words('english')

# Negations (and "against") carry meaning for this problem, so they must
# survive stop-word removal.
excluding = [
    'against', 'not',
    'don', "don't",
    'ain',
    'aren', "aren't",
    'couldn', "couldn't",
    'didn', "didn't",
    'doesn', "doesn't",
    'hadn', "hadn't",
    'hasn', "hasn't",
    'haven', "haven't",
    'isn', "isn't",
    'mightn', "mightn't",
    'mustn', "mustn't",
    'needn', "needn't",
    'shouldn', "shouldn't",
    'wasn', "wasn't",
    'weren', "weren't",
    'won', "won't",
    'wouldn', "wouldn't",
]

# Effective stop-word list: NLTK's list minus the protected words above,
# in NLTK's original order.
_protected_words = set(excluding)
stop_words = [candidate for candidate in stop if candidate not in _protected_words]

# Stemmer shared by process_text.
snow = SnowballStemmer('english')

def process_text(texts):
    """Lowercase, strip markup, drop stop words and stem every text in ``texts``.

    Args:
        texts: Iterable of raw texts. Entries that are not ``str`` (for
            example NaN for a missing value) are treated as empty text.

    Returns:
        A list with one cleaned string per input entry, in the same order.
        Each string is the space-joined stems of the tokens that are not
        numeric, are longer than two characters and are not in the
        module-level ``stop_words`` list.
    """
    # Compile once instead of on every iteration. Raw strings avoid the
    # invalid "\s" escape that newer Python versions warn about.
    whitespace_re = re.compile(r'\s+')
    html_tag_re = re.compile(r'<.*?>')

    final_text_list = []
    for sent in texts:
        # Missing values are not strings; replace them with empty text.
        if not isinstance(sent, str):
            sent = ""

        sent = sent.lower().strip()          # Lowercase, trim both ends
        sent = whitespace_re.sub(' ', sent)  # Collapse runs of spaces/tabs/newlines
        sent = html_tag_re.sub('', sent)     # Remove HTML tags/markup

        # Custom filtering: keep non-numeric tokens longer than two characters
        # that are not stop words, then stem them.
        filtered_sentence = [
            snow.stem(w)
            for w in word_tokenize(sent)
            if not w.isnumeric() and len(w) > 2 and w not in stop_words
        ]
        final_text_list.append(" ".join(filtered_sentence))

    return final_text_list

# Optional step: switch stop_stem to True to run stop-word removal and
# stemming over both text columns before any further cleaning.
stop_stem = False
if stop_stem:
    question_l_raw = qaa["Question"].to_list()
    answer_l_raw = qaa["Answer"].to_list()
    qaa["Question"] = process_text(question_l_raw)
    qaa["Answer"] = process_text(answer_l_raw)

#### Personally Identifiable Information (PII) removal and other preprocessing using cleantext

In [7]:
! pip install clean-text
f_clean = lambda qaa_pair_raw : clean(qaa_pair_raw,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    no_emails=False,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="<URL>",
    replace_with_email="<EMAIL>",
    replace_with_phone_number="<PHONE>",
    replace_with_number="<NUMBER>",
    replace_with_digit="0",
    replace_with_currency_symbol="<CUR>",
    lang="en"                       # set to 'de' for German special handling
)
#import pdb; pdb.set_trace()
question_l_raw = qaa["Question"].to_list()
question_l = [f_clean(p) for p in question_l_raw]

answer_l_raw = qaa["Answer"].to_list()
answer_l =  [f_clean(p) for p in answer_l_raw]


qaa["Question"] = question_l
qaa["Answer"] = answer_l




#### Dataset customized cleanup

In [8]:
# Dataset-specific cleanup of the answers: drop anything that looks like an
# HTML tag as well as the literal "More..." marker (matched case-insensitively).
answer_noise_re = re.compile(r'<.*?>|More\.\.\.', flags=re.IGNORECASE)
answer_l_raw = qaa["Answer"].to_list()
qaa["Answer"] = [answer_noise_re.sub('', answer_text) for answer_text in answer_l_raw]

#### Data structure conversion

In [9]:
# Keep only the two text columns and turn every row into a
# {"Question": ..., "Answer": ...} dict.
qa_columns = ["Question", "Answer"]
qaa_list = qaa[qa_columns].to_dict(orient='records')
print(len(qaa_list))
print(qaa_list[:10])

96
[{'Question': 'h-1b to eb-2 process . i would like to know if i have an advanced degree (masters engineering management) and my employer filed my h-1b and if the lottery is picked can i initiate the green card process? i also heard that there is a minimum salary cap for eb-2 advanced degree.', 'Answer': 'the employer can start the green card process at any time, even before you join. to see the salary figures by county and profession, you can review this link -https://www.flcdatacenter.com/'}, {'Question': 'eligibility for and the process of eb-3 to eb-2 porting . can you talk about this upgrade process from eb3 to eb2 for pending i-485? does it need another i-485 application or just a letter to uscis?', 'Answer': "you can always go up and you can always go down as long as your perm was filed as an eb-2. if you filed your prm as an eb-3 then you're not going to be able to upgrade to eb-2. but if your perm was filed as an eb-2 you can downgrade i-140 eb-3. you can go back upgrade toe

### Prompt Engineering

In [10]:
# Instruction header that encode_prompt_QA places in front of every
# question/answer pair. The text is kept exactly as-is, one source line per
# prompt line.
prompts = (
    "Paraphrase the below question and answer pair in 3 different ways.\n"
    "Try not to repeat the verb for each pair to maximize diversity.\n"
    'Return everything in an array of JSON object in this format: ######{"_question":"string", "_answer":"string"}\n'
    'Seperate each pair with "######" rather than commas.\n'
)
print(len(prompts))

294


In [11]:
def encode_prompt_QA(prompts=prompts, QA_list=None):
    """Build one prompt string per question/answer record.

    Each result is the instruction text ``prompts``, a newline, the
    ``######`` separator line and the pair rendered as
    ``{"_question": "...",\\n"_answer": "..."}``.

    Args:
        prompts: Instruction text placed in front of every pair. Defaults to
            the module-level ``prompts`` as it was when this function was
            defined.
        QA_list: Iterable of dicts with ``"Question"`` and ``"Answer"`` keys.
            ``None`` (the default) is treated as an empty list.

    Returns:
        A list of prompt strings, one per record, in input order.

    Note:
        Question and answer are inserted verbatim, so double quotes inside
        them are not escaped.
    """
    # None instead of a shared mutable default list.
    if QA_list is None:
        QA_list = []

    listof_prompt_QA = []
    for task_dict in QA_list:
        question, answer = task_dict["Question"], task_dict["Answer"]
        # Collapse whitespace runs and drop a trailing colon from the question.
        question = re.sub(r"\s+", " ", question).strip().rstrip(":")
        # Empty or missing (non-string, e.g. NaN) answers get a placeholder.
        if not isinstance(answer, str) or answer == "":
            answer = "<noinput>"

        single_prompt_QA = prompts + "\n"
        single_prompt_QA += "######\n"
        single_prompt_QA += "{"
        single_prompt_QA += f'"_question": "{question}",\n'
        single_prompt_QA += f'"_answer": "{answer}"'
        single_prompt_QA += "}"
        listof_prompt_QA.append(single_prompt_QA)
    return listof_prompt_QA

# Encode every question/answer record into a full prompt and preview the first ten.
qaa_list_encoded = encode_prompt_QA(prompts, qaa_list)
print("len(qaa_list_encoded): ", len(qaa_list_encoded))
# Iterate directly: the index was unused, and naming it `id` would leave the
# builtin id() shadowed in the notebook's global namespace.
for encoded_prompt in qaa_list_encoded[:10]:
    print(encoded_prompt)
    print("\n")

len(qaa_list_encoded):  96
Paraphrase the below question and answer pair in 3 different ways.
Try not to repeat the verb for each pair to maximize diversity.
Return everything in an array of JSON object in this format: ######{"_question":"string", "_answer":"string"}
Seperate each pair with "######" rather than commas.

######
{"_question": "h-1b to eb-2 process . i would like to know if i have an advanced degree (masters engineering management) and my employer filed my h-1b and if the lottery is picked can i initiate the green card process? i also heard that there is a minimum salary cap for eb-2 advanced degree.",
"_answer": "the employer can start the green card process at any time, even before you join. to see the salary figures by county and profession, you can review this link -https://www.flcdatacenter.com/"}


Paraphrase the below question and answer pair in 3 different ways.
Try not to repeat the verb for each pair to maximize diversity.
Return everything in an array of JSON o

#### Generate synthetic data with low temperature

In [12]:
# Accumulator for the raw completion text returned for each encoded prompt.
qaa_augmented_raw = []

In [14]:
# No key is passed explicitly; it is expected to come from the environment
# populated from the .env file earlier in the notebook.
client = OpenAI()

# Start from an empty list so that re-running this cell does not append a
# second copy of every completion.
qaa_augmented_raw = []

# `prompt_idx` rather than `id`, so the builtin id() is not shadowed at
# notebook scope.
for prompt_idx, batch_inputs_string in enumerate(qaa_list_encoded):
    print(prompt_idx, "\n")
    completion_batch = client.completions.create(
        prompt=batch_inputs_string,
        model="text-davinci-003",
        temperature=0.2,
        max_tokens=1000,  # The maximum number of tokens to generate in the completion
    ).model_dump()
    # Only the text of the first choice is kept.
    results_string = completion_batch['choices'][0]['text']
    print(results_string, "\n\n\n")
    qaa_augmented_raw.append(results_string)

0 



######
{"_question": "Can I initiate the green card process if my employer filed my H-1B and it was selected in the lottery?",
"_answer": "Yes, the employer can begin the green card process before you join. To check the salary requirements for EB-2 advanced degree holders, you can look at this link -https://www.flcdatacenter.com/"}

######
{"_question": "If my employer submitted an H-1B for me and it was chosen in the lottery, can I start the green card process?",
"_answer": "Yes, the employer can initiate the green card process before you join. To view the salary caps for EB-2 advanced degree holders, you can refer to this link -https://www.flcdatacenter.com/"}

######
{"_question": "If my employer filed an H-1B for me and it was picked in the lottery, can I initiate the green card process?",
"_answer": "Yes, the employer can initiate the green card process before you join. To check the salary requirements for EB-2 advanced degree holders, you can look at this link -https://www.

#### Generate synthetic data with high temperature

In [15]:
# Discard the earlier completions; from here on the list holds the
# temperature-0.5 results only.
qaa_augmented_raw = []

# `prompt_idx` rather than `id`, so the builtin id() is not shadowed at
# notebook scope.
for prompt_idx, batch_inputs_string in enumerate(qaa_list_encoded):
    print(prompt_idx, "\n")
    completion_batch = client.completions.create(
        prompt=batch_inputs_string,
        model="text-davinci-003",
        temperature=0.5,
        max_tokens=2000,  # The maximum number of tokens to generate in the completion
    ).model_dump()
    # Only the text of the first choice is kept.
    results_string = completion_batch['choices'][0]['text']
    print(results_string, "\n\n\n")
    qaa_augmented_raw.append(results_string)

0 



######
{"_question": "Can I begin the green card process if my H-1B is picked in the lottery and I have a master's degree in engineering management?",
"_answer": "Yes, your employer can initiate the green card process even before you start working. You can check the salary requirements for EB-2 advanced degree holders by county and profession by visiting this website - https://www.flcdatacenter.com/"}

######
{"_question": "If I have a master's degree in engineering management and my employer files my H-1B, can I start the green card process if it is selected in the lottery?",
"_answer": "Yes, the green card process can be initiated by the employer before you join. You can view the salary cap for EB-2 advanced degree holders by county and profession by using this link - https://www.flcdatacenter.com/"}

######
{"_question": "If I have a master's degree in engineering management and my employer submits my H-1B, can I initiate the green card process if it is chosen in the lottery?"

### Dump raw output to a file


In [16]:
def _make_w_io_base(f, mode: str):
    """Return a file object to write to.

    An ``io.IOBase`` instance is handed back untouched. Anything else is
    treated as a path: its parent directory is created when the path has one,
    and the file is opened with ``mode``.
    """
    if isinstance(f, io.IOBase):
        return f

    parent_dir = os.path.dirname(f)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)
    return open(f, mode=mode)

def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dict or list to a file.

    Dicts and lists are serialized as JSON; a str is written verbatim.

    Args:
        obj: The object to write (``dict``, ``list`` or ``str``).
        f: A path on disk or an already open file object. A file object
            passed in is closed by this function as well.
        mode: Mode for opening the file when ``f`` is a path.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If ``obj`` is not a dict, list or str. The check runs
            before the file is opened, so nothing is created or truncated.
    """
    # Validate first so an unsupported object never touches the target file.
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")

    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, str):
            f.write(obj)
        else:
            json.dump(obj, f, indent=indent, default=default)
    finally:
        # Close even when serialization fails, so the handle is not leaked.
        f.close()

In [17]:
# Persist the raw completions as JSON under ./raw_input.
output_dir = "./raw_input"
output_name = "output_raw_96.json"
output_path = os.path.join(output_dir, output_name)
jdump(qaa_augmented_raw, output_path)

### Read json file

In [18]:
def _make_r_io_base(f, mode: str):
    """Return a file object to read from.

    An ``io.IOBase`` instance is returned as-is; any other value is treated
    as a path and opened with ``mode``.
    """
    return f if isinstance(f, io.IOBase) else open(f, mode=mode)

def jload(f, mode="r"):
    """Load a .json file and return the decoded object.

    Args:
        f: A path on disk or an already open file object. A file object
            passed in is closed by this function as well.
        mode: Mode for opening the file when ``f`` is a path.

    Returns:
        Whatever ``json.load`` produces for the file's content (a dict, a
        list, ...).
    """
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        # Close even when the content is not valid JSON, so the handle is not leaked.
        f.close()

# Read the dumped completions back from disk and preview the first entry.
# NOTE(review): the post-processing cell below iterates `qaa_augmented_raw`,
# not `output_raw` — confirm which of the two is meant to be the source.
output_raw = jload("raw_input/output_raw_96.json")
output_raw[0]

'\n\n######\n{"_question": "Can I begin the green card process if my H-1B is picked in the lottery and I have a master\'s degree in engineering management?",\n"_answer": "Yes, your employer can initiate the green card process even before you start working. You can check the salary requirements for EB-2 advanced degree holders by county and profession by visiting this website - https://www.flcdatacenter.com/"}\n\n######\n{"_question": "If I have a master\'s degree in engineering management and my employer files my H-1B, can I start the green card process if it is selected in the lottery?",\n"_answer": "Yes, the green card process can be initiated by the employer before you join. You can view the salary cap for EB-2 advanced degree holders by county and profession by using this link - https://www.flcdatacenter.com/"}\n\n######\n{"_question": "If I have a master\'s degree in engineering management and my employer submits my H-1B, can I initiate the green card process if it is chosen in th

### Postprocess the Raw Outputs

In [19]:
def _parse_generated_pairs(raw_outputs, separator="######"):
    """Split raw completions on ``separator`` and parse every chunk into a dict.

    Each chunk is parsed as JSON first (the format the prompt asks for), with
    ``ast.literal_eval`` as a fallback for Python-style literals. Chunks that
    cannot be parsed, or that do not parse to a dict, are printed and collected
    separately.

    Args:
        raw_outputs: Iterable of raw completion strings.
        separator: Marker that delimits the pairs inside one completion.

    Returns:
        ``(parsed, failed)``: the list of parsed dicts and the list of chunk
        strings that were rejected.
    """
    parsed, failed = [], []
    for raw in raw_outputs:
        for chunk in raw.split(separator):
            # Skip empty as well as whitespace-only chunks. "".isspace() is
            # False, so an isspace() test alone would log empty chunks as errors.
            if not chunk.strip():
                continue
            try:
                try:
                    # strict=False accepts raw control characters (e.g. newlines)
                    # inside string values.
                    record = json.loads(chunk, strict=False)
                except json.JSONDecodeError:
                    record = ast.literal_eval(chunk)
                if not isinstance(record, dict):
                    raise ValueError(f"expected a dict, got {type(record).__name__}")
            except Exception as e:
                # Best effort: report the bad chunk and keep going.
                print(e)
                print(chunk)
                print("\n\n\n")
                failed.append(chunk)
            else:
                parsed.append(record)
    return parsed, failed


# QApair_dict is a list of {"_question": ..., "_answer": ...} dicts;
# error_list keeps the chunks that could not be parsed.
QApair_dict, error_list = _parse_generated_pairs(qaa_augmented_raw)
print(len(QApair_dict))

unterminated string literal (detected at line 3) (<unknown>, line 3)

{"_question": "What is the educational criteria for EB-2?",
"_answer": "In June 2007, USCIS clarified what is considered to be equivalent to a U.S. master's degree for employment-based category 2. Each petition and its supporting documentation are examined on a case-by-case basis and degree equivalencies are based on the evidence presented with the individual case. However, the below is provided as a general outline: 1. U.S. master's degree as long as it is in the field required, no additional document is required. 2. Four-year bachelor's degree + two-year master's degree (India) with degrees in the same or related fields, this will generally be considered the equivalent to a U.S. master's degree and no additional document is required. 3. Three-year bachelor's degree + three-year master's degree (India) with degrees in the same or related fields, this will generally be equivalent to a U.S. master's degree and no addi

In [20]:
# Build a DataFrame from the parsed pair dicts and display it as the cell output.
QApair_df = pd.DataFrame(QApair_dict)
QApair_df

Unnamed: 0,_question,_answer
0,Can I begin the green card process if my H-1B ...,"Yes, your employer can initiate the green card..."
1,If I have a master's degree in engineering man...,"Yes, the green card process can be initiated b..."
2,If I have a master's degree in engineering man...,"Yes, the employer can begin the green card pro..."
3,What is the eligibility and process for upgrad...,It is possible to upgrade from EB-3 to EB-2 st...
4,Can you explain the process for changing from ...,"If the PERM was originally filed as an EB-2, i..."
...,...,...
268,What are the chances of my wife obtaining EB2 ...,Occupational therapists do not have a special ...
269,Is it possible for my wife to get EB2 processi...,Occupational therapists do not have a separate...
270,Do I have the qualifications to apply for an E...,"Yes, you should have the qualifications to app..."
271,If I have 3 years of IT experience and am work...,"Yes, you should be able to apply for an EB-2 g..."


In [22]:
# Nothing earlier in the notebook creates the output folder, so make sure it
# exists before writing.
output_csv_path = "output/output_qa_augment.csv"
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
QApair_df.to_csv(output_csv_path, index=False)