In [1]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m61.4/105.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3


In [2]:
import itertools
import string
import re
import pandas as pd
import textstat
# Raise pandas display limits so wide frames and long text cells are shown untruncated.
pd.set_option('display.max_columns', 101)
pd.set_option('display.max_rows', 101)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)

In [3]:
# TO_CLEAN gates the cell that cleans the 'par*' columns and fills the matching 'res*' columns.
TO_CLEAN = True
# DEF_INCLUDED selects how 'par*' column names are parsed and which prompt template is built.
DEF_INCLUDED = True
# CSV that is loaded into `df`; its name is also checked for 'llama2' / 'llama3' / 'gemma2b' /
# 'gemma7b' to pick the model-specific cleaning rules.
FILENAME="llama3-1-def.csv"

In [4]:
# Load the generations; later cells read the 'User Story' column and every column starting with 'par'.
df = pd.read_csv(FILENAME)

In [5]:
df

Output hidden; open in https://colab.research.google.com to view.

In [6]:
def remove_llm_output(sentence):
    """Truncate an LLM response at the first boilerplate marker it contains.

    Everything from the earliest occurrence of any of the known marker
    phrases onward is dropped; text without a marker is returned unchanged.

    Args:
        sentence: Raw model output (str).

    Returns:
        The part of ``sentence`` that precedes the earliest marker.
    """
    markers = (
        'Let me know',
        'Paraphrased user story',
        'Paraphrased version',
        "Here's the paraphrased version",
    )
    for marker in markers:
        # partition() yields the whole string as the head when the marker is absent,
        # so no membership test is needed first.
        sentence = sentence.partition(marker)[0]
    return sentence

def total_characters(text):
    """Return ``len(text)``."""
    char_count = len(text)
    return char_count

def uppercase_characters(text):
    """Count the characters of ``text`` for which ``isupper()`` is true."""
    upper_hits = [ch for ch in text if ch.isupper()]
    return len(upper_hits)

def lowercase_characters(text):
    """Count the characters of ``text`` for which ``islower()`` is true."""
    count = 0
    for ch in text:
        if ch.islower():
            count += 1
    return count

def special_characters(text):
    """Count the characters of ``text`` that appear in the fixed symbol list below."""
    special_chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    hits = [ch for ch in text if ch in special_chars]
    return len(hits)

def numbers(text):
    """Count the characters of ``text`` for which ``isdigit()`` is true."""
    digit_hits = [ch for ch in text if ch.isdigit()]
    return len(digit_hits)

def blanks(text):
    """Count the characters of ``text`` for which ``isspace()`` is true."""
    count = 0
    for ch in text:
        if ch.isspace():
            count += 1
    return count

def number_of_words(text):
    """Count the whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def average_length_of_words(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Returns 0 when ``text`` contains no tokens.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

def number_of_propositions(text):
    """Count the non-blank segments obtained by splitting ``text`` on runs of ``.``, ``!`` or ``?``."""
    segments = re.split(r'[.!?]+', text)
    return sum(1 for segment in segments if segment.strip())

def average_length_of_propositions(text):
    """Return the mean word count of the non-blank segments of ``text``.

    Segments are produced by splitting on runs of ``.``, ``!`` or ``?``;
    0 is returned when no non-blank segment exists.
    """
    word_counts = []
    for piece in re.split(r'[.!?]+', text):
        trimmed = piece.strip()
        if trimmed:
            word_counts.append(len(trimmed.split()))
    if not word_counts:
        return 0
    return sum(word_counts) / len(word_counts)

def punctuation_characters(text):
    """Count the characters of ``text`` that appear in ``string.punctuation``."""
    marks = [ch for ch in text if ch in string.punctuation]
    return len(marks)

def lowercase_words(text):
    """Count the whitespace-separated tokens of ``text`` for which ``islower()`` is true."""
    lower_tokens = [token for token in text.split() if token.islower()]
    return len(lower_tokens)

def uppercase_words(text):
    """Count the whitespace-separated tokens of ``text`` for which ``isupper()`` is true."""
    upper_tokens = [token for token in text.split() if token.isupper()]
    return len(upper_tokens)

def vocabulary_richness(text):
    """Return the number of distinct whitespace-separated tokens in ``text``, ignoring case."""
    return len(set(text.lower().split()))

def number_of_urls(text):
    """Count the non-overlapping matches of the http(s) URL pattern in ``text``."""
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return sum(1 for _ in re.finditer(url_pattern, text))

def flesch_kincaid_grade_level(text):
    """Return ``textstat.flesch_kincaid_grade`` evaluated on ``text``."""
    grade = textstat.flesch_kincaid_grade(text)
    return grade

def flesch_reading_ease(text):
    """Return ``textstat.flesch_reading_ease`` evaluated on ``text``."""
    ease = textstat.flesch_reading_ease(text)
    return ease

def dale_chall_readability(text):
    """Return ``textstat.dale_chall_readability_score`` evaluated on ``text``."""
    score = textstat.dale_chall_readability_score(text)
    return score

def automated_readability_index(text):
    """Return ``textstat.automated_readability_index`` evaluated on ``text``."""
    index_value = textstat.automated_readability_index(text)
    return index_value

def coleman_liau_index(text):
    """Return ``textstat.coleman_liau_index`` evaluated on ``text``."""
    index_value = textstat.coleman_liau_index(text)
    return index_value

def gunning_fog(text):
    """Return ``textstat.gunning_fog`` evaluated on ``text``."""
    fog = textstat.gunning_fog(text)
    return fog

def smog_index(text):
    """Return ``textstat.smog_index`` evaluated on ``text``."""
    smog = textstat.smog_index(text)
    return smog

def linsear_write_index(text):
    """Return ``textstat.linsear_write_formula`` evaluated on ``text``."""
    linsear = textstat.linsear_write_formula(text)
    return linsear


# Maps the instruction text parsed from a 'par*' column name to the metric function that
# scores it. Every key starts with a space: the scoring cell splits the column name on a
# single space, which leaves the separator's second space attached to the instruction.
instructions_to_metrics = {
    " number of total characters": total_characters,
    " number of uppercase characters": uppercase_characters,
    " number of lowercase characters": lowercase_characters,
    " number of special characters": special_characters,
    " number of numbers": numbers,
    " number of blanks": blanks,
    " number of words": number_of_words,
    " average length of words": average_length_of_words,
    " number of propositions": number_of_propositions,
    " average length of propositions": average_length_of_propositions,
    " number of punctuation characters": punctuation_characters,
    " number of lowercase words": lowercase_words,
    " number of uppercase words": uppercase_words,
    " number of vocabulary richness": vocabulary_richness,
    " number of urls": number_of_urls,
    " flesch kincaid grade level": flesch_kincaid_grade_level,
    " flesch reading ease": flesch_reading_ease,
    " dale chall readability": dale_chall_readability,
    " automated readability index": automated_readability_index,
    " coleman liau index": coleman_liau_index,
    " gunning fog": gunning_fog,
    " smog index": smog_index,
    " linsear write index": linsear_write_index,
}


In [7]:
def _strip_trailing_marker(text, marker):
  """Return `text` without `marker` when `marker` is its literal suffix, else unchanged.

  str.rstrip(marker) is not suitable here: it treats `marker` as a set of characters
  and keeps deleting any of them from the end (e.g. "promote".rstrip('(Note') -> "prom").
  """
  if text.endswith(marker):
    return text[:len(text) - len(marker)]
  return text


# Clean the model outputs in every 'par*' column (rules depend on the model named in
# FILENAME), then write a 0/1 flag into the matching 'res*' column saying whether the
# paraphrase moved the metric in the requested direction relative to the 'User Story'.
if TO_CLEAN:
  par_columns = [col for col in df.columns if col.startswith('par')]

  if 'llama2' in FILENAME:
    for col in par_columns:
      # Outer whitespace, then wrapping double quotes, then whitespace left at the end.
      df[col] = df[col].apply(lambda x: x.lstrip().lstrip('"').rstrip('"').rstrip())

  if 'llama3' in FILENAME:
    for col in par_columns:
      # Remove a dangling "(Note" / "Output" tail as a whole suffix only.
      df[col] = df[col].apply(lambda x: _strip_trailing_marker(x, '(Note'))
      df[col] = df[col].apply(lambda x: _strip_trailing_marker(x, 'Output'))
      df[col] = df[col].apply(remove_llm_output)

      df[col] = df[col].apply(lambda x: x.strip())

  if 'gemma2b' in FILENAME or 'gemma7b' in FILENAME:
    for col in par_columns:
      # Drop any run of leading asterisks, then surrounding whitespace.
      df[col] = df[col].apply(lambda x: x.lstrip('*').strip())

  for col in par_columns:
    if DEF_INCLUDED:
      # The instruction follows the last ': ' of the column name, so a colon inside the
      # definition text cannot shift the parse.
      change, instruction = col.rsplit(': ', 1)[1].split(' ', 1)
      if "don't change" in col:
        _, instruction = instruction.split(' ', 1)
        change = "don't change"
    else:
      _, change, instruction = col.split(' ', 2)
      if "don't change" in col:
        _, _, _, instruction = col.split(' ', 3)
        change = "don't change"
    cur_metric_func = instructions_to_metrics[instruction]

    # NOTE: str.replace swaps every 'par' substring in the name, not only the prefix; the
    # train/test split cell derives the result column the same way, so both stay in sync.
    res_col = col.replace('par', 'res')

    for index, row in df.iterrows():
      original_metric_val = cur_metric_func(row['User Story'])
      par_metric_val = cur_metric_func(row[col])
      if change == "increase":
        satisfied = original_metric_val < par_metric_val
      elif change == "decrease":
        satisfied = original_metric_val > par_metric_val
      elif change == "don't change":
        satisfied = original_metric_val == par_metric_val
      else:
        satisfied = False
      df.at[index, res_col] = 1 if satisfied else 0


In [8]:
df

Output hidden; open in https://colab.research.google.com to view.

In [9]:
from collections import defaultdict

# Build the fine-tuning split:
#   * train_df gets one row per (user story, instruction) whose 'res*' flag is 1, capped
#     at 5 examples per 'par*' column;
#   * test_df gets every user story that contributed no training row.
# Rows are collected in plain lists and each frame is constructed once, instead of
# growing empty DataFrames cell by cell.
instruction_cnt = defaultdict(int)

par_columns = [col for col in df.columns if col.startswith('par')]
train_records = []
test_stories = []
for index, row in df.iterrows():
  is_in_train = False
  for col in par_columns:
    if row[col.replace('par', 'res')] == 1 and instruction_cnt[col] < 5:
      train_records.append({
          "User Story": row["User Story"],
          # Everything after the leading 'par ' token of the column name.
          "instruction": col.split(" ", 1)[1],
          "paraphrased version": row[col],
      })
      instruction_cnt[col] += 1
      is_in_train = True
  if not is_in_train:
    test_stories.append(row["User Story"])

# Explicit columns keep the frame usable by later cells even when nothing qualified.
train_df = pd.DataFrame(train_records, columns=["User Story", "instruction", "paraphrased version"])
test_df = pd.DataFrame({"User Story": test_stories})

# Row counters kept under their original names in case later cells read them.
train_i = len(train_records)
test_i = len(test_stories)

In [10]:
train_df

Unnamed: 0,User Story,instruction,paraphrased version
0,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Total characters typically refers to the count of all individual characters, including letters, numbers, punctuation marks, spaces, and any other symbols, within a given text. Based on the following instruction: decrease number of total characters","""I'd like to apply a grouping technique to categorize related economic sectors and industries using numerical metrics from business data, aiming to enhance the reliability and speed of economic forecasting."""
1,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Uppercase characters refer to letters in the alphabet that are written or printed in their capital form. In English, uppercase characters include the letters A through Z. These characters are often used at the beginning of sentences, for proper nouns, and in acronyms. Based on the following instruction: increase number of uppercase characters","AS AN ECONOMIST, I DESIRE TO UTILIZE HIERARCHICAL CLUSTERING TECHNIQUES TO GROUP SIMILAR ECONOMIC SECTORS AND INDUSTRIES BASED ON FINANCIAL AND ECONOMIC INDICATORS OF BUSINESS DATA IN ORDER TO ENHANCE THE ACCURACY AND EFFICIENCY OF ECONOMIC ANALYSIS AND PREDICTION."
2,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Uppercase characters refer to letters in the alphabet that are written or printed in their capital form. In English, uppercase characters include the letters A through Z. These characters are often used at the beginning of sentences, for proper nouns, and in acronyms. Based on the following instruction: decrease number of uppercase characters","as an economist, i want to group similar economic areas using hierarchical clustering based on financial and economic signs in business data to make economic analysis more accurate and efficient."
3,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Lowercase characters refer to letters in the alphabet that are written or printed in their smaller form. In English, lowercase characters include the letters a through z. These characters are commonly used in the body of sentences and words. Based on the following instruction: increase number of lowercase characters","as a researcher, i need to apply hierarchical clustering technique to categorize related economic sectors and industries based on relevant financial metrics from large datasets, aiming to enhance the precision and speed of economic studies and forecasts."
4,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Lowercase characters refer to letters in the alphabet that are written or printed in their smaller form. In English, lowercase characters include the letters a through z. These characters are commonly used in the body of sentences and words. Based on the following instruction: decrease number of lowercase characters","as a user, i want to apply grouping technique on economic sectors based on financial metrics to enhance predictive model's accuracy and speed."
...,...,...,...
267,"As a social worker, I want to use stemming algorithms to analyze social work case notes and identify related cases, so that I can better understand and address the needs of my clients.","Total characters typically refers to the count of all individual characters, including letters, numbers, punctuation marks, spaces, and any other symbols, within a given text. Based on the following instruction: don't change number of total characters","As a social worker, I need to apply linguistic processing techniques to categorize and link similar social service records, in order to comprehend client requirements more effectively."
268,"As a social media marketer, I want to use information processing language to develop algorithms for analyzing user interests and preferences from social media data, in order to develop more effective social media marketing strategies.","The formula for calculating Flesch Kincaid Grade Level is 0.39*(E)+11.8*(G)-15.59, where G is the average number of syllable per word, while E is the average number of words per proposition. Based on the following instruction: don't change flesch kincaid grade level","To create targeted social media marketing campaigns, I need to leverage linguistic patterns to analyze users' inclinations and tastes extracted from online conversations, ultimately informing more successful promotional tactics."
269,"As a marketer, I want to use learning linear models to predict consumer behavior and optimize advertising campaigns.","The definition for Lineaser Write is for each word with two or less syllables an index is increased by 1, while for each word with more than three syllables, the index is increased by 3. Finally, the resulting number is divided by the number of propositions. If the result is greater than 20 it is divided by 2, otherwise it is divided by 2 and 1is subtracted from this number. Based on the following instruction: don't change linsear write index","""As a marketer, I need to apply lineaser write formula to forecast customer actions and refine promotional strategies."""
270,"As a legal researcher, I want to use inductive logic learning to identify patterns in legal cases, in order to develop new insights into legal decision-making and inform legal policy decisions.","The formula for calculating Flesch Reading Ease is 206.835-(84.6*G)-(1.015*E), where G is the average number of syllable per word, while E is the average number of words perproposition. Based on the following instruction: don't change flesch reading ease","""As a law expert, I need to leverage machine learning algorithms to uncover hidden trends in court judgments, thereby generating fresh perspectives on judicial decision-making and guiding the formulation of legislation."""


In [11]:
test_df

Unnamed: 0,User Story
0,"As a nephrologist, I want to use fully connected layers to predict kidney outcomes based on large datasets of patient kidney data, so that I can better diagnose and treat kidney disease."
1,"As a sociologist, I want to use neural gas to analyze and classify social data, such as survey responses and interview transcripts, so that I can better understand social structures and social change."
2,"As a radiologist, I want to use policy iteration to optimize patient treatment plans by developing a policy that considers factors such as medical imaging results, patient medical history, and treatment effectiveness."
3,"As a linguist, I want to use representation learning to analyze language data and identify key features that are predictive of language acquisition and usage, so that we can design better language education and translation systems."
4,"As a literary critic, I want to use named entity recognition to automatically identify and classify literary works and authors mentioned in literary analysis and criticism, so that I can better understand literary trends and analyze the cultural significance of different works."
5,"As a librarian, I want to use neural networks to analyze and predict book demand and popularity, so that I can improve book collection and availability for readers."
6,"As a dermatologist, I want to use FSS-SVM to select the most important dermatological features from large datasets of patient skin data, so that I can better diagnose and treat skin conditions."
7,"As a librarian, I want to explore the use of neuromorphic engineering in developing tools for more efficient cataloging and organization of library materials, so that we can improve access to information and knowledge."
8,"As a sports organization, I want to use conversational understanding to develop a chatbot that can provide real-time updates on scores, schedules, and player statistics."
9,"As a musician, I want to use feature sets to group music data by genre and rhythm, so that I can better understand musical trends and inform my own compositions."


In [12]:
instruction_cnt

defaultdict(int,
            {'par Total characters typically refers to the count of all individual characters, including letters, numbers, punctuation marks, spaces, and any other symbols, within a given text. Based on the following instruction: decrease  number of total characters': 5,
             'par Uppercase characters refer to letters in the alphabet that are written or printed in their capital form. In English, uppercase characters include the letters A through Z. These characters are often used at the beginning of sentences, for proper nouns, and in acronyms. Based on the following instruction: increase  number of uppercase characters': 5,
             'par Uppercase characters refer to letters in the alphabet that are written or printed in their capital form. In English, uppercase characters include the letters A through Z. These characters are often used at the beginning of sentences, for proper nouns, and in acronyms. Based on the following instruction: decrease  number of

In [13]:
train_df

Unnamed: 0,User Story,instruction,paraphrased version
0,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Total characters typically refers to the count of all individual characters, including letters, numbers, punctuation marks, spaces, and any other symbols, within a given text. Based on the following instruction: decrease number of total characters","""I'd like to apply a grouping technique to categorize related economic sectors and industries using numerical metrics from business data, aiming to enhance the reliability and speed of economic forecasting."""
1,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Uppercase characters refer to letters in the alphabet that are written or printed in their capital form. In English, uppercase characters include the letters A through Z. These characters are often used at the beginning of sentences, for proper nouns, and in acronyms. Based on the following instruction: increase number of uppercase characters","AS AN ECONOMIST, I DESIRE TO UTILIZE HIERARCHICAL CLUSTERING TECHNIQUES TO GROUP SIMILAR ECONOMIC SECTORS AND INDUSTRIES BASED ON FINANCIAL AND ECONOMIC INDICATORS OF BUSINESS DATA IN ORDER TO ENHANCE THE ACCURACY AND EFFICIENCY OF ECONOMIC ANALYSIS AND PREDICTION."
2,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Uppercase characters refer to letters in the alphabet that are written or printed in their capital form. In English, uppercase characters include the letters A through Z. These characters are often used at the beginning of sentences, for proper nouns, and in acronyms. Based on the following instruction: decrease number of uppercase characters","as an economist, i want to group similar economic areas using hierarchical clustering based on financial and economic signs in business data to make economic analysis more accurate and efficient."
3,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Lowercase characters refer to letters in the alphabet that are written or printed in their smaller form. In English, lowercase characters include the letters a through z. These characters are commonly used in the body of sentences and words. Based on the following instruction: increase number of lowercase characters","as a researcher, i need to apply hierarchical clustering technique to categorize related economic sectors and industries based on relevant financial metrics from large datasets, aiming to enhance the precision and speed of economic studies and forecasts."
4,"As an economist, I want to use hierarchical clustering to group similar economic sectors and industries based on the financial and economic indicators of business data to improve the accuracy and efficiency of economic analysis and prediction.","Lowercase characters refer to letters in the alphabet that are written or printed in their smaller form. In English, lowercase characters include the letters a through z. These characters are commonly used in the body of sentences and words. Based on the following instruction: decrease number of lowercase characters","as a user, i want to apply grouping technique on economic sectors based on financial metrics to enhance predictive model's accuracy and speed."
...,...,...,...
267,"As a social worker, I want to use stemming algorithms to analyze social work case notes and identify related cases, so that I can better understand and address the needs of my clients.","Total characters typically refers to the count of all individual characters, including letters, numbers, punctuation marks, spaces, and any other symbols, within a given text. Based on the following instruction: don't change number of total characters","As a social worker, I need to apply linguistic processing techniques to categorize and link similar social service records, in order to comprehend client requirements more effectively."
268,"As a social media marketer, I want to use information processing language to develop algorithms for analyzing user interests and preferences from social media data, in order to develop more effective social media marketing strategies.","The formula for calculating Flesch Kincaid Grade Level is 0.39*(E)+11.8*(G)-15.59, where G is the average number of syllable per word, while E is the average number of words per proposition. Based on the following instruction: don't change flesch kincaid grade level","To create targeted social media marketing campaigns, I need to leverage linguistic patterns to analyze users' inclinations and tastes extracted from online conversations, ultimately informing more successful promotional tactics."
269,"As a marketer, I want to use learning linear models to predict consumer behavior and optimize advertising campaigns.","The definition for Lineaser Write is for each word with two or less syllables an index is increased by 1, while for each word with more than three syllables, the index is increased by 3. Finally, the resulting number is divided by the number of propositions. If the result is greater than 20 it is divided by 2, otherwise it is divided by 2 and 1is subtracted from this number. Based on the following instruction: don't change linsear write index","""As a marketer, I need to apply lineaser write formula to forecast customer actions and refine promotional strategies."""
270,"As a legal researcher, I want to use inductive logic learning to identify patterns in legal cases, in order to develop new insights into legal decision-making and inform legal policy decisions.","The formula for calculating Flesch Reading Ease is 206.835-(84.6*G)-(1.015*E), where G is the average number of syllable per word, while E is the average number of words perproposition. Based on the following instruction: don't change flesch reading ease","""As a law expert, I need to leverage machine learning algorithms to uncover hidden trends in court judgments, thereby generating fresh perspectives on judicial decision-making and guiding the formulation of legislation."""


In [14]:
# Build the fine-tuning prompt for every training row in one pass. When the definition is
# part of the instruction text it already carries the "Based on the following instruction:"
# lead-in, so the prefix is only added in the definition-free setting.
prompt_prefix = "" if DEF_INCLUDED else "Based on the following instruction: "
train_df['prompt'] = [
    f"{prompt_prefix}{instruction}. Paraphrase the following user story and output only paraphrased version: {story}"
    for instruction, story in zip(train_df['instruction'], train_df['User Story'])
]


# The target completion is the accepted paraphrase itself.
train_df['response'] = train_df['paraphrased version']


In [15]:
# Persist the training set; with to_csv's default index=True the row index is written as
# the first, unnamed CSV column.
train_df.to_csv('train_data_finetuning.csv')

In [16]:
test_df

Unnamed: 0,User Story
0,"As a nephrologist, I want to use fully connected layers to predict kidney outcomes based on large datasets of patient kidney data, so that I can better diagnose and treat kidney disease."
1,"As a sociologist, I want to use neural gas to analyze and classify social data, such as survey responses and interview transcripts, so that I can better understand social structures and social change."
2,"As a radiologist, I want to use policy iteration to optimize patient treatment plans by developing a policy that considers factors such as medical imaging results, patient medical history, and treatment effectiveness."
3,"As a linguist, I want to use representation learning to analyze language data and identify key features that are predictive of language acquisition and usage, so that we can design better language education and translation systems."
4,"As a literary critic, I want to use named entity recognition to automatically identify and classify literary works and authors mentioned in literary analysis and criticism, so that I can better understand literary trends and analyze the cultural significance of different works."
5,"As a librarian, I want to use neural networks to analyze and predict book demand and popularity, so that I can improve book collection and availability for readers."
6,"As a dermatologist, I want to use FSS-SVM to select the most important dermatological features from large datasets of patient skin data, so that I can better diagnose and treat skin conditions."
7,"As a librarian, I want to explore the use of neuromorphic engineering in developing tools for more efficient cataloging and organization of library materials, so that we can improve access to information and knowledge."
8,"As a sports organization, I want to use conversational understanding to develop a chatbot that can provide real-time updates on scores, schedules, and player statistics."
9,"As a musician, I want to use feature sets to group music data by genre and rhythm, so that I can better understand musical trends and inform my own compositions."


In [17]:
# NOTE(review): NUM_COMBINATIONS is not used in the visible part of the notebook —
# presumably the number of instructions combined per prompt; confirm against later cells.
NUM_COMBINATIONS = 1
# Alias of DEF_INCLUDED from the configuration cell.
IS_DEFINITION_INCLUDED = DEF_INCLUDED
# The metric helpers defined below repeat the definitions from the earlier metrics cell
# under the same names, so these later copies are the ones in effect afterwards.


def total_characters(text):
    """Return ``len(text)``."""
    char_count = len(text)
    return char_count

def uppercase_characters(text):
    """Count the characters of ``text`` for which ``isupper()`` is true."""
    upper_hits = [ch for ch in text if ch.isupper()]
    return len(upper_hits)

def lowercase_characters(text):
    """Count the characters of ``text`` for which ``islower()`` is true."""
    count = 0
    for ch in text:
        if ch.islower():
            count += 1
    return count

def special_characters(text):
    """Count the characters of ``text`` that appear in the fixed symbol list below."""
    special_chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    hits = [ch for ch in text if ch in special_chars]
    return len(hits)

def numbers(text):
    """Count the characters of ``text`` for which ``isdigit()`` is true."""
    digit_hits = [ch for ch in text if ch.isdigit()]
    return len(digit_hits)

def blanks(text):
    """Count the characters of ``text`` for which ``isspace()`` is true."""
    count = 0
    for ch in text:
        if ch.isspace():
            count += 1
    return count

def number_of_words(text):
    """Count the whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def average_length_of_words(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Returns 0 when ``text`` contains no tokens.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

def number_of_propositions(text):
    """Count the non-blank segments obtained by splitting ``text`` on runs of ``.``, ``!`` or ``?``."""
    segments = re.split(r'[.!?]+', text)
    return sum(1 for segment in segments if segment.strip())

def average_length_of_propositions(text):
    """Return the mean word count of the non-blank segments of ``text``.

    Segments are produced by splitting on runs of ``.``, ``!`` or ``?``;
    0 is returned when no non-blank segment exists.
    """
    word_counts = []
    for piece in re.split(r'[.!?]+', text):
        trimmed = piece.strip()
        if trimmed:
            word_counts.append(len(trimmed.split()))
    if not word_counts:
        return 0
    return sum(word_counts) / len(word_counts)

def punctuation_characters(text):
    """Count the characters of ``text`` that appear in ``string.punctuation``."""
    marks = [ch for ch in text if ch in string.punctuation]
    return len(marks)

def lowercase_words(text):
    """Count the whitespace-separated tokens of ``text`` for which ``islower()`` is true."""
    lower_tokens = [token for token in text.split() if token.islower()]
    return len(lower_tokens)

def uppercase_words(text):
    """Count the whitespace-separated tokens of ``text`` for which ``isupper()`` is true."""
    upper_tokens = [token for token in text.split() if token.isupper()]
    return len(upper_tokens)

def vocabulary_richness(text):
    """Return the number of distinct whitespace-separated tokens in ``text``, ignoring case."""
    return len(set(text.lower().split()))

def number_of_urls(text):
    """Count the non-overlapping matches of the http(s) URL pattern in ``text``."""
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return sum(1 for _ in re.finditer(url_pattern, text))

def flesch_kincaid_grade_level(text):
    """Return ``textstat.flesch_kincaid_grade`` evaluated on ``text``."""
    grade = textstat.flesch_kincaid_grade(text)
    return grade

def flesch_reading_ease(text):
    """Return ``textstat.flesch_reading_ease`` evaluated on ``text``."""
    ease = textstat.flesch_reading_ease(text)
    return ease

def dale_chall_readability(text):
    """Return ``textstat.dale_chall_readability_score`` evaluated on ``text``."""
    score = textstat.dale_chall_readability_score(text)
    return score

def automated_readability_index(text):
    """Return ``textstat.automated_readability_index`` evaluated on ``text``."""
    index_value = textstat.automated_readability_index(text)
    return index_value

def coleman_liau_index(text):
    """Return ``textstat.coleman_liau_index`` evaluated on ``text``."""
    index_value = textstat.coleman_liau_index(text)
    return index_value

def gunning_fog(text):
    """Return ``textstat.gunning_fog`` evaluated on ``text``."""
    fog = textstat.gunning_fog(text)
    return fog

def smog_index(text):
    """Return ``textstat.smog_index`` evaluated on ``text``."""
    smog = textstat.smog_index(text)
    return smog

def linsear_write_index(text):
    """Return the value ``textstat.linsear_write_formula`` computes for ``text``."""
    linsear = textstat.linsear_write_formula(text)
    return linsear


# Maps each metric function to a two-item list: [label, definition].
# Index 0 is the short label spliced into the instruction sentence and index 1 is
# the longer definition text; both are read by the prompt-building loop below.
# NOTE(review): every label starts with a space, and the prompt-building loop also
# inserts a space before it, so the generated instruction contains a double space.
# NOTE(review): these strings are emitted verbatim into prompts and into DataFrame
# column names, so the wording issues flagged inline are left untouched here — fix
# them together with any downstream consumer that matches on the exact text.
metric_to_instructions = {
    total_characters: [" number of total characters", "Total characters typically refers to the count of all individual characters, including letters, numbers, punctuation marks, spaces, and any other symbols, within a given text."],
    uppercase_characters: [" number of uppercase characters", "Uppercase characters refer to letters in the alphabet that are written or printed in their capital form. In English, uppercase characters include the letters A through Z. These characters are often used at the beginning of sentences, for proper nouns, and in acronyms."],
    lowercase_characters: [" number of lowercase characters", "Lowercase characters refer to letters in the alphabet that are written or printed in their smaller form. In English, lowercase characters include the letters a through z. These characters are commonly used in the body of sentences and words."],
    special_characters: [" number of special characters", "Special characters are symbols or characters that are not letters or numbers. They include punctuation marks such as commas, periods, exclamation points, question marks, as well as symbols like asterisks, ampersands, hashtags, dollar signs, and various other characters used for specific purposes in writing, coding, or communication."],
    numbers: [" number of numbers", "Numbers are symbols or words used to represent quantities, values, or positions in a numerical system."],
    blanks: [" number of blanks", "Blanks refer to the empty spaces or gaps between words, sentences, or characters."],
    number_of_words: [" number of words", "Words refer to sequences of characters that are separated by spaces or punctuation marks and convey meaning."],
    average_length_of_words: [" average length of words", "Average length of the word typically refers to the mean number of characters in the words of a given text. It's calculated by dividing the total number of characters in all the words by the total number of words in the text."],
    number_of_propositions: [" number of propositions", "Proposition is used to refer to individual segments of text that are separated by common sentence-ending punctuation marks (periods, exclamation marks, and question marks)."],
    # NOTE(review): this definition speaks of "characters", but
    # average_length_of_propositions() averages the number of words per proposition.
    average_length_of_propositions: [" average length of propositions", "Average length of propositions refers to the mean number of characters in the propositions or sentences within a given text. To calculate the average length of propositions, you'd first need to identify and isolate each proposition in the text, then compute the average length of characters across all propositions."],
    punctuation_characters: [" number of punctuation characters", "Punctuation characters are symbols used in writing to aid in understanding and interpreting the text by indicating pauses, boundaries, emphasis, and intonation."],
    lowercase_words: [" number of lowercase words", "Lowercase words in a text are words that are written using lowercase letters."],
    uppercase_words: [" number of uppercase words", "Uppercase words in a text are words that are written using uppercase or capital letters."],
    vocabulary_richness: [" number of vocabulary richness", "Vocabulary Richness is the length of the text without duplicated words."],
    number_of_urls: [" number of urls", "URL is a specific type of text string used to identify the location of a resource on the internet."],
    flesch_kincaid_grade_level: [" flesch kincaid grade level", "The formula for calculating Flesch Kincaid Grade Level is 0.39*(E)+11.8*(G)-15.59, where G is the average number of syllable per word, while E is the average number of words per proposition."],
    # NOTE(review): "perproposition" below is missing a space.
    flesch_reading_ease: [" flesch reading ease", "The formula for calculating Flesch Reading Ease is 206.835-(84.6*G)-(1.015*E), where G is the average number of syllable per word, while E is the average number of words perproposition."],
    dale_chall_readability: [" dale chall readability", "The formula for calculating Dale Chall Readability is 0.1579*(PDW)+0.0496*ASL, where PDW is the percentage of difficult words (words that do not appear on a specially designed list of common words familiar to most 4th-grade students), while ASL is the average length of a proposition in words."],
    automated_readability_index: [" automated readability index", "The formula for calculating Automated Readability Index is 4.71*C/W+0.5*W/P-21.43, where W is the number of words contained in the text, C is the number of the total amount of characters in the text, while P is the number of propositions in the text."],
    coleman_liau_index: [" coleman liau index", "The formula for calculating Coleman Liau Index is 0.0588*L-0.296*S-15.8, where S is the average number of propositions per 100 words while L is the average number of letters per 100 words."],
    gunning_fog: [" gunning fog", "The formula for Gunning Fog is 0.4*(W/P+100*DW/W), where W is the number of words contained in the text, DW is the number of words consisting of three or more syllables, while P is the number of propositions in the text."],
    smog_index: [" smog index", "The formula for SMOG index is 1.0430*sqrt(DW*30/P)+3.1391, where DW is the number of words consisting of three or more syllables while P is the number of propositions in the text."],
    # NOTE(review): "Lineaser" and "1is" below look like typos ("Linsear", "1 is").
    linsear_write_index: [" linsear write index", "The definition for Lineaser Write is for each word with two or less syllables an index is increased by 1, while for each word with more than three syllables, the index is increased by 3. Finally, the resulting number is divided by the number of propositions. If the result is greater than 20 it is divided by 2, otherwise it is divided by 2 and 1is subtracted from this number."],
}

# The three directions a metric can be asked to move in.
OPTIONS = ["increase", "decrease", "don't change"]

# Every NUM_COMBINATIONS-sized subset of the metric functions, in dict order.
# itertools.combinations returns a single-use iterator.
metric_combinations = itertools.combinations(
    list(metric_to_instructions), NUM_COMBINATIONS
)

def generate_all_pairs(list1, list2):
    """Enumerate every way of assigning one element of ``list2`` to each element of ``list1``.

    Returns a list of assignments. Each assignment is a list of
    ``(list1_item, list2_item)`` tuples, one per element of ``list1`` and in the
    order of ``list1``. Assignments are ordered with the choice for the first
    element of ``list1`` varying slowest.
    """
    assignments = []
    for choice in itertools.product(list2, repeat=len(list1)):
        assignments.append(list(zip(list1, choice)))
    return assignments

# For each subset of metrics, enumerate every assignment of an option to its metrics.
all_combinations = [
    generate_all_pairs(comb, OPTIONS) for comb in metric_combinations
]

In [18]:
# Display the nested list of (metric function, option) assignments for inspection.
all_combinations

[[[(<function __main__.total_characters(text)>, 'increase')],
  [(<function __main__.total_characters(text)>, 'decrease')],
  [(<function __main__.total_characters(text)>, "don't change")]],
 [[(<function __main__.uppercase_characters(text)>, 'increase')],
  [(<function __main__.uppercase_characters(text)>, 'decrease')],
  [(<function __main__.uppercase_characters(text)>, "don't change")]],
 [[(<function __main__.lowercase_characters(text)>, 'increase')],
  [(<function __main__.lowercase_characters(text)>, 'decrease')],
  [(<function __main__.lowercase_characters(text)>, "don't change")]],
 [[(<function __main__.special_characters(text)>, 'increase')],
  [(<function __main__.special_characters(text)>, 'decrease')],
  [(<function __main__.special_characters(text)>, "don't change")]],
 [[(<function __main__.numbers(text)>, 'increase')],
  [(<function __main__.numbers(text)>, 'decrease')],
  [(<function __main__.numbers(text)>, "don't change")]],
 [[(<function __main__.blanks(text)>, 'inc

In [19]:
# Add one prompt column to test_df per (metric, option) assignment.
# Each column is named "par <instructions>" and holds, for every row, the full
# prompt built from those instructions and the row's 'User Story'.
for combination in all_combinations:
    for pairs in combination:
        # e.g. "increase  number of words, decrease  number of urls"; the labels
        # keep their leading space so the text matches previously generated prompts.
        instruction_text = ", ".join(
            f'{option} {metric_to_instructions[metric][0]}' for metric, option in pairs
        )
        prompt_instructions = "Based on the following instruction: " + instruction_text
        if IS_DEFINITION_INCLUDED:
            # Join the definitions with a space: plain concatenation fused the
            # sentences ("...text.Uppercase...") whenever a prompt covers more than
            # one metric. Single-metric prompts are unchanged.
            definition = " ".join(
                metric_to_instructions[metric][1] for metric, _ in pairs
            )
            prompt_instructions = definition + " " + prompt_instructions

        prompt_prefix = f"{prompt_instructions}.  Paraphrase the following user story and output only paraphrased version: \n"
        # Assign the whole column positionally in one step instead of writing cell
        # by cell with iterrows()/.at: this avoids mutating the frame while it is
        # being iterated and stays correct when index labels repeat. On an empty
        # frame this still creates the (empty) column.
        test_df[f"par {prompt_instructions}"] = [
            f"{prompt_prefix}{user_story}" for user_story in test_df['User Story']
        ]

In [20]:
# Display test_df to inspect the prompt columns added by the previous cell.
test_df

Output hidden; open in https://colab.research.google.com to view.

In [21]:
# Write test_df (including the generated prompt columns) to a CSV file in the
# working directory. No index= argument is passed, so pandas' default applies and
# the row index is written as the first column.
test_df.to_csv('test_data_finetuning.csv')

In [23]:
from google.colab import files

# Hand both dataset files to Colab's download helper, train file first.
# NOTE(review): 'train_data_finetuning.csv' is not written in the cells visible
# here — presumably it is produced earlier in the notebook; confirm before running.
for csv_name in ('train_data_finetuning.csv', 'test_data_finetuning.csv'):
    files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>