In [5]:
# Imports
import spacy
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
!python -m spacy download en_core_web_sm

In [4]:
nlp = spacy.load('en_core_web_sm')
word2vec_model = api.load('word2vec-google-news-300')



In [27]:
def parsed_feature():
    scenarios = []
    current_scenario = []
    scenario_start = False

    with open("../data/Calculator.feature", 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith("Scenario:"):
                if scenario_start:
                    scenarios.append("\n".join(current_scenario))
                current_scenario = [line]
                scenario_start = True
            elif scenario_start:
                current_scenario.append(line)
        if current_scenario:
            scenarios.append("\n".join(current_scenario))
    return scenarios

scenarios = parsed_feature()

In [22]:
def calculate_tfidf(steps):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(steps).toarray()
    feature_names = vectorizer.get_feature_names_out()
    step_index = {step: index for index, step in enumerate(steps)}
    return tfidf_matrix, feature_names, step_index

def semantic_similarity_with_tfidf(step1, step2, model, tfidf_matrix, feature_names, step_index):
    doc1 = nlp(step1)
    doc2 = nlp(step2)

    # print(doc1)
    # print(doc2)
    # print('+++++++++++++++++')

    vec1 = np.zeros(model.vector_size)
    # print(vec1)
    vec2 = np.zeros(model.vector_size)
    # print(vec1)
    # print('-------------------')

    for token in doc1:
        if token.text in model and token.text in feature_names:
            token_index = feature_names.tolist().index(token.text)
            tfidf_value = tfidf_matrix[step_index[step1], token_index]
            vec1 += model[token.text] * tfidf_value

    for token in doc2:
        if token.text in model and token.text in feature_names:
            token_index = feature_names.tolist().index(token.text)
            tfidf_value = tfidf_matrix[step_index[step2], token_index]
            vec2 += model[token.text] * tfidf_value

    vec1 = vec1 / np.linalg.norm(vec1)
    vec2 = vec2 / np.linalg.norm(vec2)

    similarity = np.dot(vec1, vec2)
    return similarity

def is_third_person(scenario):
  doc = nlp(scenario)
  for token in doc:
    if token.pos_ == 'PRON' and token.text.lower() in ['i', 'we', 'you', 'me', 'us']:
      return False
  return True

def has_and_on_same_line(scenario):
  steps = scenario.strip().split('\n')
  and_count = 0
  for step in steps:
    and_count = step.lower().split().count('and')
    if not step.lower().startswith('and'):
      and_count += 1
    if and_count > 1:
      return True
  return False

def has_correct_sequence(scenario):
    sequence = ['Given', 'When', 'Then']
    lines = scenario.strip().split('\n')
    steps = [line.split()[0] for line in lines if line.strip() and not line.strip().startswith('Scenario')]
    current_step = 0
    for step in steps:
        if step == sequence[current_step]:
            current_step += 1
            if current_step == len(sequence):
                break
        elif step != 'And':
            return False
    return current_step == len(sequence)

def validate_semantics_with_tfidf_word2vec(steps, model, tfidf_matrix, feature_names, step_index):
    for i in range(1, len(steps)):
        similarity = semantic_similarity_with_tfidf(steps[i-1], steps[i], model, tfidf_matrix, feature_names, step_index)
        if similarity < 0.3:
            return f"Low semantic similarity between steps:\n  '{steps[i-1]}'\n  '{steps[i]}'"
    return None


def identify_dissonant_words(step, model, tfidf_vector, feature_names):
  dissonant_words = []

  # Analisar cada palavra no step
  doc = nlp(step)

  for token in doc:
    if token.text in model and token.text in feature_names:
      token_index = feature_names.tolist().index(token.text)
      tfidf_value = tfidf_vector[token_index]
      # Verificar se a palavra está próxima de zero no embedding
      if np.linalg.norm(model[token.text]) == 0 or tfidf_value < 0.1:
        dissonant_words.append(token.text)

  return dissonant_words

In [31]:
def validate_scenario(scenario):
    lines = scenario.strip().split('\n')
    title = lines[0]  # e.g., "Scenario: Add two negative numbers"
    steps = lines[1:]

    tfidf_matrix, feature_names, step_index = calculate_tfidf(steps)
    semantic_error = validate_semantics_with_tfidf_word2vec(
        steps, word2vec_model, tfidf_matrix, feature_names, step_index
    )

    output_lines = [f"  {title}"]
    
    for i, step in enumerate(steps):
        errors = []

        # Check if step is not in third person
        if not is_third_person(step):
            errors.append("Step not in third person")

        # Check if 'And' is misused in the line
        if has_and_on_same_line(step):
            errors.append("'And' misused in the same line")

        # Check semantic similarity to previous step (except for the first)
        if i > 0:
            sim = semantic_similarity_with_tfidf(steps[i-1], step, word2vec_model, tfidf_matrix, feature_names, step_index)
            if sim < 0.3:
                errors.append("Low semantic similarity with previous step")

        if errors:
            formatted_errors = "     ->  (" + "; ".join(errors) + ")"
        else:
            formatted_errors = ""

        output_lines.append(f"    {step}{formatted_errors}")

    # Final check: sequence of Given → When → Then
    if not has_correct_sequence(scenario):
        output_lines.append("    ⚠ Error: Incorrect step sequence. Must be 'Given -> When -> Then'.")

    return "\n".join(output_lines)


In [16]:
scenarios = parsed_feature()
print(scenarios)
print(scenarios[3])

['Scenario: Add two positive numbers\nGiven the calculator is turned on\nAnd the number 15 is entered and the addition operator is selected\nAnd the number 10 is entered\nWhen the equals button is pressed\nThen the result should be 25\n', 'Scenario: Add two negative numbers\nGiven the calculator is turned on\nAnd I enter the number -5\nAnd the addition operator is selected\nAnd you enter number -3\nWhen the equals button is pressed\nThen we can see result -8\n', 'Scenario: Subtract a smaller number from a larger number\nGiven the calculator is turned on\nThen the number 20 is entered\nAnd the subtraction operator is selected\nAnd the number 5 is entered\nWhen the equals button is pressed\nThen the result should be 15\n', 'Scenario: Subtract a larger number from a smaller number\nGiven the calculator is turned on\nAnd the number 5 is entered\nAnd the subtraction operator is selected\nAnd the number 20 is entered\nWhen the monkey eats banana\nThen the result should be -15\n', 'Scenario: 

In [17]:
# Erro Multiplos passos aninhados
print(validate_scenario(scenarios[0]))

  Scenario: Add two positive numbers
    Given the calculator is turned on
    And the number 15 is entered and the addition operator is selected ('And' misused in the same line)
    And the number 10 is entered
    When the equals button is pressed
    Then the result should be 25


In [18]:
# Erro Primeira pessoa
print(validate_scenario(scenarios[1]))

  Scenario: Add two negative numbers
    Given the calculator is turned on
    And I enter the number -5 (Step not in third person; Low semantic similarity with previous step)
    And the addition operator is selected
    And you enter number -3 (Step not in third person)
    When the equals button is pressed
    Then we can see result -8 (Step not in third person)


In [19]:
# Erro de sequênia lógica
print(validate_scenario(scenarios[2]))

  Scenario: Subtract a smaller number from a larger number
    Given the calculator is turned on
    Then the number 20 is entered
    And the subtraction operator is selected
    And the number 5 is entered
    When the equals button is pressed
    Then the result should be 15
    ⚠ Error: Incorrect step sequence. Must be 'Given -> When -> Then'.


In [20]:
# Erro de semântica
print(validate_scenario(scenarios[3]))

  Scenario: Subtract a larger number from a smaller number
    Given the calculator is turned on
    And the number 5 is entered
    And the subtraction operator is selected
    And the number 20 is entered
    When the monkey eats banana (Low semantic similarity with previous step)
    Then the result should be -15 (Low semantic similarity with previous step)


In [32]:
# Erro de semântica
print(validate_scenario(scenarios[5]))

  Scenario: Add invalidly described values
    Given I turn the calculator on and I want to do math and itâ€™s broken     ->  (Step not in third person; 'And' misused in the same line)
    Then you click something and then we pray     ->  (Step not in third person; 'And' misused in the same line)
    When we see the screen and we type stuff     ->  (Step not in third person; 'And' misused in the same line)
    Given the monkey eats banana     ->  (Low semantic similarity with previous step)
    ⚠ Error: Incorrect step sequence. Must be 'Given -> When -> Then'.
