# In [None]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words

# === Downloads ===
# sent_tokenize() needs the Punkt sentence models. Older NLTK releases load
# them from the "punkt" package, while recent releases look for "punkt_tab"
# instead, so both are requested to keep the script working on either.
# NOTE(review): nltk.download() is expected to report an unavailable package
# by returning False rather than raising - confirm for the pinned version.
nltk.download('punkt')
nltk.download('punkt_tab')
# English word list used to tell real words apart from abbreviations.
nltk.download('words')

# === Common Folder Path ===
# Directory whose *.txt files are analysed by every part of this script.
FOLDER_PATH = "/content/data/txt"

# -------------------------------
#  PART 1: Missing Abbreviation Definitions
# -------------------------------

# Report written at the end of Part 1.
OUTPUT_CSV_MISSING = "missing_abbreviation_definitions.csv"

# A standalone run of 2 to 6 capital letters counts as an abbreviation.
ABBREV_PATTERN = re.compile(r"\b[A-Z]{2,6}\b")

# "Full Form (ABBR)": group 1 is the spelled-out text, group 2 the abbreviation.
DEFINED_ABBREV_PATTERN = re.compile(r"\b([A-Za-z][A-Za-z\s\-]+)\s*\(([A-Z]{2,6})\)")

# "ABBR (Full Form)": group 1 is the abbreviation, group 2 the spelled-out text.
DEFINED_REVERSE_PATTERN = re.compile(r"\b([A-Z]{2,6})\s*\(([A-Za-z][A-Za-z\s\-]+)\)")

# Abbreviations that are never reported as undefined.
COMMON_ABBREVS = {
    "API",
    "CPU",
    "DB",
    "DNS",
    "FTP",
    "GUI",
    "HTTP",
    "HTTPS",
    "ID",
    "IP",
    "JSON",
    "OS",
    "RAM",
    "SQL",
    "UI",
    "URL",
    "XML",
}

# Lower-cased entries of the NLTK "words" corpus; an all-caps token whose
# lower-case form appears here is treated as an ordinary word.
REAL_WORDS = {entry.lower() for entry in words.words()}

def preprocess_text_missing(text):
    """Clean and normalize SRS text before NLP processing.

    The cleaning steps run in this order:

    1. replace non-ASCII runs and URLs with a space;
    2. rejoin words hyphenated across a line break;
    3. drop whole lines made only of capitals, digits, dots, dashes and
       blanks (headings, page rulers);
    4. flatten the text to a single line;
    5. drop "Page N [of M]" markers, the word "Confidential" and inline
       numbered upper-case headings such as "1.2 OVERVIEW";
    6. drop a fixed list of boilerplate SRS words;
    7. collapse repeated whitespace and strip the ends.

    Args:
        text: Raw document text, possibly spanning many lines.

    Returns:
        The cleaned text as a single line.
    """
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    text = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", text)

    # Heading lines can only be recognised while line breaks still exist, so
    # this must run before the text is flattened. The class lists blank and
    # tab explicitly (not \s) so a match can never extend across a newline.
    text = re.sub(r"^[A-Z \t\r\d.\-]{3,}$", " ", text, flags=re.MULTILINE)

    text = text.replace("\r", " ").replace("\n", " ").replace("\t", " ")
    text = re.sub(r"\bPage\s*\d+(\s*of\s*\d+)?\b", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"\bConfidential\b", " ", text, flags=re.IGNORECASE)

    # Inline numbered heading. The trailing group stops the match from ending
    # on a capital that begins a normal word, so "1.2 OVERVIEW The tool"
    # keeps "The" intact instead of leaving "he tool".
    text = re.sub(
        r"\d+(\.\d+)*\s+[A-Z][A-Z\s]+(?:(?<=\s)|(?![a-z]))", " ", text
    )

    srs_junk = (
        "figure", "table", "document", "revision", "purpose",
        "scope", "requirement", "module", "version", "description",
    )
    # One alternation removes every boilerplate word in a single pass.
    text = re.sub(rf"\b(?:{'|'.join(srs_junk)})\b", " ", text, flags=re.IGNORECASE)

    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

# Column layout of the Part 1 report. Passing it to the DataFrame keeps the
# header row in the CSV even when no abbreviation is reported.
MISSING_COLUMNS = ["File", "Abbreviation", "Type of Bad Smell", "Sentence"]

# Number of leading lines dropped from every file as front matter. Files
# with no more lines than this contribute no text at all.
METADATA_LINE_COUNT = 50

# Sentence filters, compiled once instead of once per sentence.
LETTER_PATTERN = re.compile(r'[A-Za-z]')
YEAR_PATTERN = re.compile(r'\b(19|20)\d{2}\b')
DATE_PATTERN = re.compile(
    r'\d{1,2}[-/](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)',
    re.IGNORECASE)
INITIAL_NAME_PATTERN = re.compile(r'[A-Z]\.\s*[A-Z][a-z]+')

rows_missing = []

# Sorted so that the report rows come out in the same order on every run.
for filename in sorted(os.listdir(FOLDER_PATH)):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(FOLDER_PATH, filename)
    with open(filepath, "r", encoding="utf-8", errors="ignore") as file:
        raw_text = file.readlines()

    text = "".join(raw_text[METADATA_LINE_COUNT:])  # Skip metadata
    text = preprocess_text_missing(text)
    sentences = sent_tokenize(text)

    # Abbreviations introduced as "Full Form (ABBR)" or "ABBR (Full Form)".
    defined_abbrevs = set()
    for _, abbr in DEFINED_ABBREV_PATTERN.findall(text):
        defined_abbrevs.add(abbr.strip())
    for abbr, _ in DEFINED_REVERSE_PATTERN.findall(text):
        defined_abbrevs.add(abbr.strip())

    # Candidates: used but never defined, not commonly known, and not an
    # ordinary English word written in capitals.
    all_abbrevs = set(ABBREV_PATTERN.findall(text))
    missing_abbrevs = (all_abbrevs - defined_abbrevs) - COMMON_ABBREVS
    missing_abbrevs = {abbr for abbr in missing_abbrevs if abbr.lower() not in REAL_WORDS}

    # Each abbreviation is reported once per file, with the first sentence
    # that passes the filters below.
    found_abbrevs = set()

    for sent in sentences:
        # Mostly upper-case sentences are headings or banners.
        letters = LETTER_PATTERN.findall(sent)
        if letters and sum(1 for c in letters if c.isupper()) / len(letters) > 0.7:
            continue
        # Too short to be a useful example sentence.
        if len(sent.split()) < 5:
            continue
        # Five or more all-caps tokens: treated as a list, not prose.
        caps_tokens = ABBREV_PATTERN.findall(sent)
        if len(caps_tokens) >= 5:
            continue
        # Sentences carrying a year or a day-month date are skipped.
        if YEAR_PATTERN.search(sent) or DATE_PATTERN.search(sent):
            continue
        # Pattern "X. Name" (an initial followed by a capitalised word).
        if INITIAL_NAME_PATTERN.search(sent):
            continue

        # ABBREV_PATTERN only matches whole all-caps words, so membership in
        # caps_tokens is the same test as searching for \bABBR\b.
        newly_seen = missing_abbrevs.intersection(caps_tokens) - found_abbrevs
        for abbr in sorted(newly_seen):
            rows_missing.append({
                "File": filename,
                "Abbreviation": abbr,
                "Type of Bad Smell": "Missing Abbreviation Definition",
                "Sentence": sent.strip()
            })
            found_abbrevs.add(abbr)

df_missing = pd.DataFrame(rows_missing, columns=MISSING_COLUMNS)
df_missing.to_csv(OUTPUT_CSV_MISSING, index=False)
print(f"‚úÖ Done! Found {len(df_missing)} missing abbreviation definitions.")
print(f"üìÑ Output saved to {OUTPUT_CSV_MISSING}")

# -------------------------------
#  PART 2: Inconsistent Abbreviation Usage
# -------------------------------

# Report written at the end of Part 2.
OUTPUT_CSV_INCONSISTENT = "abbreviation_inconsistencies.csv"

# Same two definition shapes as Part 1, but the full-form group is lazy.
# "Full Form (ABBR)": group 1 is the full form, group 2 the abbreviation.
DEFINED_ABBREV_PATTERN_INC = re.compile(r"\b([A-Za-z][A-Za-z\s\-]+?)\s*\(([A-Z]{2,6})\)")
# "ABBR (Full Form)": group 1 is the abbreviation, group 2 the full form.
DEFINED_REVERSE_PATTERN_INC = re.compile(r"\b([A-Z]{2,6})\s*\(([A-Za-z][A-Za-z\s\-]+?)\)")

# A candidate full form whose first word is one of these is rejected.
BAD_STARTERS = {
    "for", "in", "on", "at", "by", "to", "from",
    "and", "or", "if", "with",
    "this", "that", "these", "those",
}

def preprocess_text_inconsistency(text):
    """Flatten raw document text into one ASCII line for definition matching.

    Non-ASCII runs and URLs become a space, words hyphenated across a line
    break are rejoined, line breaks and tabs become spaces, and repeated
    whitespace is collapsed.

    Args:
        text: Raw document text.

    Returns:
        The cleaned, stripped single-line text.
    """
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", text)
    without_urls = re.sub(r"http\S+|www\S+|https\S+", " ", ascii_only)
    rejoined = re.sub(r"(\w)-\s*\n\s*(\w)", r"\1\2", without_urls)
    # Carriage return, newline and tab each map to a single space.
    single_line = rejoined.translate(str.maketrans("\r\n\t", "   "))
    return re.sub(r"\s{2,}", " ", single_line).strip()

def is_valid_full_form(full):
    """Decide whether a captured phrase is a plausible abbreviation expansion.

    A phrase is accepted when it has between 2 and 6 whitespace-separated
    words, its first word is not listed in BAD_STARTERS, and at least one of
    its purely alphabetic words starts with a capital letter.

    Args:
        full: Candidate full form.

    Returns:
        True if the phrase passes all three checks, otherwise False.
    """
    tokens = full.strip().split()
    if not 2 <= len(tokens) <= 6:
        return False
    if tokens[0].lower() in BAD_STARTERS:
        return False
    # Words containing hyphens or digits are ignored by the capital check.
    return any(tok[0].isupper() for tok in tokens if tok.isalpha())

# Column layout of the Part 2 report. Passing it to the DataFrame keeps the
# header row in the CSV even when no inconsistency is found.
INCONSISTENCY_COLUMNS = ["Full Form", "Abbreviations", "Type of Bad Smell"]

rows_inconsistency = []
# Maps each accepted full form to every abbreviation it was defined with,
# accumulated across all files in the folder.
term_to_abbrs = {}

# Sorted so that the report rows come out in the same order on every run.
for filename in sorted(os.listdir(FOLDER_PATH)):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(FOLDER_PATH, filename)
    with open(filepath, "r", encoding="utf-8", errors="ignore") as file:
        raw_text = file.read()

    text = preprocess_text_inconsistency(raw_text)

    # Normalise both definition shapes to (full form, abbreviation) pairs.
    definitions = list(DEFINED_ABBREV_PATTERN_INC.findall(text))
    definitions.extend(
        (full, abbr) for abbr, full in DEFINED_REVERSE_PATTERN_INC.findall(text)
    )

    for full, abbr in definitions:
        full_clean = full.strip()
        abbr_clean = abbr.strip()
        # Both patterns only capture 2-6 capital letters, so the
        # abbreviation needs no further length check.
        if is_valid_full_form(full_clean):
            term_to_abbrs.setdefault(full_clean, set()).add(abbr_clean)

# A full form that was given more than one abbreviation is reported.
for term, abbrs in term_to_abbrs.items():
    if len(abbrs) > 1:
        rows_inconsistency.append({
            "Full Form": term,
            "Abbreviations": ", ".join(sorted(abbrs)),
            "Type of Bad Smell": "Inconsistent Abbreviation Usage"
        })

df_inconsistency = pd.DataFrame(rows_inconsistency, columns=INCONSISTENCY_COLUMNS)
df_inconsistency.to_csv(OUTPUT_CSV_INCONSISTENT, index=False)
print(f"‚úÖ Done! Found {len(df_inconsistency)} inconsistent abbreviation definitions.")
print(f"üìÑ Output saved to {OUTPUT_CSV_INCONSISTENT}")


# In [None]:
# ========================================
#  PART 1B: Get Expected Abbreviation Definitions using Bard AI
# ========================================
from bardapi import Bard
import time

# Load Bard API Key (required for bardapi)
# NOTE(review): do not commit a real key here; the placeholder must be
# replaced locally before running this cell.
os.environ['_BARD_API_KEY'] = "YOUR_BARD_API_KEY_HERE"  # replace this with your actual key

# Reuse the missing abbreviations DataFrame
if not df_missing.empty:
    print(" Querying Bard AI for expected meanings of abbreviations...")

    bard = Bard()  # Initialize Bard client
    suggested_defs = []
    # Successful answers keyed by abbreviation. The same abbreviation can be
    # reported for several files; it is only asked about once. Failed
    # lookups are not stored, so a later row retries them.
    meaning_by_abbr = {}

    for abbr in df_missing["Abbreviation"]:
        meaning = meaning_by_abbr.get(abbr)
        if meaning is None:
            prompt = f"What is the most common full form of the abbreviation '{abbr}' in software or technical context?"
            try:
                response = bard.get_answer(prompt)
                meaning = response.get("content", "").strip()
                # Keep the first line only and drop Markdown emphasis marks.
                meaning = meaning.split("\n")[0].replace("*", "")
            except Exception as e:
                # Best effort: record the failure in the output column
                # rather than aborting the whole run.
                meaning = f"Error: {str(e)}"
            else:
                meaning_by_abbr[abbr] = meaning
            time.sleep(2)  # delay to avoid hitting request limits

        suggested_defs.append(meaning)
        print(f"‚úÖ {abbr} ‚Üí {meaning}")

    # Add new column
    df_missing["Suggested Definition"] = suggested_defs

    # Save updated CSV
    df_missing.to_csv("missing_abbreviation_definitions_with_bard.csv", index=False)
    print("\nüìÑ Output with Bard suggestions saved to 'missing_abbreviation_definitions_with_bard.csv'")
else:
    print("‚ö†Ô∏è No missing abbreviations found ‚Äî skipping Bard lookup.")
