In [None]:
from markdown import markdown

# Read the README explicitly as UTF-8. Without an encoding argument, open()
# uses the platform's locale encoding, which can garble non-ASCII characters
# (for example an en dash showing up as "â€“" in the loaded text).
with open('README.md', 'r', encoding='utf-8') as file:
    contenido_md = file.read()

In [None]:
# Show the raw README text as the cell output to inspect what was loaded.
contenido_md

"# Team API â€“ ACOPS\n\nA Team API is a description and specification that ACOPS team has defined that tells others how to interact with that team.\n\n## Team API\n\nDate: 01-06-2024\n\n* Team name and focus: AC-OPS (Automation Community - Operations).\n* Team type: Operations.\n* Part of a Platform? (y/n) Details: The ACOPS team will help the Strategic and Technical teams to more successfully achieve their roadmaps and objectives.\n* Do we provide a service to other teams? Yes Details: Providing technical support, incident resolution, and ensuring system functionality.\n* What kind of Service Level Expectations do other teams have of us? Collaboration to achieve their objetives.\n* Software owned and evolved by this team:\n* Versioning approaches:\n* Wiki search terms: AC-OPS, Automation Operations, Automation Community Operations.\n* Chat tool channels: #ac-ops-internal #Microsoft Teams chat-.\n* Time of daily sync meeting: Daily Misa for 30 min, from 09:00 to 09:30 AM. Daily meetin

In [None]:
import nltk
from nltk import ne_chunk, pos_tag

def extract_person_entities(tokens):
    """Return the tokens that are not part of a PERSON named entity.

    The tokens are POS-tagged with ``pos_tag`` and chunked with ``ne_chunk``.
    Every word found inside a chunk labelled ``PERSON`` is treated as a
    person-name word and removed from the result.

    Args:
        tokens: List of word tokens (strings).

    Returns:
        List with the input tokens, in their original order, minus every
        token whose text equals a word found inside a ``PERSON`` chunk.
    """
    tagged_tokens = pos_tag(tokens)
    tree = ne_chunk(tagged_tokens)

    # Collect the individual words of each PERSON chunk. Comparing single
    # tokens against the space-joined entity text can never match a
    # multi-word name such as "John Smith", so those names would be kept.
    person_tokens = set()
    for subtree in tree:
        if isinstance(subtree, nltk.Tree) and subtree.label() == 'PERSON':
            person_tokens.update(token for token, _pos in subtree.leaves())

    # Removal is by token text, so every occurrence of a name word is
    # dropped, including occurrences the chunker did not label.
    return [token for token, _pos in tagged_tokens if token not in person_tokens]

In [None]:
import re
import nltk
import spacy
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import CountVectorizer

# Download the required NLTK resources
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Load the spaCy models for English and Spanish
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

# Text to analyse
text = contenido_md

# Drop words written entirely in upper case
text = ' '.join(filter(lambda word: not word.isupper(), text.split()))

# Strip digits
text = re.sub(r'\d+', '', text)

# Tokenize the text, then remove person-name tokens
tokens = extract_person_entities(nltk.word_tokenize(text))

# Stopword sets keyed by language name
stopwords_lang = {
    language: set(stopwords.words(language))
    for language in ("spanish", "english")
}

# Keep only alphanumeric tokens (lower-cased) that are neither Spanish nor
# English stopwords
tokens = [
    token.lower()
    for token in tokens
    if token.isalnum()
    and token.lower() not in stopwords_lang["spanish"]
    and token.lower() not in stopwords_lang["english"]
]

# Language checks based on WordNet
def is_english_word(word):
    """Return True when WordNet has at least one synset for ``word``."""
    return bool(wordnet.synsets(word))

def is_spanish_word(word):
    """Return True when WordNet has at least one synset for ``word`` with ``lang='spa'``."""
    return len(wordnet.synsets(word, lang='spa')) > 0

# Additional common English and Spanish words not recognized by WordNet
additional_english_words = {
    'other', 'roadmaps', 'objetiv', 'without', 'recomended', 'test',
}
additional_spanish_words = {
    "facilitator", "si", "alto", "principal", "actual",
    "continuo", "favor", "idea", "junto", "toda",
}

# Classify words by language, using context scores for ambiguous ones
def classify_words(tokens):
    """Split the distinct tokens into a set of English and a set of Spanish words.

    A token counts as English when ``is_english_word`` accepts it or it is in
    ``additional_english_words``; likewise for Spanish with
    ``is_spanish_word`` and ``additional_spanish_words``. A token that matches
    exactly one language goes to that language. A token that matches both or
    neither is decided from its context (see ``get_context``): it goes to
    English only when the English score from ``calculate_ngram_probability``
    is strictly greater than the Spanish score, otherwise to Spanish.

    Args:
        tokens: List of word tokens (strings).

    Returns:
        Tuple ``(english_words, spanish_words)`` of sets of tokens.
    """
    english_words = set()
    spanish_words = set()

    # Classify each distinct token once. The results are sets and the context
    # is always taken around the token's first occurrence, so reprocessing
    # duplicates only repeats the same lookups with the same outcome.
    for token in dict.fromkeys(tokens):
        english = is_english_word(token) or token in additional_english_words
        spanish = is_spanish_word(token) or token in additional_spanish_words

        if english and not spanish:
            english_words.add(token)
        elif spanish and not english:
            spanish_words.add(token)
        else:
            # Ambiguous word: fall back to scoring its surrounding tokens.
            context = get_context(tokens, token)
            if context:
                english_prob = calculate_ngram_probability(context, 'english')
                spanish_prob = calculate_ngram_probability(context, 'spanish')
                # NOTE(review): ties go to Spanish. The scores count stopword
                # matches in the context, so if `tokens` already had its
                # stopwords removed both scores may be 0 — confirm intent.
                if english_prob > spanish_prob:
                    english_words.add(token)
                else:
                    spanish_words.add(token)

    return english_words, spanish_words

def get_context(tokens, target, window_size=2):
    """Return the tokens surrounding the first occurrence of ``target``.

    Args:
        tokens: List of tokens to search.
        target: Token whose neighbourhood is wanted.
        window_size: Number of tokens to include on each side of ``target``.

    Returns:
        The slice of ``tokens`` running from ``window_size`` positions before
        the first occurrence of ``target`` to ``window_size`` positions after
        it (clipped to the list bounds, ``target`` included), or an empty
        list when ``target`` does not occur in ``tokens``.
    """
    try:
        index = tokens.index(target)
    except ValueError:
        # Target absent: report "no context" instead of raising, so callers
        # that test the result for truthiness can simply skip the token.
        return []
    start = max(index - window_size, 0)
    end = min(index + window_size + 1, len(tokens))
    return tokens[start:end]

def calculate_ngram_probability(context, language):
    """Score ``context`` by counting matches against the stopwords of ``language``.

    The context tokens are joined with spaces and vectorized by a
    ``CountVectorizer`` over 1- and 2-grams whose vocabulary is fixed to
    ``stopwords_lang[language]``. Despite the name, the value returned is the
    sum of the raw counts, not a normalized probability.

    Args:
        context: List of tokens around the word being classified.
        language: Key into ``stopwords_lang`` ('english' or 'spanish').

    Returns:
        Sum of all vocabulary-term counts found in the context.
    """
    counter = CountVectorizer(
        vocabulary=stopwords_lang[language],
        ngram_range=(1, 2),
    )
    counts = counter.fit_transform([' '.join(context)])
    return counts.toarray().sum()

# Classify the tokens by language
english_words, spanish_words = classify_words(tokens)

# Lemmatize each word with the spaCy model of its language; every word is
# processed on its own and the lemma of its first token is kept
english_lemmatized = list(map(lambda word: nlp_en(word)[0].lemma_, english_words))
spanish_lemmatized = list(map(lambda word: nlp_es(word)[0].lemma_, spanish_words))

# Show the results
print("English Words:", english_lemmatized)
print("Spanish Words:", spanish_lemmatized)

# Compute and show each language's share of the classified words
# (both shares are 0 when nothing was classified)
total_words = len(english_lemmatized) + len(spanish_lemmatized)
if total_words > 0:
    english_percentage = (len(english_lemmatized) / total_words) * 100
    spanish_percentage = (len(spanish_lemmatized) / total_words) * 100
else:
    english_percentage = 0
    spanish_percentage = 0

print(f"English: {english_percentage:.2f}%")
print(f"Spanish: {spanish_percentage:.2f}%")


English Words: ['service', 'closely', 'system', 'interact', 'approach', 'administrative', 'resolution', 'tell', 'own', 'platform', 'technical', 'achieve', 'way', 'hour', 'close', 'create', 'https', 'automation', 'help', 'service', 'soon', 'friday', 'organisational', 'meet', 'work', 'transformation', 'provide', 'community', 'detail', 'time', 'specification', 'collaborate', 'provide', 'team', 'sync', 'ensure', 'rule', 'continuous', 'incident', 'progress', 'response', 'mode', 'functionality', 'channel', 'min', 'term', 'purpose', 'daily', 'interaction', 'description', 'stability', 'failure', 'infrastructure', 'define', 'resolve', 'kind', 'support', 'tool', 'name', 'silence', 'expectation', 'yes', 'part', 'currently', 'solution', 'type', 'incident', 'information', 'duration', 'date', 'focus', 'strategic', 'objective', 'expect', 'wide', 'drive', 'request', 'automate', 'system', 'zone', 'we', 'level', 'operation', 'request', 'evolve', 'roadmap', 'monday', 'communication', 'reserve', 'search',