In [None]:
import pandas as pd
from collections import Counter, defaultdict
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import rdflib

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on
# NLTK 3.9+; 'punkt' is kept for older releases. By default nltk.download
# reports a failed download instead of raising, so requesting both is safe.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Initialize NLTK tools: English stop words as a set (O(1) membership tests)
# and a WordNet-based lemmatizer shared by the preprocessing step.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load RDF data from XML file
g = rdflib.Graph()
g.parse("/content/gauri1.rdf", format="xml")

# Select the name and (optional) description of every typed resource.
# DISTINCT is needed because ?type is matched but not projected: a resource
# carrying several rdf:type triples would otherwise produce one identical
# (name, description) row per type.
query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX school: <http://www.semanticweb.org/guptgau03/ontologies/2024/2/school/>

SELECT DISTINCT ?name ?description
WHERE {
    ?s rdf:type ?type .
    ?s school:name ?name .
    OPTIONAL { ?s school:description ?description . }
}
"""

results = g.query(query)

# Prepare the data: one record per result row. Only an unbound description
# (the OPTIONAL pattern did not match) is replaced by an empty string; a
# bound literal is always kept, even if it would evaluate as falsy.
data = [
    {
        'Name': str(row.name),
        'Description': '' if row.description is None else str(row.description),
    }
    for row in results
]

# Name the columns explicitly so 'Name' and 'Description' exist even when
# the query returns no rows; otherwise the later df['Description'] lookup
# would raise KeyError on an empty result.
df = pd.DataFrame(data, columns=['Name', 'Description'])

# Print the DataFrame to verify the data is loaded correctly
print(df.head(10))

def preprocess_and_tokenize(text):
    """Turn a raw description into a list of lemmatized content tokens.

    The text is lower-cased, stripped of every character listed in
    ``string.punctuation``, split with ``nltk.word_tokenize``, filtered
    against the module-level ``stop_words`` set, and each surviving token is
    passed through the module-level ``lemmatizer``.

    Args:
        text: The string to process.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_stripper)

    lemmas = []
    for token in nltk.word_tokenize(cleaned):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Tokenize every description: lower-case, strip punctuation, drop stop words, lemmatize
df['Processed'] = df['Description'].map(preprocess_and_tokenize)

def score_terms(tokens):
    """Count how often each token occurs and order the counts.

    Args:
        tokens: An iterable of hashable tokens.

    Returns:
        A plain ``dict`` mapping token -> frequency, ordered from the most to
        the least frequent token. Tokens with equal counts keep the order in
        which they were first seen.
    """
    frequencies = Counter(tokens)
    # most_common() with no argument returns every (token, count) pair,
    # highest count first, using the same stable ordering for ties.
    return dict(frequencies.most_common())

# Build a frequency-ordered term -> count mapping for each token list
df['Ranked_Terms'] = df['Processed'].map(score_terms)

def select_top_n_tags(ranked_terms, n=7):
    """Return the first ``n`` keys of an already-ranked term mapping.

    Args:
        ranked_terms: Mapping whose iteration order reflects the ranking
            (best term first).
        n: Number of leading terms to keep; defaults to 7.

    Returns:
        A list with at most ``n`` terms, in ranking order.
    """
    ordered_terms = list(ranked_terms)
    return ordered_terms[:n]

# Keep only the seven highest-ranked terms of each description
df['Top_7_Tags'] = df['Ranked_Terms'].map(lambda ranked: select_top_n_tags(ranked, 7))

# Look-up table from name to its tag list (a repeated name keeps the tags of its last row)
name_to_top_tags = dict(zip(df['Name'], df['Top_7_Tags']))

# Per-tag aggregate: "count" is the total number of occurrences of the tag,
# "names" maps each name carrying the tag to its own occurrence count.
tag_to_overall_count_and_names = defaultdict(
    lambda: {"count": 0, "names": defaultdict(int)}
)

# Walk every (name, tag) pair once and bump both counters
for name, tags in name_to_top_tags.items():
    for tag in tags:
        entry = tag_to_overall_count_and_names[tag]
        entry["count"] += 1
        entry["names"][name] += 1

# Report one line per tag: the names using it with their counts, then the total
for tag, info in tag_to_overall_count_and_names.items():
    per_name = [f"{name} - {count}" for name, count in info["names"].items()]
    print(f"{tag.capitalize()} - {', '.join(per_name)}; Total - {info['count']}")
