In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm

In [2]:
import xml.etree.ElementTree as ET
from html import unescape

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
file_path = '/content/drive/My Drive/Colab Notebooks/dblp.xml'


In [4]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_dblp(xml_file):
    context = ET.iterparse(xml_file, events=('end',))
    _, root = next(context)
    records = []

    for event, elem in context:
        if elem.tag in {'article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis'}:
            record = {}
            for child in elem:
                if child.tag in record:
                    if isinstance(record[child.tag], list):
                        record[child.tag].append(child.text)
                    else:
                        record[child.tag] = [record[child.tag], child.text]
                else:
                    record[child.tag] = child.text
            records.append(record)
            elem.clear()
            root.clear()
    return pd.DataFrame(records)

# Parse the XML file
df = parse_dblp(file_path)


ParseError: undefined entity &Ouml;: line 90, column 17 (<string>)

In [None]:
with open(file_path, 'r', encoding='utf-8') as file:
    xml_content = file.read()

# Unescape HTML entities like &Ouml;, &amp;, etc.
xml_content = unescape(xml_content)

# Save the cleaned XML content back to a new file (this helps to avoid reading the same file again)
cleaned_file_path = 'cleaned_dblp.xml'
with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(xml_content)

# Now use iterparse with the cleaned file to avoid memory issues
context = ET.iterparse(cleaned_file_path, events=('start', 'end'))
_, root = next(context)  # Grab the root element to start the parsing

papers = []

# Define tags of interest
valid_tags = ['article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis']

# Process the XML in a memory-efficient manner
for event, elem in context:
    if event == 'end' and elem.tag in valid_tags:
        # Extract relevant data
        entry_type = elem.tag
        title = elem.find('title').text if elem.find('title') is not None else ''
        authors = [author.text for author in elem.findall('.//author')]  # List of authors
        keywords = [keyword.text for keyword in elem.findall('.//keyword')]  # List of keywords
        citations = elem.find('citation').text if elem.find('citation') is not None else 0

        # Append data to the list
        papers.append({
            'type': entry_type,
            'title': title,
            'authors': authors,
            'keywords': keywords,
            'citations': citations
        })

        # Clear the element to free up memory
        root.clear()

# Convert to DataFrame
df = pd.DataFrame(papers)

# Print the first few rows to verify
print(df.head())

In [None]:
import xml.etree.ElementTree as ET
from html import unescape
import pandas as pd

# Define the path to your XML file
file_path = '/content/drive/My Drive/Colab Notebooks/dblp.xml'


# Function to unescape HTML entities in chunks (avoiding large memory usage)
def unescape_xml_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return unescape(content)

# Process the XML file in chunks
def parse_xml_in_chunks(file_path):
    context = ET.iterparse(file_path, events=('start', 'end'))
    _, root = next(context)  # Grab the root element to start the parsing

    papers = []

    valid_tags = ['article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis']

    # Processing in chunks
    for event, elem in context:
        if event == 'end' and elem.tag in valid_tags:
            # Extract relevant data
            entry_type = elem.tag
            title = elem.find('title').text if elem.find('title') is not None else ''
            authors = [author.text for author in elem.findall('.//author')]  # List of authors
            keywords = [keyword.text for keyword in elem.findall('.//keyword')]  # List of keywords
            citations = elem.find('citation').text if elem.find('citation') is not None else 0

            # Append data to the list
            papers.append({
                'type': entry_type,
                'title': title,
                'authors': authors,
                'keywords': keywords,
                'citations': citations
            })

            # Clear the element to free up memory
            root.clear()

        # To avoid too much memory usage, you could process in batches
        if len(papers) > 1000:  # Adjust this to your needs
            # Convert to DataFrame and process, then clear the data
            df = pd.DataFrame(papers)
            # Optionally, save or process the chunk here
            # For now, we clear the list to save memory
            papers.clear()

    # Final conversion of any remaining data
    if papers:
        df = pd.DataFrame(papers)
        # Optionally, save the final DataFrame
        papers.clear()

    return df

# Step 1: Unescape XML file content to handle special characters
xml_content = unescape_xml_file(file_path)

# Save the cleaned XML content to a temporary file
cleaned_file_path = 'cleaned_dblp.xml'
with open(cleaned_file_path, 'w', encoding='utf-8') as f:
    f.write(xml_content)

# Step 2: Parse the cleaned XML file in chunks
df = parse_xml_in_chunks(cleaned_file_path)

# Check the result
print(df.head())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Combine titles and keywords for vectorization
df['text'] = df['title'] + " " + df['keywords'].apply(lambda x: ' '.join(x))

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])

# Normalizing the data
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X.toarray())


In [None]:
from sklearn.metrics import pairwise_distances

# Compute the Euclidean distance matrix
distance_matrix = pairwise_distances(X_scaled, metric='euclidean')


In [None]:
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

# Perform Agglomerative Clustering
Z = sch.linkage(distance_matrix, method='ward')  # Using 'ward' minimizes the variance of clusters

# Plot the Dendrogram
plt.figure(figsize=(10, 7))
sch.dendrogram(Z)
plt.title('Dendrogram')
plt.xlabel('Paper Index')
plt.ylabel('Euclidean Distance')
plt.show()


In [None]:
from scipy.cluster.hierarchy import fcluster

# Cutting the dendrogram to form flat clusters
max_d = 50  # This is a distance threshold; you can change it based on your dendrogram
clusters = fcluster(Z, max_d, criterion='distance')

# Add cluster labels to the dataframe
df['cluster'] = clusters


In [None]:
import lxml.etree as ET

# Parse using lxml
tree = ET.parse(file_path)
root = tree.getroot()


In [None]:
from sklearn.decomposition import PCA

# Perform PCA for dimensionality reduction to 2D
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X_scaled)

# Scatter plot of the clusters
plt.figure(figsize=(10, 7))
plt.scatter(pca_components[:, 0], pca_components[:, 1], c=df['cluster'], cmap='rainbow')
plt.title('Cluster Visualization (PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()


In [None]:
# Analyze clusters
for cluster_num in df['cluster'].unique():
    cluster_papers = df[df['cluster'] == cluster_num]
    print(f"Cluster {cluster_num} contains {len(cluster_papers)} papers")
    print(f"Most common keywords in Cluster {cluster_num}:")
    all_keywords = ' '.join(cluster_papers['keywords'].apply(lambda x: ' '.join(x)))
    print(pd.Series(all_keywords.split()).value_counts().head(10))
    print("\n")


In [None]:
drive.flush_and_unmount()

In [None]:
https://www.kaggle.com/datasets/dheerajmpai/dblp2023 ,this is the dataset and  ask-2. Hierarchical Clustering

Concept:

Hierarchical clustering creates a tree-like structure (dendrogram) representing nested clusters.

It does not require predefining K.

Two types:

Agglomerative (Bottom-Up): Each data point starts as its own cluster, and clusters are merged iteratively.

Divisive (Top-Down): Starts with one large cluster, which is split recursively.

Steps:

Normalize the dataset.

Compute the distance matrix using Euclidean distance.

Apply Agglomerative Clustering (most common method).

Use the Dendrogram to determine the optimal number of clusters.

Visualize your analysis before and after clustering
Analyze and interpret cluster characteristics.
For example : Network Feature Extraction: Construct adjacency matrices from:

Citation relationships
Author-venue affiliations
Paper keyword co-occurrence
Cluster Interpretation Framework
Provide domain-specific analysis prompts: "Identify emerging CS subfields through cluster evolution 2010-2025" "Map cluster hierarchies to ACM Computing Classification System" "Analyze Nobel laureate collaboration patterns through dendrogram cuts"

Dataset for Hierarchical Clustering- The DBLP Computer Science Bibliography Dataset


[ ]
1  this is te task .. do sth

In [None]:
import pandas as pd

# Example: Extracting paper titles, authors, keywords, and citations
papers = []
for entry in root.findall(".//article"):  # Adjust this path based on the XML structure
    title = entry.find('title').text if entry.find('title') is not None else ''
    authors = [author.text for author in entry.findall('.//author')]  # List of authors
    keywords = [keyword.text for keyword in entry.findall('.//keyword')]  # List of keywords
    citations = entry.find('citation').text if entry.find('citation') is not None else 0
    papers.append({'title': title, 'authors': authors, 'keywords': keywords, 'citations': citations})

# Convert to DataFrame
df = pd.DataFrame(papers)
print(df.head())


In [None]:
# # Load the XML file
# tree = ET.parse(file_path)  # Replace with the path to your dataset file
# root = tree.getroot()

# # Inspect the structure of the XML
# print(root.tag, root.attrib)

# import xml.etree.ElementTree as ET
# from html import unescape  # Import unescape to handle special characters

# # Read the XML file and unescape entities
# with open(file_path, 'r', encoding='utf-8') as file:
#     xml_content = file.read()

# # Unescape HTML entities (like &Ouml;)
# xml_content = unescape(xml_content)

# # Now parse the cleaned XML content
# root = ET.fromstring(xml_content)

# # Proceed with your existing parsing code
# context = ET.iterparse(file_path, events=('start', 'end'))
# _, root = next(context)

# # Continue with your logic...


In [None]:
# Path to the XML file (update with your path)
file_path = "dblp.xml"

# Parse the XML
tree = ET.parse(file_path)
root = tree.getroot()

# Extract data
entries = []
for elem in root.findall('article'):
    title = elem.find('title').text if elem.find('title') is not None else None
    year = elem.find('year').text if elem.find('year') is not None else None
    journal = elem.find('journal').text if elem.find('journal') is not None else None
    authors = [author.text for author in elem.findall('author')]

    entries.append({
        'title': title,
        'year': year,
        'journal': journal,
        'authors': ", ".join(authors)
    })

# Convert to DataFrame
df = pd.DataFrame(entries)

# Drop entries with missing data
df.dropna(inplace=True)

# Preview
print(df.head())


In [None]:
# Define tags of interestc
valid_tags = ['article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis']

# Parse in an efficient way
context = ET.iterparse(file_path, events=('start', 'end'))
_, root = next(context)  # grab the root element

# For storing parsed data
data = []

print("Parsing XML (this may take a few minutes)...")

for event, elem in tqdm(context):
    if event == 'end' and elem.tag in valid_tags:
        entry_type = elem.tag
        authors = [a.text for a in elem.findall('author')]
        year_elem = elem.find('year')
        venue_elem = elem.find('journal') or elem.find('booktitle')

        if authors and year_elem is not None:
            year = year_elem.text
            venue = venue_elem.text if venue_elem is not None else 'Unknown'
            data.append({
                'type': entry_type,
                'author_count': len(authors),
                'year': int(year),
                'venue': venue
            })

        # Clear the element to save memory
        root.clear()

# Convert to DataFrame
df = pd.DataFrame(data)
print("Parsed entries:", len(df))
df.head()


Parsing XML (this may take a few minutes)...


166it [00:00, 81624.20it/s]


ParseError: undefined entity &Ouml;: line 90, column 17 (<string>)