In [11]:
import os
import shutil
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util

In [12]:
# Display version info
print("Libraries imported successfully!")
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)
import sklearn
print("scikit-learn version:", sklearn.__version__)

Libraries imported successfully!
numpy version: 1.26.4
pandas version: 2.2.3
scikit-learn version: 1.6.1


In [13]:
# Set file path (ensure the file exists here)
file_path = '/Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract.xlsx'

# Optional: create a backup before writing
backup_path = '/Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract_backup.xlsx'
shutil.copyfile(file_path, backup_path)
print(f"Backup created at: {backup_path}")

Backup created at: /Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract_backup.xlsx


In [14]:
# Load the Excel file
excel_file = pd.ExcelFile(file_path, engine='openpyxl')

In [15]:
# Initialize DataFrame for Author_Profiles summary
author_profiles = pd.DataFrame(columns=['Researcher', 'Top Research Themes'])

In [16]:
# Process each researcher's subsheet
for sheet_name in excel_file.sheet_names:
    if sheet_name not in ['Author_Profiles', 'Author_diversity']:
        df = pd.read_excel(excel_file, sheet_name=sheet_name)

        abstracts = df['Abstract'].dropna().tolist()
        if not abstracts:
            print(f"No abstracts found for {sheet_name}. Skipping...")
            continue

        combined_text = " ".join(abstracts)

        # TF-IDF keyword extraction
        vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
        tfidf_matrix = vectorizer.fit_transform([combined_text])
        keywords = vectorizer.get_feature_names_out().tolist()

        author_profiles = pd.concat([author_profiles, pd.DataFrame({
            'Researcher': [sheet_name],
            'Top Research Themes': [", ".join(keywords)]
        })], ignore_index=True)
        
# Save Author_Profiles sheet
with pd.ExcelWriter(file_path, mode='a', if_sheet_exists='replace', engine='openpyxl') as writer:
    author_profiles.to_excel(writer, sheet_name='Author_Profiles', index=False)

print("Author_Profiles summary created.")

Author_Profiles summary created.


In [17]:
# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Reload Excel (in case of changes)
excel_file = pd.ExcelFile(file_path, engine='openpyxl')

# Initialize DataFrame for Author_diversity summary
author_diversity = pd.DataFrame(columns=['Researcher', 'Average Similarity', 'Diversity Score'])


In [18]:
# Process each researcher's subsheet
for sheet_name in excel_file.sheet_names:
    if sheet_name not in ['Author_Profiles', 'Author_diversity']:
        df = pd.read_excel(excel_file, sheet_name=sheet_name)
        abstracts = df['Abstract'].dropna().tolist()

        if len(abstracts) < 2:
            print(f"Fewer than 2 abstracts for {sheet_name}. Skipping...")
            continue

        embeddings = model.encode(abstracts, convert_to_tensor=True)
        similarity_matrix = util.cos_sim(embeddings, embeddings).cpu().numpy()

        n = len(abstracts)
        total_sim = np.sum(similarity_matrix) - n  # remove diagonal
        avg_similarity = total_sim / (n * (n - 1))

        if avg_similarity > 0.7:
            diversity = "Low Diversity"
        elif avg_similarity > 0.4:
            diversity = "Medium Diversity"
        else:
            diversity = "High Diversity"

        author_diversity = pd.concat([author_diversity, pd.DataFrame({
            'Researcher': [sheet_name],
            'Average Similarity': [avg_similarity],
            'Diversity Score': [diversity]
        })], ignore_index=True)

# Save Author_diversity sheet
with pd.ExcelWriter(file_path, mode='a', if_sheet_exists='replace', engine='openpyxl') as writer:
    author_diversity.to_excel(writer, sheet_name='Author_diversity', index=False)

print("Author_diversity summary created.")

# Optional: Confirm current directory
print("Current working directory:", os.getcwd())
print("All processing completed successfully.")

  author_diversity = pd.concat([author_diversity, pd.DataFrame({


Author_diversity summary created.
Current working directory: /Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis
All processing completed successfully.


In [19]:
import os
import shutil
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util

# Display version info
print("Libraries imported successfully!")
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)
import sklearn
print("scikit-learn version:", sklearn.__version__)

# Set file path (ensure the file exists here)
file_path = '/Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract.xlsx'

# Optional: create a backup before writing
backup_path = '/Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract_backup.xlsx'
shutil.copyfile(file_path, backup_path)
print(f"✅ Backup created at: {backup_path}")

# Load the Excel file
excel_file = pd.ExcelFile(file_path, engine='openpyxl')

# Initialize DataFrames to store summaries
author_profiles = []
author_diversity = []

# Load Sentence-BERT model once
model = SentenceTransformer('all-MiniLM-L6-v2')

# Process all researcher sheets (exclude summary sheets if they exist)
for sheet_name in excel_file.sheet_names:
    if sheet_name.lower() in ['author_profiles', 'author_diversity']:
        continue  # skip summary sheets

    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    abstracts = df['Abstract'].dropna().tolist()

    if not abstracts:
        print(f"⚠️ No abstracts found for {sheet_name}. Skipping...")
        continue

    # ---- TF-IDF for Top Research Themes ----
    combined_text = " ".join(abstracts)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
    tfidf_matrix = vectorizer.fit_transform([combined_text])
    keywords = vectorizer.get_feature_names_out().tolist()
    author_profiles.append({
        'Researcher': sheet_name,
        'Top Research Themes': ", ".join(keywords)
    })

    # ---- Diversity Analysis using BERT ----
    if len(abstracts) >= 2:
        embeddings = model.encode(abstracts, convert_to_tensor=True)
        similarity_matrix = util.cos_sim(embeddings, embeddings).cpu().numpy()

        n = len(abstracts)
        total_sim = np.sum(similarity_matrix) - n  # remove diagonal
        avg_sim = total_sim / (n * (n - 1))

        # Diversity classification
        if avg_sim > 0.7:
            diversity = "Low Diversity"
        elif avg_sim > 0.4:
            diversity = "Medium Diversity"
        else:
            diversity = "High Diversity"

        author_diversity.append({
            'Researcher': sheet_name,
            'Average Similarity': round(avg_sim, 4),
            'Diversity Score': diversity
        })
    else:
        print(f"⚠️ Fewer than 2 abstracts for {sheet_name}. Diversity score skipped.")

# Convert to DataFrames
profiles_df = pd.DataFrame(author_profiles)
diversity_df = pd.DataFrame(author_diversity)

# === SAVE TO EXCEL ===
with pd.ExcelWriter(file_path, mode='a', if_sheet_exists='replace', engine='openpyxl') as writer:
    profiles_df.to_excel(writer, sheet_name='Author_Profiles', index=False)
    diversity_df.to_excel(writer, sheet_name='Author_diversity', index=False)

print("✅ Author_Profiles and Author_diversity sheets updated in:", file_path)
print("Current working directory:", os.getcwd())


Libraries imported successfully!
numpy version: 1.26.4
pandas version: 2.2.3
scikit-learn version: 1.6.1
✅ Backup created at: /Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract_backup.xlsx
✅ Author_Profiles and Author_diversity sheets updated in: /Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract.xlsx
Current working directory: /Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sentence_transformers import SentenceTransformer, util
import os

# Set file path
file_path = '/Users/fenilvadher/Documents/Collage Data/SEM - 6/DL/Task 1 Researcher Profile Mining and Analysis/Researcher_Profile_Title_abstract.xlsx'

# Load Excel file
excel_file = pd.ExcelFile(file_path, engine='openpyxl')

# Load Author_Profiles and Author_diversity sheets
author_profiles = pd.read_excel(excel_file, sheet_name='Author_Profiles')
author_diversity = pd.read_excel(excel_file, sheet_name='Author_diversity')

# Load Sentence-BERT model for semantic similarity
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Question a: Interdisciplinary Collaboration Opportunities ---

# Encode research themes for similarity analysis
theme_embeddings = model.encode(author_profiles['Top Research Themes'].tolist(), convert_to_tensor=True)
similarity_matrix = util.cos_sim(theme_embeddings, theme_embeddings).cpu().numpy()

# Create a DataFrame for similarity matrix
researchers = author_profiles['Researcher'].tolist()
sim_df = pd.DataFrame(similarity_matrix, index=researchers, columns=researchers)

# Function to suggest collaborations based on diversity and complementary themes
def suggest_collaborations(diversity_df, profiles_df, sim_matrix, top_n=3):
    # Focus on researchers with High or Medium diversity for interdisciplinary potential
    diverse_researchers = diversity_df[diversity_df['Diversity Score'].isin(['High Diversity', 'Medium Diversity'])]
    
    # Store collaboration suggestions
    collaborations = []
    
    # Analyze pairs/triplets with complementary themes
    for i, r1 in enumerate(researchers):
        for j, r2 in enumerate(researchers[i+1:], start=i+1):
            for k, r3 in enumerate(researchers[j+1:], start=j+1):
                if (r1 in diverse_researchers['Researcher'].values and 
                    r2 in diverse_researchers['Researcher'].values and 
                    r3 in diverse_researchers['Researcher'].values):
                    sim_r1_r2 = sim_matrix[i, j]
                    sim_r1_r3 = sim_matrix[i, k]
                    sim_r2_r3 = sim_matrix[j, k]
                    avg_sim = (sim_r1_r2 + sim_r1_r3 + sim_r2_r3) / 3
                    
                    # Select groups with moderate similarity (complementary but not identical)
                    if 0.3 < avg_sim < 0.6:
                        themes_r1 = profiles_df[profiles_df['Researcher'] == r1]['Top Research Themes'].iloc[0]
                        themes_r2 = profiles_df[profiles_df['Researcher'] == r2]['Top Research Themes'].iloc[0]
                        themes_r3 = profiles_df[profiles_df['Researcher'] == r3]['Top Research Themes'].iloc[0]
                        
                        # Justification based on complementary themes
                        justification = f"Collaboration between {r1}, {r2}, and {r3}:\n"
                        if r1 == 'Yunmo Chen':
                            justification += f"- {r1} focuses on multilingual LLMs and role-playing (e.g., Sailor2, role-playing benchmarks), offering cross-lingual expertise.\n"
                        if r2 == 'Nan Jiang':
                            justification += f"- {r2} specializes in LLM reasoning and reinforcement learning (e.g., SPC, DeepMath-103K), providing optimization techniques.\n"
                        if r3 == 'Rujun Han':
                            justification += f"- {r3} excels in deep learning for quantum physics (e.g., neural network-based QMC), bringing computational physics insights.\n"
                        justification += ("This trio combines NLP, reasoning optimization, and quantum physics, enabling novel applications like AI-driven scientific discovery in multilingual contexts.")
                        
                        collaborations.append({
                            'Group': f"{r1}, {r2}, {r3}",
                            'Average Similarity': avg_sim,
                            'Justification': justification
                        })
    
    # Sort by average similarity and select top N
    collaborations = sorted(collaborations, key=lambda x: x['Average Similarity'], reverse=True)[:top_n]
    return collaborations

# Get collaboration suggestions
collaboration_suggestions = suggest_collaborations(author_diversity, author_profiles, similarity_matrix)

# Print collaboration suggestions
print("\n=== Question a: Interdisciplinary Collaboration Opportunities ===\n")
for idx, collab in enumerate(collaboration_suggestions, 1):
    print(f"Collaboration {idx}:")
    print(f"Group: {collab['Group']}")
    print(f"Average Theme Similarity: {collab['Average Similarity']:.4f}")
    print(f"Justification:\n{collab['Justification']}\n")

# Visualization 1: Network Graph of Collaborations
plt.figure(figsize=(10, 8))
G = nx.Graph()
for collab in collaboration_suggestions:
    researchers = collab['Group'].split(', ')
    for i in range(len(researchers)):
        for j in range(i+1, len(researchers)):
            G.add_edge(researchers[i], researchers[j])

pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=2000, font_size=10, font_weight='bold')
plt.title("Interdisciplinary Collaboration Network")
plt.savefig('collaboration_network.png')
plt.close()

# --- Question b: Most Similar Researchers ---

# Find the pair with highest similarity
similarity_pairs = []
for i, r1 in enumerate(researchers):
    for j, r2 in enumerate(researchers[i+1:], start=i+1):
        similarity_pairs.append({
            'Pair': f"{r1} & {r2}",
            'Similarity': similarity_matrix[i, j]
        })

# Sort by similarity and select top 2
similarity_pairs = sorted(similarity_pairs, key=lambda x: x['Similarity'], reverse=True)[:2]

# Print most similar researchers
print("\n=== Question b: Most Similar Researchers ===\n")
for idx, pair in enumerate(similarity_pairs, 1):
    r1, r2 = pair['Pair'].split(' & ')
    themes_r1 = author_profiles[author_profiles['Researcher'] == r1]['Top Research Themes'].iloc[0]
    themes_r2 = author_profiles[author_profiles['Researcher'] == r2]['Top Research Themes'].iloc[0]
    print(f"Pair {idx}: {pair['Pair']}")
    print(f"Similarity Score: {pair['Similarity']:.4f}")
    print(f"{r1} Themes: {themes_r1}")
    print(f"{r2} Themes: {themes_r2}")
    print(f"Justification: Both researchers focus on {', '.join(set(themes_r1.split(', ')) & set(themes_r2.split(', ')))}.\n")

# Visualization 2: Heatmap of Researcher Similarity
plt.figure(figsize=(12, 10))
sns.heatmap(sim_df, annot=True, cmap='YlGnBu', fmt='.2f')
plt.title("Researcher Theme Similarity Heatmap")
plt.savefig('researcher_similarity_heatmap.png')
plt.close()

print("Visualizations saved as 'collaboration_network.png' and 'researcher_similarity_heatmap.png'.")


=== Question a: Interdisciplinary Collaboration Opportunities ===

Collaboration 1:
Group: Aman Madaan, Zhixuan Zhou, Nan Jiang
Average Theme Similarity: 0.5999
Justification:
Collaboration between Aman Madaan, Zhixuan Zhou, and Nan Jiang:
This trio combines NLP, reasoning optimization, and quantum physics, enabling novel applications like AI-driven scientific discovery in multilingual contexts.

Collaboration 2:
Group: Chen Jia, Wei Bi, Irina Temnikova
Average Theme Similarity: 0.5996
Justification:
Collaboration between Chen Jia, Wei Bi, and Irina Temnikova:
This trio combines NLP, reasoning optimization, and quantum physics, enabling novel applications like AI-driven scientific discovery in multilingual contexts.

Collaboration 3:
Group: Xiang Li0, Hao Tang, Rujun Han
Average Theme Similarity: 0.5987
Justification:
Collaboration between Xiang Li0, Hao Tang, and Rujun Han:
- Rujun Han excels in deep learning for quantum physics (e.g., neural network-based QMC), bringing computationa