In [3]:
import pandas as pd
from collections import Counter
import re
import glob

# ---- FIND ALL CSV FILES ----
file_paths = glob.glob("*.csv")  # change path if your CSVs are in another folder
print("Found CSV files:", file_paths)

# ---- HELPER FUNCTION ----
def extract_words_from_columns(df, columns):
    words = []
    for col in df.columns:
        if col.lower() in columns:
            for text in df[col].dropna().astype(str):
                # extract words (letters only)
                tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
                words.extend(tokens)
    return words

# ---- MAIN ----
all_words = []

for file in file_paths:
    try:
        df = pd.read_csv(file)
        all_words.extend(extract_words_from_columns(df, {"phd" or "Phd"}))
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# ---- COUNT AND DISPLAY ----
counter = Counter(all_words)
most_common = counter.most_common(20)

print("\nTop 10 most common words across all CSVs:")
for word, freq in most_common:
    print(f"{word}: {freq}")


Found CSV files: ['iiserkol_facultybio.csv', 'nisersms_faculty.csv', 'niserscs_faculty.csv', 'iiserkol_facultymath.csv', 'iiserkol_facultychem.csv', 'nisersps_faculty.csv', 'iiserb_physics_faculty_full.csv', 'nisersbs_faculty.csv', 'iiserkol_facultyphy.csv', 'iiserbmath_faculty_phd_postdoc.csv', 'faculty_bhopbio.csv', 'faculty_with_phd.csv', 'iiserbpr_faculty.csv', 'iiserbchem_fac.csv']

Top 10 most common words across all CSVs:
of: 261
phd: 207
institute: 158
d: 133
ph: 131
university: 129
indian: 90
in: 84
india: 78
science: 73
and: 63
bangalore: 60
physics: 57
research: 55
chemistry: 54
the: 51
prof: 47
from: 39
for: 35
sciences: 35


In [4]:
import pandas as pd
from collections import Counter
import re
import glob

# ---- FIND ALL CSV FILES ----
file_paths = glob.glob("*.csv")  # change path if your CSVs are in another folder
print("Found CSV files:", file_paths)

# ---- HELPER FUNCTION ----
def extract_words_from_columns(df, columns):
    words = []
    for col in df.columns:
        if col.lower() in columns:
            for text in df[col].dropna().astype(str):
                # extract words (letters only)
                tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
                words.extend(tokens)
    return words

# ---- MAIN ----
all_words = []

for file in file_paths:
    try:
        df = pd.read_csv(file)
        all_words.extend(extract_words_from_columns(df, {"postdoc" or "Postdoc"}))
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# ---- COUNT AND DISPLAY ----
counter = Counter(all_words)
most_common = counter.most_common(20)

print("\nTop 10 most common words across all CSVs:")
for word, freq in most_common:
    print(f"{word}: {freq}")


Found CSV files: ['iiserkol_facultybio.csv', 'nisersms_faculty.csv', 'niserscs_faculty.csv', 'iiserkol_facultymath.csv', 'iiserkol_facultychem.csv', 'nisersps_faculty.csv', 'iiserb_physics_faculty_full.csv', 'nisersbs_faculty.csv', 'iiserkol_facultyphy.csv', 'iiserbmath_faculty_phd_postdoc.csv', 'faculty_bhopbio.csv', 'faculty_with_phd.csv', 'iiserbpr_faculty.csv', 'iiserbchem_fac.csv']

Top 10 most common words across all CSVs:
postdoctoral: 128
of: 99
university: 83
fellow: 77
research: 51
usa: 44
institute: 37
post: 36
doctoral: 36
and: 32
the: 30
for: 22
associate: 21
at: 19
in: 19
postdoc: 16
prof: 15
researcher: 15
department: 15
s: 14


In [None]:
import pandas as pd
from collections import Counter
import re
import glob

# ---- FIND ALL CSV FILES ----
file_paths = glob.glob("*.csv")
print("Found CSV files:", file_paths)

# ---- HELPER FUNCTIONS ----
def clean_text(text):
    """Remove extra info, emails, punctuation, years, etc."""
    text = re.sub(r'\b\d{4}\b', '', text)          # remove years like 2015
    text = re.sub(r'\([^)]*\)', '', text)          # remove anything in (...)
    text = re.sub(r'[-–/,;]', ' ', text)           # replace separators with spaces
    text = re.sub(r'\s+', ' ', text)               # normalize whitespace
    text = re.sub(r'\b(ph\.?d\.?|post\s*doc|postdoctoral|doctoral)\b', '', text, flags=re.I)
    return text.strip()

def extract_institutes(text):
    """Extract likely institute names, keeping science/tech/IIT/IISc/IISER words."""
    text = clean_text(text)

    # keep key institution-related words
    keywords = [
        "University", "Institute", "College", "Centre", "Center",
        "Laboratory", "School", "Academy", "IIT", "IISc", "IISER",
        "NIT", "IIIT", "Science", "Technology", "Research"
    ]

    # pattern: capitalized phrases ending with keywords (case-insensitive)
    pattern = (
        r'([A-Z][A-Za-z&.\s\-]*(' + '|'.join(keywords) + r')[A-Za-z&.\s\-]*)'
    )

    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    institutes = [m[0].strip() for m in matches]
    return institutes

def extract_from_columns(df, columns):
    institutes = []
    for col in df.columns:
        if col.lower() in columns:
            for text in df[col].dropna().astype(str):
                institutes.extend(extract_institutes(text))
    return institutes

# ---- MAIN ----
all_institutes = []

for file in file_paths:
    try:
        df = pd.read_csv(file)
        all_institutes.extend(extract_from_columns(df, {"phd" or "Phd"}))
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# ---- CLEAN & COUNT ----
# Normalize capitalization (IIT -> IIT, Iisc -> IISc)
normalized = [re.sub(r'\biisc\b', 'IISc', inst, flags=re.I).title() for inst in all_institutes]
counter = Counter(normalized)
most_common = counter.most_common(20)

print("\nTop 20 most common institutes:")
for name, freq in most_common:
    print(f"{name}: {freq}")


Found CSV files: ['iiserkol_facultybio.csv', 'nisersms_faculty.csv', 'niserscs_faculty.csv', 'iiserkol_facultymath.csv', 'iiserkol_facultychem.csv', 'nisersps_faculty.csv', 'iiserb_physics_faculty_full.csv', 'nisersbs_faculty.csv', 'iiserkol_facultyphy.csv', 'iiserbmath_faculty_phd_postdoc.csv', 'faculty_bhopbio.csv', 'faculty_with_phd.csv', 'iiserbpr_faculty.csv', 'iiserbchem_fac.csv']

Top 20 most common institutes:
Indian Institute Of Science Bangalore: 9
Indian Institute Of Science: 6
Tata Institute Of Fundamental Research: 4
Tata Institute Of Fundamental Research Mumbai: 4
In From Indian Institute Of Science Bangalore India: 3
In From Indian Institute Of Technology Kanpur India: 3
Panjab University: 3
Jadavpur University: 3
Indian Institute Of Science Bangalore India: 3
Bose Institute: 2
Harish Chandra Research Institute: 2
Indian Institute Of Technology Kanpur India: 2
In Biology From Department Of Biological Sciences Tata Institute Of Fundamental Research Mumbai .: 2
Indian Inst

In [9]:
import pandas as pd
from collections import Counter
import re
import glob

# ---- FIND ALL CSV FILES ----
file_paths = glob.glob("*.csv")
print("Found CSV files:", file_paths)

# ---- HELPER FUNCTIONS ----
def clean_text(text):
    """Remove extra info, emails, punctuation, years, etc."""
    text = re.sub(r'\b\d{4}\b', '', text)          # remove years like 2015
    text = re.sub(r'\([^)]*\)', '', text)          # remove anything in (...)
    text = re.sub(r'[-–/,;]', ' ', text)           # replace separators with spaces
    text = re.sub(r'\s+', ' ', text)               # normalize whitespace
    text = re.sub(r'\b(ph\.?d\.?|post\s*doc|postdoctoral|doctoral)\b', '', text, flags=re.I)
    return text.strip()

def extract_institutes(text):
    """Extract likely institute names, keeping science/tech/IIT/IISc/IISER words."""
    text = clean_text(text)

    # keep key institution-related words
    keywords = [
        "University", "Institute", "College", "Centre", "Center",
        "Laboratory", "School", "Academy", "IIT", "IISc", "IISER",
        "NIT", "IIIT", "Science", "Technology", "Research"
    ]

    # pattern: capitalized phrases ending with keywords (case-insensitive)
    pattern = (
        r'([A-Z][A-Za-z&.\s\-]*(' + '|'.join(keywords) + r')[A-Za-z&.\s\-]*)'
    )

    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    institutes = [m[0].strip() for m in matches]
    return institutes

def extract_from_columns(df, columns):
    institutes = []
    for col in df.columns:
        if col.lower() in columns:
            for text in df[col].dropna().astype(str):
                institutes.extend(extract_institutes(text))
    return institutes

# ---- MAIN ----
all_institutes = []

for file in file_paths:
    try:
        df = pd.read_csv(file)
        all_institutes.extend(extract_from_columns(df, {"postdoc" or "Postdoc"}))
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# ---- CLEAN & COUNT ----
# Normalize capitalization (IIT -> IIT, Iisc -> IISc)
normalized = [re.sub(r'\biisc\b', 'IISc', inst, flags=re.I).title() for inst in all_institutes]
counter = Counter(normalized)
most_common = counter.most_common(20)

print("\nTop 20 most common institutes:")
for name, freq in most_common:
    print(f"{name}: {freq}")


Found CSV files: ['iiserkol_facultybio.csv', 'nisersms_faculty.csv', 'niserscs_faculty.csv', 'iiserkol_facultymath.csv', 'iiserkol_facultychem.csv', 'nisersps_faculty.csv', 'iiserb_physics_faculty_full.csv', 'nisersbs_faculty.csv', 'iiserkol_facultyphy.csv', 'iiserbmath_faculty_phd_postdoc.csv', 'faculty_bhopbio.csv', 'faculty_with_phd.csv', 'iiserbpr_faculty.csv', 'iiserbchem_fac.csv']

Top 20 most common institutes:
Fellow University Of California Riverside: 2
Researcher In The Materials Research Laboratory And Mitsubishi Chemicals Center For Advanced Materials University Of California Santa Barbara With Prof.S J. H. Waite And G. D. Stucky.: 2
Welcome Trust  Researcher In Department Of Biochemistry Oxford University Uk With Dr. Mark Howarth.: 2
Research Fellow University Of Michgan Ann Arbor Mi Usa Advisor: 2
Researcher University Of Maryland College Park U.S.A.: 2
Tsri  Fellow The Scripps Research Institute Scripps Florida Usa .: 2
Tohoku University Japan: 2
Research At Johns Hopkin