In [11]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [33]:
file_paths = glob.glob("*.csv")

dfs = []
for path in file_paths:
    df = pd.read_csv(path)  # use a consistent name
    df["source"] = path.split("/")[-1]  # add a 'source' column with the file name
    dfs.append(df)

# Merge all dataframes into one
data = pd.concat(dfs, ignore_index=True)
print("✅ Loaded data shape:", data.shape)
data.head()


✅ Loaded data shape: (618, 9)


Unnamed: 0,name,profile,department,research_area,email,phd,postdoc,source,title
0,Sunil Kumar Khare,https://www.iiserkol.ac.in/web/en/people/facul...,Biological Sciences,"Biochemistry, Chemistry and biology, Microbial...",skkhare [AT] iiserkol.ac.in,"PhD (Biochemistry), IIT Delhi, 1990",,iiserkol_facultybio.csv,
1,Amirul Islam Mallick,https://www.iiserkol.ac.in/web/en/people/facul...,Biological Sciences,"Host-pathogen interaction, Molecular Immunolog...",amallick [AT] iiserkol.ac.in,"PhD (Biotechnology), Interdisciplinary Biotech...",,iiserkol_facultybio.csv,
2,Amit Kumar Mandal,https://www.iiserkol.ac.in/web/en/people/facul...,Biological Sciences,Molecular Medicine - Structural proteomics and...,amitkm [AT] iiserkol.ac.in,"PhD (Biophysical Chemistry), Bose Institute (J...","Postdoctoral Fellow, University of Texas Healt...",iiserkol_facultybio.csv,
3,Anindita Bhadra,https://www.iiserkol.ac.in/web/en/people/facul...,Biological Sciences,"Animal behaviour, ecology and evolution",abhadra [AT] iiserkol.ac.in,"PhD (Animal Behaviour), Centre for Ecological ...",,iiserkol_facultybio.csv,
4,Annagiri Sumana,https://www.iiserkol.ac.in/web/en/people/facul...,Biological Sciences,Behaviour and Ecology,sumana [AT] iiserkol.ac.in,"PhD (Dominance Hierarchy in a social wasp), In...",,iiserkol_facultybio.csv,


In [34]:
data["text"] = data["phd"].astype(str) + " " + data["postdoc"].astype(str)

# Drop empty or NaN text rows
data = data.dropna(subset=["text"])
print("✅ Combined text example:")
print(data["text"].head())

✅ Combined text example:
0              PhD (Biochemistry), IIT Delhi, 1990 nan
1    PhD (Biotechnology), Interdisciplinary Biotech...
2    PhD (Biophysical Chemistry), Bose Institute (J...
3    PhD (Animal Behaviour), Centre for Ecological ...
4    PhD (Dominance Hierarchy in a social wasp), In...
Name: text, dtype: object


In [35]:
X = data["text"]
y = data["source"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Train samples:", len(X_train))
print("✅ Test samples:", len(X_test))


✅ Train samples: 494
✅ Test samples: 124


In [36]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("✅ TF-IDF vectorized shape:", X_train_tfidf.shape)


✅ TF-IDF vectorized shape: (494, 703)


In [37]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print("✅ Model trained.")

✅ Model trained.


In [22]:
print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n")
print(classification_report(y_test, y_pred))

🎯 Accuracy: 0.5967741935483871

📄 Classification Report:

                          precision    recall  f1-score   support

    faculty_with_phd.csv       0.59      1.00      0.74        36
 iiserkol_facultybio.csv       0.00      0.00      0.00         3
iiserkol_facultychem.csv       0.00      0.00      0.00         3
iiserkol_facultymath.csv       0.00      0.00      0.00         2
 iiserkol_facultyphy.csv       1.00      0.33      0.50         3
    nisersbs_faculty.csv       0.00      0.00      0.00         3
    niserscs_faculty.csv       0.00      0.00      0.00         3
    nisersms_faculty.csv       0.00      0.00      0.00         4
    nisersps_faculty.csv       0.00      0.00      0.00         5

                accuracy                           0.60        62
               macro avg       0.18      0.15      0.14        62
            weighted avg       0.39      0.60      0.46        62



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
new_phd = "  Max-Planck Institute for Mathematics in  (University of Leipzig), Max-Planck Institute for Molecular Genetics"
new_postdoc = ""

# Combine text
new_text = [new_phd + " " + new_postdoc]
new_vec = vectorizer.transform(new_text)

predicted_source = model.predict(new_vec)[0]
print("🔍 Predicted CSV belongs to:", predicted_source)

🔍 Predicted CSV belongs to: faculty_with_phd.csv
