In [5]:
# --- Step 1: Import libraries ---
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Step 2: Load all CSVs automatically ---
file_paths = glob.glob("*.csv")  # adjust path if needed, e.g. "data/*.csv"

print("📂 Found CSV files:", file_paths)

dfs = []
for path in file_paths:
    df = pd.read_csv(path)
    
    # Add column indicating which CSV (study) this came from
    df["source"] = path.split("/")[-1]
    
    dfs.append(df)

# Combine all data
data = pd.concat(dfs, ignore_index=True)
print("✅ Total data shape:", data.shape)
print("✅ Sources found:", data['source'].unique())

# --- Step 3: Prepare text column ---
# Combine PhD and Postdoc columns into a single text column
data["text"] = data["phd"].astype(str) + " " + data["postdoc"].astype(str)

# Drop rows with missing text
data = data.dropna(subset=["text"])
print("✅ Example text data:")
print(data["text"].head())

# --- Step 4: Train-test split ---
X_train, X_test, y_train, y_test = train_test_split(
    data["text"], data["source"], test_size=0.2, random_state=42, stratify=data["source"]
)

# --- Step 5: TF-IDF vectorization ---
vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --- Step 6: Train Random Forest ---
rf = RandomForestClassifier(
    n_estimators=200,        # number of trees
    random_state=42,
    n_jobs=-1,
    max_depth=None
)
rf.fit(X_train_tfidf, y_train)

# --- Step 7: Evaluate ---
y_pred = rf.predict(X_test_tfidf)

print("\n🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# --- Step 8: Try a prediction ---
sample = ["PhD (Physics), Department of Physics (Indian Institute of Science, Bangalore), 2005,Postdoctoral Fellow, Tata Institute of Fundamental Research, Mumbai, Indi"]
sample_tfidf = vectorizer.transform(sample)
predicted_source = rf.predict(sample_tfidf)[0]
print("\n🔮 Predicted CSV (source):", predicted_source)


📂 Found CSV files: ['iiserkol_facultybio.csv', 'nisersms_faculty.csv', 'niserscs_faculty.csv', 'iiserkol_facultymath.csv', 'iiserkol_facultychem.csv', 'nisersps_faculty.csv', 'nisersbs_faculty.csv', 'iiserkol_facultyphy.csv', 'faculty_with_phd.csv']
✅ Total data shape: (618, 9)
✅ Sources found: ['iiserkol_facultybio.csv' 'nisersms_faculty.csv' 'niserscs_faculty.csv'
 'iiserkol_facultymath.csv' 'iiserkol_facultychem.csv'
 'nisersps_faculty.csv' 'nisersbs_faculty.csv' 'iiserkol_facultyphy.csv'
 'faculty_with_phd.csv']
✅ Example text data:
0              PhD (Biochemistry), IIT Delhi, 1990 nan
1    PhD (Biotechnology), Interdisciplinary Biotech...
2    PhD (Biophysical Chemistry), Bose Institute (J...
3    PhD (Animal Behaviour), Centre for Ecological ...
4    PhD (Dominance Hierarchy in a social wasp), In...
Name: text, dtype: object

🎯 Accuracy: 0.6129032258064516

📊 Classification Report:
                           precision    recall  f1-score   support

    faculty_with_phd.csv      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
