In [3]:
from neo4j import GraphDatabase
import pandas as pd
import numpy as np

uri = "bolt://localhost:7687"
user = "neo4j"
password = "12345678"

driver = GraphDatabase.driver(uri, auth=(user, password))

query = """
MATCH (a:Author)
WHERE a.n2vEmbedding is not null AND a.domain is not null
RETURN id(a) AS id, a.n2vEmbedding AS features, a.domain AS label
"""

with driver.session() as session:
    results = session.run(query)
    records = [(r["id"], r["features"], r["label"]) for r in results]

driver.close()

# Convert to DataFrame
df = pd.DataFrame(records, columns=["id", "features", "label"])
df["features"] = df["features"].apply(np.array)

# Expand features to separate columns
features_df = pd.DataFrame(df["features"].tolist())
full_df = pd.concat([df[["id", "label"]], features_df], axis=1)

print(full_df.head())




   id              label         0         1         2         3         4  \
0   0           Medicine  0.003019  0.002637  0.002523  0.002926 -0.001466   
1   1  Political Science  0.272338 -0.174784 -0.132844  0.018287  0.046531   
2   2  Political Science  0.311223 -0.192921 -0.083063  0.056526  0.014732   
3   3  Political Science  0.255202 -0.145371 -0.087290 -0.005230  0.045122   
4   4          Sociology  0.003016  0.000123 -0.003092 -0.003878  0.002291   

          5         6         7  ...       118       119       120       121  \
0  0.000360 -0.002579  0.001358  ...  0.000762  0.000124  0.000113  0.003515   
1 -0.416449 -0.178818  1.695201  ...  0.428169  0.165190 -0.736638 -0.333500   
2 -0.370717 -0.212007  1.583922  ...  0.413350  0.182450 -0.687881 -0.362872   
3 -0.344951 -0.169225  1.385602  ...  0.357039  0.142919 -0.599032 -0.286520   
4 -0.002050 -0.000125 -0.000048  ... -0.002238 -0.002664  0.000298  0.003789   

        122       123       124       125       12

In [None]:
# Define a mapping of specific labels to broader categories
group_map = {
    "Art": "Humanities",
    "Philosophy": "Humanities",
    "History": "Humanities",
    "Political Science": "Social Sciences",
    "Sociology": "Social Sciences",
    "Psychology": "Social Sciences",
    "Economics": "Social Sciences",
    "Business": "Social Sciences",
    "Biology": "Natural Sciences",
    "Medicine": "Life Sciences",
    "Engineering": "Applied Sciences",
    "Computer Science": "Applied Sciences",
    "Geography": "Natural Sciences",
    "Materials Science": "Applied Sciences",
    "Mathematics": "Formal Sciences"
}

# Apply the mapping
full_df["label_grouped"] = full_df["label"].map(group_map).fillna(full_df["label"])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Drop unwanted columns and define X and y using grouped labels
X = full_df.drop(columns=["id", "label", "label_grouped"])
y = full_df["label_grouped"]

label_counts = y.value_counts()
valid_labels = label_counts[label_counts >= 2].index

filtered_df = full_df[full_df["label_grouped"].isin(valid_labels)]
X = filtered_df.drop(columns=["id", "label", "label_grouped"])
y = filtered_df["label_grouped"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



                  precision    recall  f1-score   support

Applied Sciences       1.00      0.12      0.22         8
      Humanities       0.00      0.00      0.00        31
   Life Sciences       0.94      0.69      0.80      1177
Natural Sciences       0.80      0.22      0.34        91
 Social Sciences       0.90      0.99      0.94      4124

        accuracy                           0.90      5431
       macro avg       0.73      0.40      0.46      5431
    weighted avg       0.90      0.90      0.89      5431



In [7]:
from imblearn.over_sampling import RandomOverSampler

X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(X, y)

# Now train-test split on balanced data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, stratify=y_resampled, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


                  precision    recall  f1-score   support

Applied Sciences       1.00      1.00      1.00      4124
         Geology       1.00      1.00      1.00      4124
      Humanities       1.00      1.00      1.00      4124
   Life Sciences       0.97      1.00      0.98      4124
Natural Sciences       1.00      1.00      1.00      4124
         Physics       1.00      1.00      1.00      4124
 Social Sciences       1.00      0.97      0.98      4124

        accuracy                           1.00     28868
       macro avg       1.00      1.00      1.00     28868
    weighted avg       1.00      1.00      1.00     28868



In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train-test split (after resampling)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, stratify=y_resampled, test_size=0.2, random_state=42
)

# Train Logistic Regression
log_reg = LogisticRegression(
    multi_class="multinomial",  # for softmax behavior
    solver="lbfgs",             # efficient for multinomial
    max_iter=1000,
    class_weight="balanced"     # helps with any residual imbalance
)
log_reg.fit(X_train, y_train)

# Evaluate
y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))




                  precision    recall  f1-score   support

Applied Sciences       0.80      0.61      0.69      4124
         Geology       1.00      1.00      1.00      4124
      Humanities       0.44      0.96      0.60      4124
   Life Sciences       0.52      0.45      0.48      4124
Natural Sciences       0.61      0.56      0.58      4124
         Physics       1.00      1.00      1.00      4124
 Social Sciences       0.43      0.13      0.20      4124

        accuracy                           0.67     28868
       macro avg       0.69      0.67      0.65     28868
    weighted avg       0.69      0.67      0.65     28868



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

le = LabelEncoder()
y_encoded = le.fit_transform(y_resampled)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42
)

xgb_clf = XGBClassifier(
    objective="multi:softmax",
    num_class=len(le.classes_),
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)
xgb_clf.fit(X_train, y_train)

y_pred_encoded = xgb_clf.predict(X_test)
y_pred = le.inverse_transform(y_pred_encoded)
y_test_str = le.inverse_transform(y_test)

print(classification_report(y_test_str, y_pred))



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                  precision    recall  f1-score   support

Applied Sciences       1.00      1.00      1.00      4124
         Geology       1.00      1.00      1.00      4124
      Humanities       1.00      1.00      1.00      4124
   Life Sciences       0.95      0.97      0.96      4124
Natural Sciences       1.00      1.00      1.00      4124
         Physics       1.00      1.00      1.00      4124
 Social Sciences       0.97      0.94      0.96      4124

        accuracy                           0.99     28868
       macro avg       0.99      0.99      0.99     28868
    weighted avg       0.99      0.99      0.99     28868

