In [1]:
!pip install -q gdown

In [2]:
from pathlib import Path
import re
import os
from typing import List, Dict, Any
import csv
import shutil
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from collections import defaultdict, Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
import gdown
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from google.colab import drive

drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
folder_url = "https://drive.google.com/drive/folders/1vx3IolDNtoBaU_TM3uUHGXN6QQqOkd9v?usp=drive_link"

local_root = "/content/review_data"

gdown.download_folder(folder_url, output=local_root, quiet=False)
print(os.listdir(local_root))

Retrieving folder contents


Processing file 17Kwr-R3cz1DdE371O3nckdhs6CqvARtr new_list.csv
Processing file 1iG2B03nWxsixhWjw5p2gi6d5_TDCpx24 old_list.csv


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=17Kwr-R3cz1DdE371O3nckdhs6CqvARtr
To: /content/review_data/new_list.csv
100%|██████████| 1.66M/1.66M [00:00<00:00, 126MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iG2B03nWxsixhWjw5p2gi6d5_TDCpx24
To: /content/review_data/old_list.csv
100%|██████████| 479k/479k [00:00<00:00, 87.1MB/s]

['Ai_literacy_notebook.ipynb', 'old_list.csv', 'new_list.csv']



Download completed


In [8]:
OLD_FILE = os.path.join(local_root, "old_list.csv")
NEW_FILE = os.path.join(local_root, "new_list.csv")

In [9]:
OLD_FILE,NEW_FILE

('/content/review_data/old_list.csv', '/content/review_data/new_list.csv')

### File structure

**Types of review variables**

We want to treat each of the types of review as a different variable. Right we have done that for "systematic" and "meta analysis". Let's add the other types of reviews. I think the easiest way to do this for now is to remove the word "review" and just look at the key words:

*title keywords*: "systematic", "scoping", "survey", "overview", "summary" (and meta analysis, written both ways)

We can just check within the titles for these terms (for now). I see you've made these counts / tables below, so let's do the following for the whole set of terms:

*   N times each term appears in the titles
*   plot / table of term frequency by year

**Types of AI variables**

Let's make variables for different types of AI. Three variables (but multiple keywords in each variable). One variable for "AIEd" or "AI-ED"; one variable for "generative artificial intelligence" or " or "gen-AI" or "generative AI"; and one variable for just AI and "artifificial intelligence" (but neither of the others two terms). Let's make variables for each both titles and abtracts respectively.
It would be nice to see

* N times each of the 3 term clusters terms appears in the titles
* N times each term apperas in the abstracts
* plot / table of term frequency by year (title alone, abstract alone, and occuring in either title or abstract)
* We can try treating the three clusters as subsets that we want to see if there are "differences" between looking at the TF IDF, cosine, and other NLP comparisons on

**Excluding search terms**

I like your idea of manually defining keywords and excluding them in the TF IDF, this let's us see "what else" are the main topics. (Because of course the terms we said had to be included would be some of the top terms.)

Let's try and exclude all these terms, in addition to the review type variables: "literacy" "educat*" "student*" "school" "learner" "learning outcomes"

I don't know if the wild card operator * works in python. If not, we can exclude "education" and "student" "students"

Let's see what the TF IDF and also clustering approaches you're doing look like excluding these (and we can compare to the results we got not excluding them).

In [50]:
COL_KEY = "Key"
COL_DOI = "DOI"
COL_YEAR = "Publication Year"
COL_TITLE = "Title"
COL_ABSTRACT = "Abstract Note"

# keywords: "systematic", "scoping", "survey", "overview", "summary"
# each of these key words is their own variable

# other keywords: (these are relevant to excluding in excluded TF IDF later on, but don't need to be compared in analyses in titles / abstracts with the above terms)
# other: "AIEd" , "AI" "artificial intelligence"
# other: "literacy" "educat*" "student*" "school" "learner" "learning outcomes"

KEYWORDS = {
    "systematic_review": ["systematic"],
    "scoping": ["scoping"],
    "survey": ["survey"],
    "overview": ["overview"],
    "summary": ["summary"],
    "meta_analysis": ["meta-analysis", "meta analysis"] # keep this as is
}

AI_KEYWORDS = {
    "Ai_Ed": ["AIEd", "Ai Ed"],
    "AI": ["Ai", "Artificial Intelligence"],
    "gen_AI": ["generative artificial intelligence", "gen-AI", "genAI", "generative AI"]
}

ALL_KEYWORDS = {}
ALL_KEYWORDS.update(KEYWORDS)
ALL_KEYWORDS.update(AI_KEYWORDS)

### Merge files and duplicate check

In [41]:
def load_and_merge(old_file, new_file):
    df_old = pd.read_csv(old_file)
    df_new = pd.read_csv(new_file)

    df_old["source_list"] = "old"
    df_new["source_list"] = "new"


    cols_to_keep = [COL_KEY, COL_DOI, COL_YEAR, COL_TITLE, COL_ABSTRACT, "source_list"]
    df_old = df_old[cols_to_keep]
    df_new = df_new[cols_to_keep]

    df = pd.concat([df_old, df_new], ignore_index=True)

    return df

df = load_and_merge(OLD_FILE, NEW_FILE)

In [42]:
dup_keys = df[COL_KEY][df[COL_KEY].duplicated()]
if not dup_keys.empty:
    print("Duplicated Keys found")
    print(dup_keys.head())
else:
    print("Only unique Keys")

Only unique Keys


### simple text cleaning

In [43]:
def basic_clean(text: str) -> str:
    if not isinstance(text, str):
        return ""

    text = text.lower()

    text = re.sub(r"\s+", " ", text) # whitespace

    text = re.sub(r"[^\w\s]", " ", text) # punctuation

    text = re.sub(r"\d+", " ", text) #numbers

    text = re.sub(r"[^a-z\s]", " ", text)

    text = text.strip()
    return text


def add_clean_text_columns(df: pd.DataFrame) -> pd.DataFrame:
    df["title_clean"] = df[COL_TITLE].apply(basic_clean)
    df["abstract_clean"] = df[COL_ABSTRACT].apply(basic_clean)
    return df

df = add_clean_text_columns(df)

In [44]:
def create_keyword_features(df: pd.DataFrame, keywords_dict: dict) -> pd.DataFrame:
    """
    For each keyword group:
      - indicator: whether any of the phrases appears in title/abstract
      - count: total occurrences in title/abstract
    """
    for label, phrase_list in keywords_dict.items():
        # Title features
        ind_col_title = f"has_{label}_title"
        cnt_col_title = f"count_{label}_title"

        # Abstract features
        ind_col_abs = f"has_{label}_abstract"
        cnt_col_abs = f"count_{label}_abstract"

        def count_occurrences(text: str, phrases) -> int:
            text = text.lower()
            total = 0
            for p in phrases:
                # substring count
                total += text.count(p.lower())
            return total

        df[cnt_col_title] = df["title_clean"].apply(lambda t: count_occurrences(t, phrase_list))
        df[cnt_col_abs] = df["abstract_clean"].apply(lambda t: count_occurrences(t, phrase_list))

        df[ind_col_title] = (df[cnt_col_title] > 0).astype(int)
        df[ind_col_abs] = (df[cnt_col_abs] > 0).astype(int)

    return df

df = create_keyword_features(df, KEYWORDS)
df[:5]

Unnamed: 0,Key,DOI,Publication Year,Title,Abstract Note,source_list,title_clean,abstract_clean,count_systematic_review_title,count_systematic_review_abstract,...,has_overview_title,has_overview_abstract,count_summary_title,count_summary_abstract,has_summary_title,has_summary_abstract,count_meta_analysis_title,count_meta_analysis_abstract,has_meta_analysis_title,has_meta_analysis_abstract
0,RIMXHGKB,10.1016/j.caeo.2024.100173,2024.0,A systematic review of AI literacy conceptuali...,The explosion of AI across all facets of socie...,old,a systematic review of ai literacy conceptuali...,the explosion of ai across all facets of socie...,1,2,...,0,0,0,0,0,0,0,0,0,0
1,BKWIU8LW,10.1108/EJTD-09-2021-0143,2023.0,Artificial intelligence in learning and develo...,Purpose The presented research explored artifi...,old,artificial intelligence in learning and develo...,purpose the presented research explored artifi...,1,3,...,0,0,0,0,0,0,0,0,0,0
2,JPMFG28C,10.1016/j.eswa.2024.124167,2024.0,Artificial intelligence in education: A system...,Artificial intelligence (AI) in education (AIE...,old,artificial intelligence in education a system...,artificial intelligence ai in education aie...,1,0,...,0,0,0,0,0,0,0,0,0,0
3,768ETFLW,10.1080/09523987.2023.2264990,2023.0,The use of Artificial intelligence in school s...,Artificial Intelligence is widely used across ...,old,the use of artificial intelligence in school s...,artificial intelligence is widely used across ...,1,1,...,0,0,0,0,0,0,0,0,0,0
4,MJDGH7HH,10.29333/iejme/12132,2022.0,Artificial intelligence in mathematics educati...,The advancement of technology like artificial ...,old,artificial intelligence in mathematics educati...,the advancement of technology like artificial ...,1,2,...,0,1,0,0,0,0,0,0,0,0


In [None]:
#df.to_csv("simple_cleaned.csv")

### features on the main keywords

In [16]:
review_labels = list(KEYWORDS.keys())

title_term_counts = (
    pd.Series(
        {
            label: df[f"count_{label}_title"].sum()
            for label in review_labels
        }
    )
    .rename("n_occurrences_in_titles")
    .sort_values(ascending=False)
)

title_term_counts.to_frame()

Unnamed: 0,n_occurrences_in_titles
systematic_review,652
scoping,153
meta_analysis,87
survey,5
overview,2
summary,0


In [18]:
abs_term_counts = (
    pd.Series(
        {
            label: df[f"count_{label}_abstract"].sum()
            for label in review_labels
        }
    )
    .rename("n_occurrences_in_abstract")
    .sort_values(ascending=False)
)

abs_term_counts.to_frame()

Unnamed: 0,n_occurrences_in_abstract
systematic_review,944
scoping,202
meta_analysis,165
overview,78
survey,44
summary,12


In [19]:
labels = list(KEYWORDS.keys()) #all keywords group labels

df["keyword_groups_title_list"] = df.apply(
    lambda row: [
        label
        for label in labels
        if row[f"has_{label}_title"] == 1
    ],
    axis=1,
)

df["keyword_groups_abstract_list"] = df.apply(
    lambda row: [
        label
        for label in labels
        if row[f"has_{label}_abstract"] == 1
    ],
    axis=1,
)

df["keyword_groups_any_list"] = df.apply( # kewwords in either abstract or title
    lambda row: [
        label
        for label in labels
        if (row[f"has_{label}_title"] == 1) or (row[f"has_{label}_abstract"] == 1)
    ],
    axis=1,
)

df["n_keyword_groups_title"] = df["keyword_groups_title_list"].str.len()
df["n_keyword_groups_abstract"] = df["keyword_groups_abstract_list"].str.len()
df["n_keyword_groups_any"] = df["keyword_groups_any_list"].str.len()

count_cols_title = [f"count_{label}_title" for label in labels]
count_cols_abs = [f"count_{label}_abstract" for label in labels]

df["n_phrase_matches_title"] = df[count_cols_title].sum(axis=1)
df["n_phrase_matches_abstract"] = df[count_cols_abs].sum(axis=1)

cols_to_show = [
    COL_KEY,                    # ID
    "keyword_groups_title_list",
    "keyword_groups_abstract_list",
    "keyword_groups_any_list",
    "n_keyword_groups_any",
]

df[cols_to_show].head(5)


Unnamed: 0,Key,keyword_groups_title_list,keyword_groups_abstract_list,keyword_groups_any_list,n_keyword_groups_any
0,RIMXHGKB,[systematic_review],[systematic_review],[systematic_review],1
1,BKWIU8LW,[systematic_review],[systematic_review],[systematic_review],1
2,JPMFG28C,[systematic_review],[],[systematic_review],1
3,768ETFLW,[systematic_review],[systematic_review],[systematic_review],1
4,MJDGH7HH,[systematic_review],"[systematic_review, overview]","[systematic_review, overview]",2


In [20]:
has_any = df["n_keyword_groups_any"] > 0 # rows that have at least one phrase

df_has_any = df.loc[has_any, [COL_KEY, "keyword_groups_any_list", "n_keyword_groups_any"]]
df_has_any.head(10)


Unnamed: 0,Key,keyword_groups_any_list,n_keyword_groups_any
0,RIMXHGKB,[systematic_review],1
1,BKWIU8LW,[systematic_review],1
2,JPMFG28C,[systematic_review],1
3,768ETFLW,[systematic_review],1
4,MJDGH7HH,"[systematic_review, overview]",2
5,EIGCHAD8,[systematic_review],1
6,PPKZ2FMT,[systematic_review],1
7,UE74ARX4,[systematic_review],1
8,JFVAZHDN,"[systematic_review, scoping, meta_analysis]",3
9,ME9C3EKY,[systematic_review],1


### text cleaning and lemmatizing with spacy

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [21]:
import unicodedata
import spacy

nlp = spacy.load("en_core_web_sm")

def spacy_clean(text: str, remove_stopwords: bool = True) -> str:
    if not isinstance(text, str):
        if text is None:
            text = ""
        else:
            text = str(text)

    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"\S+@\S+", " ", text)

    doc = nlp(text)

    lemmas = []
    for tok in doc:
        # only alphabetic tokens
        if not tok.is_alpha:
            continue

        # skip stopwords
        if remove_stopwords and tok.is_stop:
            continue

        lemma = tok.lemma_.lower().strip()
        if lemma:
            lemmas.append(lemma)

    return " ".join(lemmas)


In [22]:
df["title_clean"] = df["Title"].apply(spacy_clean)
df["abstract_clean"] = df["Abstract Note"].apply(spacy_clean)


In [None]:
df[:5]

Unnamed: 0,Key,DOI,Publication Year,Title,Abstract Note,source_list,title_clean,abstract_clean,count_systematic_review_title,count_systematic_review_abstract,...,has_meta_analysis_title,has_meta_analysis_abstract,keyword_groups_title_list,keyword_groups_abstract_list,keyword_groups_any_list,n_keyword_groups_title,n_keyword_groups_abstract,n_keyword_groups_any,n_phrase_matches_title,n_phrase_matches_abstract
0,RIMXHGKB,10.1016/j.caeo.2024.100173,2024.0,A systematic review of AI literacy conceptuali...,The explosion of AI across all facets of socie...,old,systematic review ai literacy conceptualizatio...,explosion ai facet society give rise need ai e...,1,2,...,0,0,[systematic_review],[systematic_review],[systematic_review],1,1,1,1,2
1,BKWIU8LW,10.1108/EJTD-09-2021-0143,2023.0,Artificial intelligence in learning and develo...,Purpose The presented research explored artifi...,old,artificial intelligence learning development s...,purpose present research explore artificial in...,1,1,...,0,0,[systematic_review],[systematic_review],[systematic_review],1,1,1,1,1
2,JPMFG28C,10.1016/j.eswa.2024.124167,2024.0,Artificial intelligence in education: A system...,Artificial intelligence (AI) in education (AIE...,old,artificial intelligence education systematic l...,artificial intelligence ai education aied evol...,1,0,...,0,0,[systematic_review],[],[systematic_review],1,0,1,1,0
3,768ETFLW,10.1080/09523987.2023.2264990,2023.0,The use of Artificial intelligence in school s...,Artificial Intelligence is widely used across ...,old,use artificial intelligence school science sys...,artificial intelligence widely contexts differ...,1,1,...,0,0,[systematic_review],[systematic_review],[systematic_review],1,1,1,1,1
4,MJDGH7HH,10.29333/iejme/12132,2022.0,Artificial intelligence in mathematics educati...,The advancement of technology like artificial ...,old,artificial intelligence mathematics education ...,advancement technology like artificial intelli...,1,2,...,0,0,[systematic_review],[systematic_review],[systematic_review],1,1,1,1,2


In [None]:
#df.to_csv("spacy_cleaned.csv")

### AI keywords

In [46]:
df.columns

Index(['Key', 'DOI', 'Publication Year', 'Title', 'Abstract Note',
       'source_list', 'title_clean', 'abstract_clean',
       'count_systematic_review_title', 'count_systematic_review_abstract',
       'has_systematic_review_title', 'has_systematic_review_abstract',
       'count_scoping_title', 'count_scoping_abstract', 'has_scoping_title',
       'has_scoping_abstract', 'count_survey_title', 'count_survey_abstract',
       'has_survey_title', 'has_survey_abstract', 'count_overview_title',
       'count_overview_abstract', 'has_overview_title',
       'has_overview_abstract', 'count_summary_title',
       'count_summary_abstract', 'has_summary_title', 'has_summary_abstract',
       'count_meta_analysis_title', 'count_meta_analysis_abstract',
       'has_meta_analysis_title', 'has_meta_analysis_abstract'],
      dtype='object')

In [48]:
df = create_keyword_features(df, AI_KEYWORDS)
df.columns

Index(['Key', 'DOI', 'Publication Year', 'Title', 'Abstract Note',
       'source_list', 'title_clean', 'abstract_clean',
       'count_systematic_review_title', 'count_systematic_review_abstract',
       'has_systematic_review_title', 'has_systematic_review_abstract',
       'count_scoping_title', 'count_scoping_abstract', 'has_scoping_title',
       'has_scoping_abstract', 'count_survey_title', 'count_survey_abstract',
       'has_survey_title', 'has_survey_abstract', 'count_overview_title',
       'count_overview_abstract', 'has_overview_title',
       'has_overview_abstract', 'count_summary_title',
       'count_summary_abstract', 'has_summary_title', 'has_summary_abstract',
       'count_meta_analysis_title', 'count_meta_analysis_abstract',
       'has_meta_analysis_title', 'has_meta_analysis_abstract',
       'count_Ai_Ed_title', 'count_Ai_Ed_abstract', 'has_Ai_Ed_title',
       'has_Ai_Ed_abstract', 'count_AI_title', 'count_AI_abstract',
       'has_AI_title', 'has_AI_abstract',

In [51]:
for part in ["title", "abstract"]:
    has_ai        = df[f"has_AI_{part}"]
    has_aied      = df[f"has_Ai_Ed_{part}"]
    has_gen_ai    = df[f"has_gen_AI_{part}"]

    plain_col     = f"has_AI_plain_{part}"
    plain_count   = f"count_AI_plain_{part}"

    df[plain_col] = ((has_ai == 1) & (has_aied == 0) & (has_gen_ai == 0)).astype(int)
    df[plain_count] = df[f"count_AI_{part}"] * df[plain_col]

ai_clusters = ["Ai_Ed", "gen_AI", "AI_plain"]

In [53]:
review_labels = list(AI_KEYWORDS.keys())

review_title_doc_counts = (
    pd.Series(
        {label: int(df[f"has_{label}_title"].sum()) for label in review_labels}
    )
    .rename("n_titles_with_term")
    .sort_values(ascending=False)
)

review_title_doc_counts.to_frame()


Unnamed: 0,n_titles_with_term
AI,955
gen_AI,147
Ai_Ed,6


In [54]:
review_abs_doc_counts = (
    pd.Series(
        {label: int(df[f"has_{label}_abstract"].sum()) for label in review_labels}
    )
    .rename("n_abstracts_with_term")
    .sort_values(ascending=False)
)
review_abs_doc_counts.to_frame()

Unnamed: 0,n_abstracts_with_term
AI,980
gen_AI,202
Ai_Ed,80


In [57]:
# total occurences of the terms within title and abstract

review_title_occ_counts = (
    pd.Series(
        {label: int(df[f"count_{label}_title"].sum()) for label in review_labels}
    )
    .rename("n_occurrences_in_titles")
    .sort_values(ascending=False)
)

review_abs_occ_counts = (
    pd.Series(
        {label: int(df[f"count_{label}_abstract"].sum()) for label in review_labels}
    )
    .rename("n_occurrences_in_abstracts")
    .sort_values(ascending=False)
)

review_title_occ_counts.to_frame()

Unnamed: 0,n_occurrences_in_titles
AI,1109
gen_AI,161
Ai_Ed,6


In [58]:
review_abs_occ_counts.to_frame()

Unnamed: 0,n_occurrences_in_abstracts
AI,9215
gen_AI,855
Ai_Ed,229


In [59]:
# term frequency by year in title and in abstract
review_year_title_counts = (
    df
    .groupby(COL_YEAR)[[f"has_{label}_title" for label in review_labels]]
    .sum()
    .rename_axis("year")
    .reset_index()
)

review_year_title_counts


Unnamed: 0,year,has_Ai_Ed_title,has_AI_title,has_gen_AI_title
0,1990.0,0,1,0
1,2016.0,0,1,0
2,2019.0,0,3,0
3,2020.0,0,4,0
4,2021.0,0,15,0
5,2022.0,0,45,0
6,2023.0,1,102,6
7,2024.0,4,272,41
8,2025.0,1,493,99
9,2026.0,0,8,1


In [60]:
review_year_abstract_counts = (
    df
    .groupby(COL_YEAR)[[f"has_{label}_abstract" for label in review_labels]]
    .sum()
    .rename_axis("year")
    .reset_index()
)

review_year_abstract_counts


Unnamed: 0,year,has_Ai_Ed_abstract,has_AI_abstract,has_gen_AI_abstract
0,1990.0,0,1,0
1,2016.0,1,1,0
2,2019.0,1,3,0
3,2020.0,1,5,0
4,2021.0,1,18,0
5,2022.0,7,50,0
6,2023.0,13,110,8
7,2024.0,33,280,55
8,2025.0,21,501,138
9,2026.0,1,8,1


In [62]:
review_year_title_long = (
    review_year_title_counts
    .melt(id_vars="year", var_name="term", value_name="n_titles")
)

review_year_title_long["term"] = (
    review_year_title_long["term"]
    .str.replace(r"^has_", "", regex=True)
    .str.replace(r"_title$", "", regex=True)
)

fig = px.line(
    review_year_title_long,
    x="year",
    y="n_titles",
    color="term",
    markers=True,
    title="Review-related keywords in titles by year",
)
fig.update_layout(
    xaxis_title="Publication year",
    yaxis_title="Number of titles with term",
    legend_title="Term",
)
fig.show()


In [63]:
# N times each cluster appears in titles or abstracts, contain the cluster at least once
ai_title_doc_counts = pd.Series(
    {
        "Ai_Ed":    int(df["has_Ai_Ed_title"].sum()),
        "gen_AI":   int(df["has_gen_AI_title"].sum()),
        "AI_plain": int(df["has_AI_plain_title"].sum()),
    },
    name="n_titles_with_cluster"
).sort_values(ascending=False)

ai_abstract_doc_counts = pd.Series(
    {
        "Ai_Ed":    int(df["has_Ai_Ed_abstract"].sum()),
        "gen_AI":   int(df["has_gen_AI_abstract"].sum()),
        "AI_plain": int(df["has_AI_plain_abstract"].sum()),
    },
    name="n_abstracts_with_cluster"
).sort_values(ascending=False)

ai_title_doc_counts.to_frame(), ai_abstract_doc_counts.to_frame()


(          n_titles_with_cluster
 AI_plain                    802
 gen_AI                      147
 Ai_Ed                         6,
           n_abstracts_with_cluster
 AI_plain                       706
 gen_AI                         202
 Ai_Ed                           80)

In [64]:
# total occurences
ai_title_occ_counts = pd.Series(
    {
        "Ai_Ed":    int(df["count_Ai_Ed_title"].sum()),
        "gen_AI":   int(df["count_gen_AI_title"].sum()),
        "AI_plain": int(df["count_AI_plain_title"].sum()),
    },
    name="n_occurrences_in_titles"
).sort_values(ascending=False)

ai_abstract_occ_counts = pd.Series(
    {
        "Ai_Ed":    int(df["count_Ai_Ed_abstract"].sum()),
        "gen_AI":   int(df["count_gen_AI_abstract"].sum()),
        "AI_plain": int(df["count_AI_plain_abstract"].sum()),
    },
    name="n_occurrences_in_abstracts"
).sort_values(ascending=False)
ai_title_occ_counts.to_frame(), ai_abstract_occ_counts.to_frame()

(          n_occurrences_in_titles
 AI_plain                      925
 gen_AI                        161
 Ai_Ed                           6,
           n_occurrences_in_abstracts
 AI_plain                        6305
 gen_AI                           855
 Ai_Ed                            229)

In [65]:
for cluster in ai_clusters:
    base = cluster
    df[f"has_{cluster}_either"] = (
        (df[f"has_{base}_title"] == 1) |
        (df[f"has_{base}_abstract"] == 1)
    ).astype(int)

ai_year_title_counts = (
    df
    .groupby(COL_YEAR)[[f"has_{c}_title" for c in ai_clusters]]
    .sum()
    .rename_axis("year")
    .reset_index()
)

ai_year_abstract_counts = (
    df
    .groupby(COL_YEAR)[[f"has_{c}_abstract" for c in ai_clusters]]
    .sum()
    .rename_axis("year")
    .reset_index()
)

ai_year_either_counts = (
    df
    .groupby(COL_YEAR)[[f"has_{c}_either" for c in ai_clusters]]
    .sum()
    .rename_axis("year")
    .reset_index()
)


In [66]:
ai_year_either_long = (
    ai_year_either_counts
    .melt(id_vars="year", var_name="cluster", value_name="n_papers")
)

ai_year_either_long["cluster"] = (
    ai_year_either_long["cluster"]
    .str.replace(r"^has_", "", regex=True)
    .str.replace(r"_either$", "", regex=True)
)

fig = px.line(
    ai_year_either_long,
    x="year",
    y="n_papers",
    color="cluster",
    markers=True,
    title="AI cluster frequency by year (title OR abstract)",
)
fig.update_layout(
    xaxis_title="Publication year",
    yaxis_title="Number of papers",
    legend_title="AI cluster",
)
fig.show()


### TF-IDF

In [67]:
for k in [1000, 3000, 5000, 10000]:
    vec = TfidfVectorizer(max_features=k, stop_words="english")
    X = vec.fit_transform(df["abstract_clean"].fillna(""))
    print(k, X.shape)

1000 (996, 1000)
3000 (996, 3000)
5000 (996, 5000)
10000 (996, 7535)


In [68]:
def build_tfidf(df: pd.DataFrame, text_col: str, max_features: int = 5000):
    """
    TF-IDF matrix for a given text column.
    Returns: vectorizer, tfidf_matrix
    """
    corpus = df[text_col].fillna("").tolist()
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words="english",  # change for non-English
    )
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return vectorizer, tfidf_matrix


def print_top_terms(vectorizer, tfidf_matrix, n_terms=20, label=""):
    feature_names = vectorizer.get_feature_names_out()
    mean_scores = tfidf_matrix.mean(axis=0).A1  # average across documents
    top_indices = np.argsort(mean_scores)[::-1][:n_terms]

    print(f"\nTop {n_terms} TF-IDF terms for {label}:")
    for idx in top_indices:
        print(f"{feature_names[idx]:20s}  {mean_scores[idx]:.4f}")


def keyword_summary(df: pd.DataFrame, keywords_dict: dict):
    """
    Frequency summaries for keyword indicators.
    """
    print("\n=== Keyword summaries ===")
    for label in keywords_dict.keys():
        for field in ["title", "abstract"]:
            col = f"has_{label}_{field}"
            if col in df.columns:
                freq = df[col].mean()
                print(f"{col:30s}  proportion={freq:.3f}")


def year_keyword_table(df: pd.DataFrame, label: str):
    col = f"has_{label}_abstract"
    if col not in df.columns:
        print(f"\nColumn {col} not found")
        return

    print(f"\n=== Year x {col} ===")
    print(pd.crosstab(df[COL_YEAR], df[col]))

vec_title, tfidf_title = build_tfidf(df, "title_clean", max_features=3000)
print_top_terms(vec_title, tfidf_title, n_terms=30, label="titles")


vec_abs, tfidf_abs = build_tfidf(df, "abstract_clean", max_features=5000)
print_top_terms(vec_abs, tfidf_abs, n_terms=40, label="abstracts")


Top 30 TF-IDF terms for titles:
review                0.0864
systematic            0.0810
artificial            0.0808
intelligence          0.0807
education             0.0803
ai                    0.0651
literature            0.0586
learning              0.0522
scoping               0.0382
higher                0.0361
generative            0.0356
analysis              0.0286
research              0.0239
meta                  0.0235
students              0.0230
language              0.0227
impact                0.0207
teaching              0.0207
use                   0.0205
based                 0.0204
applications          0.0194
challenges            0.0190
educational           0.0187
future                0.0158
medical               0.0150
literacy              0.0148
integration           0.0128
role                  0.0126
trends                0.0126
student               0.0122

Top 40 TF-IDF terms for abstracts:
ai                    0.1080
learning              0.0586
edu

In [69]:
keyword_summary(df, KEYWORDS)


=== Keyword summaries ===
has_systematic_review_title     proportion=0.653
has_systematic_review_abstract  proportion=0.687
has_scoping_title               proportion=0.154
has_scoping_abstract            proportion=0.142
has_survey_title                proportion=0.005
has_survey_abstract             proportion=0.032
has_overview_title              proportion=0.002
has_overview_abstract           proportion=0.073
has_summary_title               proportion=0.000
has_summary_abstract            proportion=0.011
has_meta_analysis_title         proportion=0.087
has_meta_analysis_abstract      proportion=0.108


In [70]:
year_keyword_table(df, "systematic_review")


=== Year x has_systematic_review_abstract ===
has_systematic_review_abstract    0    1
Publication Year                        
1990.0                            1    0
2016.0                            1    0
2019.0                            1    2
2020.0                            2    4
2021.0                            6   12
2022.0                           22   29
2023.0                           39   71
2024.0                           94  191
2025.0                          135  366
2026.0                            1    7


In [71]:
import plotly.express as px
import plotly.graph_objects as go

def plot_keyword_proportions(df: pd.DataFrame, keywords_dict: dict):
    """
    Grouped bar chart: proportion of articles that contain each keyword group
    in title vs abstract.
    """
    rows = []

    for label in keywords_dict.keys():
        for field in ["title", "abstract"]:
            col = f"has_{label}_{field}"
            if col in df.columns:
                proportion = df[col].mean()  # mean of 0/1 = proportion
                rows.append(
                    {
                        "keyword_group": label,
                        "field": field,
                        "proportion": proportion,
                    }
                )

    plot_df = pd.DataFrame(rows)
    if plot_df.empty:
        print("No keyword columns found to plot.")
        return

    fig = px.bar(
        plot_df,
        x="keyword_group",
        y="proportion",
        color="field",
        barmode="group",
        title="Proportion of articles containing each keyword group\n(in title vs abstract)",
    )
    fig.update_layout(
        xaxis_title="Keyword group",
        yaxis_title="Proportion of articles",
        legend_title="Field",
    )
    fig.show()


plot_keyword_proportions(df, KEYWORDS)

In [28]:
# build tokens from keyword phrases
def extract_keyword_tokens(keywords_dict):
    """
    Take KEYWORDS and return a set of individual lowercase tokens.
    """
    tokens = set()
    for phrases in keywords_dict.values():
        for phrase in phrases:
            # basic tokenization: lowercase, split on non-word chars
            phrase = phrase.lower()
            for tok in re.split(r"\W+", phrase):
                if tok:
                    tokens.add(tok)
    return tokens

keyword_tokens = extract_keyword_tokens(KEYWORDS)
keyword_tokens


{'analysis', 'meta', 'overview', 'scoping', 'summary', 'survey', 'systematic'}

In [29]:
def top_terms_excluding_keywords(vectorizer, tfidf_matrix, keyword_tokens, n_terms=30):
    """
    Get top terms by mean TF-IDF, excluding tokens that are part
    of manual keyword list, and compute document frequency.

    Returns a DataFrame with:
        term        - the token
        doc_freq    - in how many documents the term appears
        mean_tfidf  - average TF-IDF score across all documents
    """
    feature_names = np.array(vectorizer.get_feature_names_out())

    mean_scores = tfidf_matrix.mean(axis=0).A1  # shape (n_terms,) : mean TF-IDF across documents for each term

    term_present = (tfidf_matrix > 0).astype(int) # document frequency
    doc_freq = np.asarray(term_present.sum(axis=0)).ravel()  # shape (n_terms,)

    mask = np.array([tok not in keyword_tokens for tok in feature_names]) # exculde keywords from dictionary

    feature_names_f = feature_names[mask]
    mean_scores_f = mean_scores[mask]
    doc_freq_f = doc_freq[mask]

    top_idx = np.argsort(mean_scores_f)[::-1][:n_terms] # sort desc

    return pd.DataFrame({
        "term": feature_names_f[top_idx],
        "doc_freq": doc_freq_f[top_idx],
        "mean_tfidf": mean_scores_f[top_idx],
    })

top_abstract_terms = top_terms_excluding_keywords(
    vec_abs, tfidf_abs, keyword_tokens, n_terms=40
)
top_title_terms = top_terms_excluding_keywords(
    vec_title, tfidf_title, keyword_tokens, n_terms=40
)

print("=== Abstracts: top terms (excluding manual keywords) ===")
print(top_abstract_terms.head(20))

print("\n=== Titles: top terms (excluding manual keywords) ===")
print(top_title_terms.head(20))



=== Abstracts: top terms (excluding manual keywords) ===
            term  doc_freq  mean_tfidf
0             ai       831    0.112157
1      education       809    0.059538
2          study       818    0.051391
3         review       879    0.045142
4       research       721    0.044785
5       learning       628    0.044265
6        student       560    0.039637
7          learn       453    0.031935
8    educational       518    0.031520
9          genai        87    0.031461
10    technology       483    0.030350
11    artificial       793    0.027855
12  intelligence       797    0.027854
13          tool       387    0.027263
14      language       223    0.026273
15   application       421    0.025868
16          high       347    0.025453
17           use       393    0.024973
18    literature       514    0.024406
19       include       496    0.024279

=== Titles: top terms (excluding manual keywords) ===
            term  doc_freq  mean_tfidf
0         review       902    

In [30]:
def niche_terms(tfidf_matrix, vectorizer, min_docs=3, max_docs=20, top_n=30):
    feature_names = np.array(vectorizer.get_feature_names_out())
    term_present = (tfidf_matrix > 0).astype(int)
    doc_freq = np.array(term_present.sum(axis=0)).ravel()

    mean_scores = tfidf_matrix.mean(axis=0).A1

    mask = (doc_freq >= min_docs) & (doc_freq <= max_docs)

    feature_names_f = feature_names[mask]
    mean_scores_f = mean_scores[mask]
    doc_freq_f = doc_freq[mask]

    top_idx = np.argsort(mean_scores_f)[::-1][:top_n]

    return pd.DataFrame({
        "term": feature_names_f[top_idx],
        "doc_freq": doc_freq_f[top_idx],
        "mean_tfidf": mean_scores_f[top_idx],
    })

niche_terms_abs = niche_terms(tfidf_abs, vec_abs, min_docs=3, max_docs=20, top_n=30)
niche_terms_abs.head(20)


Unnamed: 0,term,doc_freq,mean_tfidf
0,srl,13,0.005565
1,disability,19,0.004909
2,radiology,14,0.004514
3,gamification,18,0.004333
4,financial,15,0.004315
5,declare,9,0.004226
6,surgical,16,0.004171
7,dental,13,0.004046
8,hcai,9,0.003757
9,gen,8,0.0036


In [31]:
def find_similar_articles(df, tfidf_matrix, key_value, top_n=10):
    """
    Given an article Key, find the most similar articles based on tfidf_matrix.
    Returns a DataFrame with similarity scores and keyword info.
    """
    # index of the target article
    idx = df.index[df["Key"] == key_value]
    if len(idx) == 0:
        raise ValueError(f"Key {key_value} not found.")
    idx = idx[0]

    sims = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten() # cosine similarity of this doc to all others

    order = sims.argsort()[::-1] # sort by similarity, drop itself
    order = [i for i in order if i != idx][:top_n]

    rows = []
    target_keywords = set(df.loc[idx, "keyword_groups_any_list"]) if "keyword_groups_any_list" in df.columns else set()

    for j in order:
        row = df.loc[j]
        kws = set(row.get("keyword_groups_any_list", []))
        shared_kws = target_keywords.intersection(kws) if target_keywords else set()

        rows.append({
            "Key": row["Key"],
            "Title": row["Title"],
            "similarity": sims[j],
            "keyword_groups_any_list": list(kws),
            "shared_keyword_groups": list(shared_kws),
        })

    return pd.DataFrame(rows)

example_key = df["Key"].iloc[0]

similar_df = find_similar_articles(df, tfidf_abs, example_key, top_n=10)
similar_df


Unnamed: 0,Key,Title,similarity,keyword_groups_any_list,shared_keyword_groups
0,DEJX8R4V,Unveiling AI literacy in K-12 education: a sys...,0.46567,[systematic_review],[systematic_review]
1,9MC974LD,Towards an AI-Literate Future: A Systematic Li...,0.439149,[systematic_review],[systematic_review]
2,ABL8SVZQ,A scoping review of empirical research on AI l...,0.435394,"[survey, scoping]",[]
3,7RSJ4M78,Artificial Intelligence Literacy Education: A ...,0.387857,[scoping],[]
4,YPTFEJTJ,A Scoping Review of AI Literacy for Teachers,0.383417,"[scoping, systematic_review]",[systematic_review]
5,FN3MBP9H,Assessing AI Literacy: A Systematic Review of ...,0.3497,[systematic_review],[systematic_review]
6,KBZMXJEU,Learning About AI: A Systematic Review of Revi...,0.337428,[systematic_review],[systematic_review]
7,AEFU487Z,AI Literacy in Adult Education - A Literature ...,0.329038,[systematic_review],[systematic_review]
8,CKTWGQAP,Toward Agency-Centered AI Literacy: A Scoping ...,0.326305,[scoping],[]
9,AJE7AGBW,AI literacy guidelines and policies for academ...,0.309531,[scoping],[]


In [32]:
from sklearn.cluster import KMeans

def cluster_articles(tfidf_matrix, n_clusters=8, random_state=42):
    km = KMeans(
        n_clusters=n_clusters,
        random_state=random_state,
        n_init="auto"
    )
    labels = km.fit_predict(tfidf_matrix)
    return km, labels

k = 8
kmeans_abs, cluster_labels = cluster_articles(tfidf_abs, n_clusters=k)

#df["cluster_abs"] = cluster_labels
#df["cluster_abs"].value_counts().sort_index()

def top_terms_per_cluster(kmeans, tfidf_matrix, vectorizer, n_terms=15):
    feature_names = np.array(vectorizer.get_feature_names_out())
    centers = kmeans.cluster_centers_  # (n_clusters, n_features)

    cluster_terms = {}
    for cluster_id, center in enumerate(centers):
        top_idx = np.argsort(center)[::-1][:n_terms]
        cluster_terms[cluster_id] = feature_names[top_idx].tolist()
    return cluster_terms

cluster_top_terms = top_terms_per_cluster(kmeans_abs, tfidf_abs, vec_abs, n_terms=15)

for cid, terms in cluster_top_terms.items():
    print(f"\nCluster {cid}:")
    print(", ".join(terms))



Cluster 0:
adaptive, question, item, ct, estimation, aqg, model, difficulty, test, dance, learning, cognitive, new, effect, computerize

Cluster 1:
genai, education, generative, study, tool, research, review, student, learn, use, thinking, high, learning, impact, skill

Cluster 2:
literacy, ai, education, review, research, study, assessment, teacher, scale, professional, student, competency, knowledge, base, ethic

Cluster 3:
ai, ethical, education, learning, challenge, educational, review, study, technology, research, student, privacy, potential, integration, high

Cluster 4:
study, ai, education, student, gai, learning, research, review, artificial, intelligence, effect, learn, educational, technology, analysis

Cluster 5:
language, ai, writing, chatgpt, efl, study, learning, tool, english, feedback, research, learner, student, review, education

Cluster 6:
ai, education, study, research, review, learning, student, medical, application, learn, educational, technology, teacher, use, 

### TF-IDF on genAI keywords

In [74]:
# Masks: any mention in title OR abstract
mask_aied     = ((df["has_Ai_Ed_title"] == 1) | (df["has_Ai_Ed_abstract"] == 1)).to_numpy()
mask_gen_ai   = ((df["has_gen_AI_title"] == 1) | (df["has_gen_AI_abstract"] == 1)).to_numpy()
mask_ai_plain = ((df["has_AI_plain_title"] == 1) | (df["has_AI_plain_abstract"] == 1)).to_numpy()

subset_mats = {
    "Ai_Ed":    tfidf_abs[mask_aied],
    "gen_AI":   tfidf_abs[mask_gen_ai],
    "AI_plain": tfidf_abs[mask_ai_plain],
}

group_means = {name: np.asarray(mat.mean(axis=0)) for name, mat in subset_mats.items()}

# Cosine similarity between the mean vectors
clusters = list(group_means.keys())
for i, a in enumerate(clusters):
    for b in clusters[i+1:]:
        sim = cosine_similarity(group_means[a], group_means[b])[0, 0]
        print(f"Cosine similarity between {a} and {b}: {sim:.3f}")

Cosine similarity between Ai_Ed and gen_AI: 0.651
Cosine similarity between Ai_Ed and AI_plain: 0.828
Cosine similarity between gen_AI and AI_plain: 0.781


In [75]:
feature_names = np.array(vec_abs.get_feature_names_out())

def top_terms_from_mean(mean_vec, feature_names, n=20):
    arr = np.asarray(mean_vec).ravel()
    top_idx = np.argsort(arr)[::-1][:n]
    return list(zip(feature_names[top_idx], arr[top_idx]))

for name, mean_vec in group_means.items():
    print(f"\nTop terms for {name}:")
    for term, weight in top_terms_from_mean(mean_vec, feature_names, n=20):
        print(f"{term:20s} {weight:.4f}")



Top terms for Ai_Ed:
ai                   0.1409
aied                 0.1352
education            0.0723
learning             0.0569
research             0.0522
educational          0.0380
knowledge            0.0362
students             0.0339
study                0.0339
literacy             0.0328
review               0.0317
teaching             0.0313
studies              0.0282
effect               0.0275
ethical              0.0272
reviews              0.0260
artificial           0.0260
intelligence         0.0259
analysis             0.0258
systematic           0.0254

Top terms for gen_AI:
genai                0.1479
generative           0.0817
ai                   0.0764
education            0.0562
learning             0.0537
gai                  0.0526
research             0.0432
tools                0.0375
review               0.0352
study                0.0336
higher               0.0335
studies              0.0323
students             0.0307
language             0.0280
int

### extra exclusion terms

In [76]:
def tokens_from_keywords(keyword_dict):
    tokens = set()
    for phrases in keyword_dict.values():
        for phrase in phrases:
            for tok in phrase.lower().split():
                tokens.add(tok)
    return tokens

review_tokens = tokens_from_keywords(KEYWORDS)
ai_tokens     = tokens_from_keywords(AI_KEYWORDS)
extra_tokens = {
    "literacy",
    "school",
    "learner",
    "learning",
    "outcome",
    "outcomes",
    "education",
    "educational",
}

EXCLUDED_TOKENS = review_tokens | ai_tokens | extra_tokens

EXCLUDED_PREFIXES = ("educat", "student")


In [77]:
def make_excluding_tokenizer(excluded_tokens, excluded_prefixes=()):
    excluded_tokens = {t.lower() for t in excluded_tokens}
    excluded_prefixes = tuple(p.lower() for p in excluded_prefixes)

    def tokenizer(doc: str):
        toks = (doc or "").lower().split()
        kept = []
        for t in toks:
            if t in excluded_tokens:
                continue
            if any(t.startswith(pref) for pref in excluded_prefixes):
                continue
            kept.append(t)
        return kept

    return tokenizer


def build_tfidf_excluding(df: pd.DataFrame, text_col: str, max_features: int = 5000):
    corpus = df[text_col].fillna("").tolist()
    tokenizer = make_excluding_tokenizer(EXCLUDED_TOKENS, EXCLUDED_PREFIXES)

    vec = TfidfVectorizer(
        max_features=max_features,
        tokenizer=tokenizer,
        preprocessor=None,
        lowercase=True,
        stop_words="english",  # keep standard English stopwords too
    )
    tfidf = vec.fit_transform(corpus)
    return vec, tfidf


In [78]:
vec_title_excl, tfidf_title_excl = build_tfidf_excluding(df, "title_clean", max_features=3000)
vec_abs_excl,   tfidf_abs_excl   = build_tfidf_excluding(df, "abstract_clean", max_features=5000)

print_top_terms(vec_title_excl, tfidf_title_excl, n_terms=30, label="titles (excluded)")
print_top_terms(vec_abs_excl,   tfidf_abs_excl,   n_terms=40, label="abstracts (excluded)")



The parameter 'token_pattern' will not be used since 'tokenizer' is not None'




Top 30 TF-IDF terms for titles (excluded):
review                0.0959
literature            0.0642
higher                0.0396
research              0.0258
language              0.0245
impact                0.0239
use                   0.0226
teaching              0.0221
based                 0.0218
applications          0.0208
k                     0.0201
challenges            0.0200
future                0.0166
medical               0.0163
integration           0.0138
trends                0.0133
role                  0.0133
exploring             0.0127
academic              0.0125
tools                 0.0123
chatbots              0.0121
nursing               0.0112
using                 0.0111
personalized          0.0110
science               0.0109
technologies          0.0108
development           0.0106
english               0.0105
driven                0.0104
bibliometric          0.0103

Top 40 TF-IDF terms for abstracts (excluded):
research              0.0439
review    

In [79]:
k = 8
kmeans_abs_excl, cluster_labels_excl = cluster_articles(tfidf_abs_excl, n_clusters=k)

cluster_top_terms_excl = top_terms_per_cluster(
    kmeans_abs_excl, tfidf_abs_excl, vec_abs_excl, n_terms=15
)

for cid, terms in cluster_top_terms_excl.items():
    print(f"\nCluster {cid} (keywords excluded):")
    print(", ".join(terms))



Cluster 0 (keywords excluded):
gamification, adaptive, emotional, support, service, g, e, based, cognitive, new, research, techniques, effect, item, used

Cluster 1 (keywords excluded):
research, review, gai, study, literature, tools, studies, chatgpt, critical, thinking, s, findings, skills, future, use

Cluster 2 (keywords excluded):
teachers, k, knowledge, research, ethics, teaching, teacher, professional, development, review, studies, literature, study, practices, future

Cluster 3 (keywords excluded):
ethical, challenges, review, research, data, personalized, higher, potential, privacy, concerns, technologies, study, integration, systems, benefits

Cluster 4 (keywords excluded):
research, use, based, science, studies, review, financial, tools, study, work, assessment, relationships, computer, authors, human

Cluster 5 (keywords excluded):
language, writing, feedback, efl, english, chatbots, tools, research, l, studies, learners, speaking, review, skills, foreign

Cluster 6 (keywo

clusters within the 3 clusters with excluded terms

In [84]:
vec_abs, tfidf_abs = build_tfidf(df, "abstract_clean", max_features=5000)
vec_abs_excl, tfidf_abs_excl = build_tfidf_excluding(df, "abstract_clean", max_features=5000)
for cluster in ["Ai_Ed", "gen_AI", "AI_plain"]:
    df[f"has_{cluster}_either"] = (
        (df[f"has_{cluster}_title"] == 1) |
        (df[f"has_{cluster}_abstract"] == 1)
    ).astype(int)

ai_masks = {
    "Ai_Ed":    df["has_Ai_Ed_either"] == 1,
    "gen_AI":   df["has_gen_AI_either"] == 1,
    "AI_plain": df["has_AI_plain_either"] == 1,
}

print({k: int(m.sum()) for k, m in ai_masks.items()})

X_full = tfidf_abs_excl
vec_full = vec_abs_excl

ai_cluster_k = {
    "Ai_Ed": 3,
    "gen_AI": 5,
    "AI_plain": 10,
}

def cluster_subset(X, n_clusters=5, random_state=42):
    km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    labels = km.fit_predict(X)
    return km, labels

def top_terms_per_cluster_local(kmeans_model, vectorizer, n_terms=15):
    feature_names = np.array(vectorizer.get_feature_names_out())
    centroids = kmeans_model.cluster_centers_
    out = {}
    for cid, centroid in enumerate(centroids):
        top_idx = centroid.argsort()[::-1][:n_terms]
        out[cid] = list(feature_names[top_idx])
    return out


for group_name, mask in ai_masks.items():
    idx = np.where(mask)[0]
    n_docs = len(idx)
    k = ai_cluster_k[group_name]

    print(f"\n=== {group_name} ===")
    print(f"n_docs in this group: {n_docs}")
    print(f"Using k = {k}")

    X_sub = X_full[idx, :]

    km, labels = cluster_subset(X_sub, n_clusters=k, random_state=42)

    col_name = f"cluster_abs_{group_name}"
    df.loc[mask, col_name] = labels

    top_terms = top_terms_per_cluster_local(km, vec_full, n_terms=15)

    for cid in range(k):
        size = int((labels == cid).sum())
        print(f"\n  Cluster {cid} (n = {size}):")
        print("   " + ", ".join(top_terms[cid]))




The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



{'Ai_Ed': 80, 'gen_AI': 206, 'AI_plain': 840}

=== Ai_Ed ===
n_docs in this group: 80
Using k = 3

  Cluster 0 (n = 26):
   research, reviews, field, higher, theme, review, study, ethical, literature, results, s, applications, hotspots, articles, systems

  Cluster 1 (n = 45):
   k, knowledge, teaching, research, studies, medical, study, content, review, curriculum, radiology, programs, design, curricula, future

  Cluster 2 (n = 9):
   effect, hcai, risk, hot, achievement, size, indicators, skills, governance, variables, pr, mc, mr, psr, mp

=== gen_AI ===
n_docs in this group: 206
Using k = 5

  Cluster 0 (n = 45):
   design, feedback, studies, assessment, research, review, chatgpt, science, teachers, study, empirical, practices, knowledge, s, teacher

  Cluster 1 (n = 60):
   language, tools, research, review, studies, study, impact, literature, writing, use, teaching, efl, integration, potential, based

  Cluster 2 (n = 17):
   training, medical, research, theme, ethical, themes, h

In [85]:
print("TF-IDF shape (X_full):", X_full.shape)
print("Using excluded terms:", "excl" in repr(vec_full))

for name, mask in ai_masks.items():
    print(name, "docs:", int(mask.sum()))


TF-IDF shape (X_full): (996, 5000)
Using excluded terms: True
Ai_Ed docs: 80
gen_AI docs: 206
AI_plain docs: 840


### Latent Dirichlet Allocation

In [33]:
abstracts = df["abstract_clean"].fillna("")

count_vectorizer = CountVectorizer( #bag of words
    max_df=0.95,       # ignore terms in >95% of documents
    min_df=5,          # ignore terms in <5 documents
    stop_words="english"
)

doc_term_matrix = count_vectorizer.fit_transform(abstracts)
feature_names = np.array(count_vectorizer.get_feature_names_out())

doc_term_matrix.shape


(996, 1940)

In [34]:
n_topics = 8

lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method="batch",
    random_state=42,
    max_iter=20,
)

lda.fit(doc_term_matrix)


In [35]:
# larger value in topic vector = more important term in that topic
def print_top_words_per_topic(model, feature_names, n_top_words=15):
    for topic_idx, topic in enumerate(
        model.components_):
        top_idx = topic.argsort()[::-1][:n_top_words]
        terms = feature_names[top_idx]
        weights = topic[top_idx]
        print(f"\nTopic {topic_idx}:")
        print(", ".join(terms))

print_top_words_per_topic(lda, feature_names, n_top_words=15)



Topic 0:
ai, learning, review, study, tool, education, language, student, research, learn, finding, enhance, learner, technology, support

Topic 1:
ai, education, research, review, study, literacy, ethical, technology, literature, teacher, systematic, educational, future, development, artificial

Topic 2:
datum, emotional, network, emotion, support, model, recognition, base, detection, child, learn, neural, time, cognitive, performance

Topic 3:
education, artificial, intelligence, research, learning, ai, student, educational, review, study, technology, learn, result, paper, literature

Topic 4:
genai, generative, author, work, research, practice, relationship, education, use, declare, financial, tool, assessment, potential, impact

Topic 5:
study, research, effect, student, gai, learn, education, analysis, educational, generative, aied, impact, high, learning, meta

Topic 6:
ai, study, student, analysis, base, language, effect, meta, education, include, nursing, review, outcome, resu

In [36]:
# topic distribution for each document
doc_topic_dist = lda.transform(doc_term_matrix)   # shape: (n_docs, n_topics)

df["lda_main_topic"] = doc_topic_dist.argmax(axis=1)
df["lda_main_topic_prob"] = doc_topic_dist.max(axis=1)

for t in range(n_topics):
    df[f"lda_topic_{t}"] = doc_topic_dist[:, t]

df[["Key", "Title", "lda_main_topic", "lda_main_topic_prob"]].head()


Unnamed: 0,Key,Title,lda_main_topic,lda_main_topic_prob
0,RIMXHGKB,A systematic review of AI literacy conceptuali...,1,0.993259
1,BKWIU8LW,Artificial intelligence in learning and develo...,3,0.372807
2,JPMFG28C,Artificial intelligence in education: A system...,1,0.456937
3,768ETFLW,The use of Artificial intelligence in school s...,3,0.510099
4,MJDGH7HH,Artificial intelligence in mathematics educati...,6,0.450356


In [37]:
topic_counts = df["lda_main_topic"].value_counts().sort_index()
topic_counts


Unnamed: 0_level_0,count
lda_main_topic,Unnamed: 1_level_1
0,283
1,256
2,4
3,147
4,15
5,48
6,79
7,164


In [38]:
def find_similar_by_topic(df, doc_topic_dist, key_value, top_n=10):
    idx = df.index[df["Key"] == key_value]
    if len(idx) == 0:
        raise ValueError(f"Key {key_value} not found.")
    idx = idx[0]

    sims = cosine_similarity(doc_topic_dist[idx].reshape(1, -1),
                             doc_topic_dist).flatten()

    order = sims.argsort()[::-1]
    order = [i for i in order if i != idx][:top_n]

    rows = []
    target_kws = set(df.loc[idx, "keyword_groups_any_list"]) if "keyword_groups_any_list" in df.columns else set()

    for j in order:
        row = df.loc[j]
        kws = set(row.get("keyword_groups_any_list", []))
        shared_kws = target_kws.intersection(kws)

        rows.append({
            "Key": row["Key"],
            "Title": row["Title"],
            "lda_main_topic": row["lda_main_topic"],
            "similarity_on_topics": sims[j],
            "keyword_groups_any_list": list(kws),
            "shared_keyword_groups": list(shared_kws),
        })

    return pd.DataFrame(rows)

example_key = df["Key"].iloc[0]
similar_on_topics = find_similar_by_topic(df, doc_topic_dist, example_key, top_n=10)
similar_on_topics


Unnamed: 0,Key,Title,lda_main_topic,similarity_on_topics,keyword_groups_any_list,shared_keyword_groups
0,C4DID9UH,AI Acceptance and Usage in Sub-Saharan African...,1,1.0,[systematic_review],[systematic_review]
1,RS4QG5GS,A Risk Framework for Human-centered Artificial...,1,1.0,"[meta_analysis, systematic_review]",[systematic_review]
2,I8DHLABF,Educational Technology & Society,1,1.0,"[meta_analysis, systematic_review]",[systematic_review]
3,DY2JF7T8,A Risk Framework for Human-centered Artificial...,1,1.0,"[meta_analysis, systematic_review]",[systematic_review]
4,8IICMK2X,The Application Landscape and Research Status ...,1,1.0,"[systematic_review, overview]",[systematic_review]
5,4TFR85B9,K-12 teachers’ ethical competencies for AI lit...,1,1.0,[systematic_review],[systematic_review]
6,GNT8Q78Y,In search of artificial intelligence (AI) lite...,1,1.0,[scoping],[]
7,WLHMQZMM,Ethical Implications and Principles of Using A...,1,1.0,"[systematic_review, overview]",[systematic_review]
8,TLDCGSJV,Critical Artificial Intelligence literacy: A s...,1,1.0,[scoping],[]
9,CKTWGQAP,Toward Agency-Centered AI Literacy: A Scoping ...,1,1.0,[scoping],[]


### AI Clusters

In [39]:
### AI Clusters