In [1]:
# Colab-specific helper used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive so that paths under that directory become readable/writable.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, MinMaxScaler
from scipy.sparse import hstack, csr_matrix
import numpy as np

# Load the labelled dataset (one row per paper)
df = pd.read_excel('/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/Raw_with_predicted_classification_label.xlsx')

# Fill missing values into new underscore-named columns; the raw columns are left untouched
df['Authors'] = df['Author'].fillna('')
df['Document_Type'] = df['Document Type'].fillna('Unknown')
df['Publication_Year'] = df['Publication Year'].fillna(df['Publication Year'].median())

# ---------- AUTHORS ----------
# Only the most frequent authors are kept as features, to limit sparsity
TOP_N_AUTHORS = 100

# Split on ';' and strip every name so that "A; B", "A;B" and "A;  B" all yield
# the same author tokens; blank entries (e.g. from an empty Author cell) are dropped.
all_authors = df['Authors'].str.split(';').apply(
    lambda names: [name.strip() for name in names if name.strip()]
)
flat_authors = [name for names in all_authors for name in names]
top_authors = pd.Series(flat_authors).value_counts().nlargest(TOP_N_AUTHORS).index.tolist()

# Set copy of the top authors: constant-time membership tests in the per-row filter
top_author_set = set(top_authors)

# Per paper, keep only the authors that made the top list
df['Filtered_Authors'] = all_authors.apply(
    lambda names: [name for name in names if name in top_author_set]
)

# Binarize author features: one 0/1 column per retained author
mlb = MultiLabelBinarizer()
author_matrix = mlb.fit_transform(df['Filtered_Authors'])

# ---------- DOCUMENT TYPE ----------
# One-hot encode the document type (sparse output)
ohe_doc_type = OneHotEncoder(sparse_output=True)
doc_type_matrix = ohe_doc_type.fit_transform(df[['Document_Type']])


# ---------- PUBLICATION YEAR ----------
# Rescale the year with MinMaxScaler (default feature range) so it is on a
# scale comparable to the 0/1 indicator columns
scaler_year = MinMaxScaler()
year_matrix = scaler_year.fit_transform(df[['Publication_Year']])

# Convert to sparse matrix so it can be stacked with the other blocks
year_matrix_sparse = csr_matrix(year_matrix)

# ---------- COMBINE ----------
# Column-wise concatenation: [authors | document type | year].
# CSR is requested explicitly (hstack defaults to COO) so the result supports row slicing.
metadata_matrix = hstack([author_matrix, doc_type_matrix, year_matrix_sparse], format='csr')


In [6]:
from scipy.sparse import save_npz

# Persist the combined metadata feature matrix to Drive in SciPy's .npz sparse format,
# so it can be reloaded later with scipy.sparse.load_npz instead of being rebuilt.
save_npz('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/metadata_matrix.npz', metadata_matrix)


In [11]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import load_npz
import numpy as np

# Load the same dataframe (with citation info)
df = pd.read_excel('/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/Raw_with_predicted_classification_label.xlsx')

# Load feature matrices
tfidf_matrix = load_npz('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/tfidf_matrix.npz')
metadata_matrix = load_npz('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/metadata_matrix.npz')

# The dataframe and both matrices are loaded from separate files and are matched
# purely by row position. Fail loudly if they disagree, rather than silently
# attaching similarity scores to the wrong papers.
n_papers = len(df)
if tfidf_matrix.shape[0] != n_papers or metadata_matrix.shape[0] != n_papers:
    raise ValueError(
        "Row mismatch between inputs: "
        f"df has {n_papers} rows, tfidf_matrix has {tfidf_matrix.shape[0]}, "
        f"metadata_matrix has {metadata_matrix.shape[0]}"
    )

# Compute pairwise cosine similarities (each result is a dense n_papers x n_papers array)
content_similarity = cosine_similarity(tfidf_matrix)
metadata_similarity = cosine_similarity(metadata_matrix)

# Combine: 0.7 * Content + 0.3 * Metadata
CONTENT_WEIGHT = 0.7
METADATA_WEIGHT = 0.3
final_similarity = CONTENT_WEIGHT * content_similarity + METADATA_WEIGHT * metadata_similarity

# Function to recommend top N papers for a given paper index
def recommend_papers_by_index(paper_idx, top_n=10, similarity=None, papers=None):
    """Recommend papers similar to the paper at row position ``paper_idx``.

    The ``top_n * 5`` most similar other papers are taken as candidates, the
    candidates are re-ranked by citation count (ties broken by similarity),
    and the first ``top_n`` rows are returned.

    Parameters
    ----------
    paper_idx : int
        Row position of the query paper. Negative values count from the end.
    top_n : int, default 10
        Number of recommendations to return. Must be >= 0.
    similarity : 2-D array, optional
        Pairwise similarity matrix indexed by row position. Defaults to the
        notebook-level ``final_similarity``.
    papers : pandas.DataFrame, optional
        Paper table whose rows are in the same order as ``similarity``.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with the columns 'Article Title',
        'Times Cited, All Databases', 'Author', 'Publication Year',
        'Document Type', 'Label' and 'Similarity', with a fresh 0..k-1 index.

    Raises
    ------
    IndexError
        If ``paper_idx`` is outside the similarity matrix.
    ValueError
        If ``top_n`` is negative.
    """
    # Fall back to the notebook-level objects only when nothing was passed in.
    sim = final_similarity if similarity is None else similarity
    frame = df if papers is None else papers

    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    n_papers = len(sim)
    if not -n_papers <= paper_idx < n_papers:
        raise IndexError(f"paper_idx {paper_idx} is out of range for {n_papers} papers")
    # Normalise negative positions; otherwise the self-exclusion mask below
    # would never match and the query paper would recommend itself.
    paper_idx = paper_idx % n_papers

    sim_scores = np.asarray(sim[paper_idx])

    # Rank by similarity, highest first. A stable sort keeps tied papers in
    # row order so the output is reproducible.
    ranked = np.argsort(-sim_scores, kind='stable')
    # Drop the query paper, then keep a wider pool (5x) for the citation re-ranking.
    candidates = ranked[ranked != paper_idx][:top_n * 5]

    columns = ['Article Title', 'Times Cited, All Databases', 'Author',
               'Publication Year', 'Document Type', 'Label']
    # .copy() so that adding the Similarity column never writes into (or warns
    # about) a view of the caller's dataframe.
    results = frame.iloc[candidates][columns].copy()
    results['Similarity'] = sim_scores[candidates]

    # Most cited first; equally cited papers are ordered by similarity.
    results = results.sort_values(
        by=['Times Cited, All Databases', 'Similarity'],
        ascending=[False, False],
    ).head(top_n)

    return results.reset_index(drop=True)

# Demo run: the 10 recommendations for the paper stored at row position 42
recommendations = recommend_papers_by_index(paper_idx=42, top_n=10)
print(recommendations)


                                       Article Title  \
0  RNA-seq assistant: machine learning based meth...   
1  Machine Learning-Based Gene Prioritization Ide...   
2  RNA-seq assistant: machine learning based meth...   
3  Predicting Autism Spectrum Disorder Using Bloo...   
4  Machine learning based refined differential ge...   
5  Revealing Alzheimer's disease genes spectrum i...   
6  Machine Learning-Based Identification of Colon...   
7  Integrative machine learning analysis of multi...   
8  Diagnostic genes and immune infiltration analy...   
9  Discovery of gene module acting on ubiquitin- ...   

   Times Cited, All Databases  \
0                        48.0   
1                        48.0   
2                        48.0   
3                        41.0   
4                        38.0   
5                        38.0   
6                        25.0   
7                        23.0   
8                        12.0   
9                        12.0   

                   