# Data exploration

In [2]:
# Import libraries
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the data
# The CSV location is machine-specific. Set the OPENALEX_PAPERS_CSV environment
# variable to point at your own copy of the file; when it is not set, the
# original author's local path is used as the default.
dionne_path = os.environ.get(
    "OPENALEX_PAPERS_CSV",
    "/Users/dionnespaltman/Desktop/Luiss /Data Science in Action/Project/openalex_papers.csv",
)
df = pd.read_csv(dionne_path)


In [None]:
# Preview the first five rows of the loaded frame
display(df.head(n=5))

Unnamed: 0,id,submitter,authors,title,comments,journ_ref,doi,abstract,versions,year,journal,keywords,topics
0,https://openalex.org/W3034272367,,"Ioannis Antonopoulos, Valentin Robu, Benoit Co...",Artificial intelligence and machine learning a...,,,https://doi.org/10.1016/j.rser.2020.109899,Recent years have seen an increasing interest ...,[],2020,Renewable and Sustainable Energy Reviews,[],"['Smart Grid Energy Management', 'Energy Effic..."
1,https://openalex.org/W2141042444,,W. Brian Arthur,The Economy as an Evolving Complex System II,,,https://doi.org/10.1201/9780429496639,"* Introduction W.B. Arthur, S.N., Durlauf, and...",[],2018,CRC Press eBooks,[],"['Complex Systems and Time Series Analysis', '..."
2,https://openalex.org/W789578048,,"Leonel A. Laboissiere, Ricardo A. S. Fernandes...",Maximum and minimum stock price forecasting of...,,,https://doi.org/10.1016/j.asoc.2015.06.005,,[],2015,Applied Soft Computing,"[{'id': 'https://openalex.org/keywords/stock',...","['Stock Market Forecasting Methods', 'Energy L..."
3,https://openalex.org/W4321748146,,"Abhishek Aggarwal, Cheuk Chi Tam, Dezhi Wu, Xi...",Artificial Intelligence–Based Chatbots for Pro...,,,https://doi.org/10.2196/40789,Background Artificial intelligence (AI)–based ...,[],2023,Journal of Medical Internet Research,[{'id': 'https://openalex.org/keywords/chatbot...,"['Digital Mental Health Interventions', 'Mobil..."
4,https://openalex.org/W2944828013,,"Christina A. Roberto, Hannah G. Lawman, Michae...",Association of a Beverage Tax on Sugar-Sweeten...,,,https://doi.org/10.1001/jama.2019.4249,Policy makers have implemented beverage taxes ...,[],2019,JAMA,[{'id': 'https://openalex.org/keywords/fluid-o...,"['Obesity, Physical Activity, Diet', 'Nutritio..."


In [5]:
# Column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() -- doing so appends a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         25 non-null     object 
 1   submitter  0 non-null      float64
 2   authors    24 non-null     object 
 3   title      25 non-null     object 
 4   comments   0 non-null      float64
 5   journ_ref  0 non-null      float64
 6   doi        25 non-null     object 
 7   abstract   24 non-null     object 
 8   versions   25 non-null     object 
 9   year       25 non-null     int64  
 10  journal    25 non-null     object 
 11  keywords   25 non-null     object 
 12  topics     25 non-null     object 
dtypes: float64(3), int64(1), object(9)
memory usage: 2.7+ KB


None

In [6]:
# Number of missing values per column (before any filling)
missing_per_column = df.isna().sum()
print(missing_per_column)

id            0
submitter    25
authors       1
title         0
comments     25
journ_ref    25
doi           0
abstract      1
versions      0
year          0
journal       0
keywords      0
topics        0
dtype: int64


In [7]:
# Fill NA
# Replace missing values with an empty string in every column that has gaps,
# one column at a time and in the same order as before.
for na_column in ('abstract', 'submitter', 'authors', 'comments', 'journ_ref'):
    df[na_column] = df[na_column].fillna("")


In [14]:
# Re-check the number of missing values per column after filling
remaining_missing = df.isna().sum()
print(remaining_missing)

id           0
submitter    0
authors      0
title        0
comments     0
journ_ref    0
doi          0
abstract     0
versions     0
year         0
journal      0
keywords     0
topics       0
dtype: int64


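Open question: now that the missing values have been filled with empty strings, could the articles that had an NA somewhere be treated as similar to each other? For the abstract-based similarity below this should not happen: an empty abstract becomes an all-zero TF-IDF vector, so its cosine similarity with every other article is 0. TODO: verify this on the one article whose abstract was missing.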

In [13]:
# Preprocess: the vectorizer needs strings, so make sure no abstract is NaN
df['abstract'] = df['abstract'].fillna("")

# Create a TF-IDF matrix from abstracts (English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['abstract'])

# Compute pairwise cosine similarity between all articles
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Example: Find top 3 similar articles for the first article
query_idx = 0  # positional index of the article to find neighbours for
top_n = 3      # number of neighbours to report

# Rank every article by descending similarity to the query article, then remove
# the query article explicitly. Simply slicing off the first result is not safe:
# with tied scores (e.g. a duplicated abstract) or an empty abstract (whose
# self-similarity is 0) the article itself is not guaranteed to be ranked first.
ranked_indices = np.argsort(cosine_sim[query_idx])[::-1]
similar_indices = ranked_indices[ranked_indices != query_idx][:top_n]
print("Top 3 similar articles for the first article:")
print(df.iloc[similar_indices][['title', 'year']])


Top 3 similar articles for the first article:
                                                title  year
14  Artificial Intelligence Enabled Demand Respons...  2022
9   Retail Electricity Pricing Strategy via an Art...  2020
19  Artificial intelligence in agricultural value ...  2021
