In [None]:
#load data from a CSV file using pandas
import pandas as pd

# Read the metadata CSV into a DataFrame. low_memory=False asks pandas to
# parse the file in one pass rather than in chunks, which avoids mixed-type
# column inference.
df = pd.read_csv(filepath_or_buffer='metadata.csv', low_memory=False)

In [None]:
# Explore the raw data
print(df.head())          # Preview the first few rows
print(df.shape)           # Dimensions as (rows, columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()                 # Data types and non-null counts
print(df.isnull().sum())  # Missing values per column
print(df.describe())      # Summary statistics for numerical columns

In [None]:
# Keep only rows that have a title, an abstract and a publication date.
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of df.
df_cleaned = df.dropna(subset=['title', 'abstract', 'publish_time']).copy()
# errors='coerce' turns values that cannot be parsed as dates into NaT
# instead of raising.
df_cleaned['publish_time'] = pd.to_datetime(df_cleaned['publish_time'], errors='coerce')
# A single NaT would make .dt.year a float column (2020.0, ...), which then
# shows up as float labels in the per-year chart. The nullable Int64 dtype
# keeps the years integral and represents unparseable dates as <NA>.
df_cleaned['year'] = df_cleaned['publish_time'].dt.year.astype('Int64')

In [None]:
# Store the number of whitespace-separated tokens of each abstract in a new column
df_cleaned['abstract_word_count'] = df_cleaned['abstract'].map(
    lambda abstract: len(str(abstract).split())
)

In [None]:
# Publications by year: count rows per year, then order the counts by year
year_counts = df_cleaned['year'].value_counts()
year_counts = year_counts.sort_index()
year_counts.plot.bar(title='Publications by Year')

In [None]:
# Top journals: the ten most frequent values of the journal column
journal_counts = df_cleaned['journal'].value_counts()
top_journals = journal_counts.iloc[:10]
top_journals.plot.barh(title='Top Publishing Journals')

In [None]:
#Word Frequency in Titles
from collections import Counter
import re

# Concatenate all titles, lower-case the text and split it into word tokens
title_text = ' '.join(df_cleaned['title'].dropna())
words = re.findall(r'\b\w+\b', title_text.lower())

# The 20 most frequent tokens as (word, count) pairs
common_words = Counter(words).most_common(20)

# Tabulate the pairs so pandas can draw them as a bar chart
word_df = pd.DataFrame(common_words, columns=['word', 'count'])
word_df.plot(kind='bar', x='word', y='count', title='Most Frequent Words in Titles')

In [None]:
#Word Cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join all titles into one string and pass it to WordCloud.generate
text = ' '.join(df_cleaned['title'].dropna())
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(text)

# Show the generated image on a 10x5 figure with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Paper Titles')
plt.show()

In [None]:
# Distribution by source: share of rows for each value of source_x
source_counts = df_cleaned['source_x'].value_counts()
source_counts.plot.pie(autopct='%1.1f%%', title='Distribution by Source')