In [None]:
!gdown --id 1ExuBzkObUNmqgmeaVMF-OqnAuo9yuozK -O all_capped_keywords.zip

Downloading...
From (original): https://drive.google.com/uc?id=1ExuBzkObUNmqgmeaVMF-OqnAuo9yuozK
From (redirected): https://drive.google.com/uc?id=1ExuBzkObUNmqgmeaVMF-OqnAuo9yuozK&confirm=t&uuid=01aeb256-2397-4eec-bfdf-0452590dca95
To: /content/all_capped_keywords.zip
100% 579M/579M [00:09<00:00, 58.0MB/s]


In [None]:
!unzip all_capped_keywords.zip -d cs_papersum_keywords
!ls -la cs_papersum_keywords/

Archive:  all_capped_keywords.zip
  inflating: cs_papersum_keywords/All_capped_keywords.csv  
total 3000408
drwxr-xr-x 2 root root       4096 Sep 20 13:28 .
drwxr-xr-x 1 root root       4096 Sep 20 13:28 ..
-rw-r--r-- 1 root root 3072402092 Jul 29  2024 All_capped_keywords.csv


In [None]:
import pandas as pd

# Load the extracted CSV into memory.
df = pd.read_csv('cs_papersum_keywords/All_capped_keywords.csv')

n_rows, n_cols = df.shape
print("Dataset shape:", df.shape)
print(f"Rows: {n_rows}, Columns: {n_cols}")

# Each overview section is a heading plus a zero-argument callable, so every
# view is computed right before it is printed (same order as listed here).
overview_sections = (
    ("\nColumn names:", lambda: df.columns.tolist()),
    ("\nFirst 5 rows:", lambda: df.head()),
    ("\nLast 5 rows:", lambda: df.tail()),
    ("\nData types:", lambda: df.dtypes),
    ("\nMissing values:", lambda: df.isnull().sum()),
    ("\nBasic statistics:", lambda: df.describe()),
)
for heading, build_view in overview_sections:
    print(heading)
    print(build_view())

Dataset shape: (91919, 25)
Rows: 91919, Columns: 25

Column names:
['paperID', 'venue', 'year', 'openAccessPdf', 'url', 'authors', 'referenceCount', 'title', 'abstract', 'conclusion', 'Chatgpt Response', 'Key Takeaways', 'Importance', 'Model/Method Proposed', 'Performance', 'Effectiveness', 'Future Works', 'Sentiment', 'Sentiment Score', 'combined', 'combined_keywords', 'response_keywords', 'future_work_keywords', 'capped_keywords', 'field']

First 5 rows:
                                    paperID  \
0  000194903cb83bd4af714f950d0266382e2772fc   
1  000351d89ee73f2fa721961129ec9c0758cf20ca   
2  001198d3ef0d304718cfd64ff241d0133adcf928   
3  00133d41d5ecef1a9d046d2b92bb2a23a335cb7c   
4  002285ade41b9e1313f9c914b99e68b62fab7ff1   

                                        venue  year  openAccessPdf  \
0  AAAI Conference on Artificial Intelligence  2021           True   
1  AAAI Conference on Artificial Intelligence  2017          False   
2  AAAI Conference on Artificial Intelligence 

In [None]:
# Columns that matter most for the research gap analysis
key_columns = ['title', 'abstract', 'Future Works', 'field', 'year', 'venue']

# Frequency tables are computed once, then trimmed for display.
field_frequency = df['field'].value_counts()
year_frequency = df['year'].value_counts().sort_index()
venue_frequency = df['venue'].value_counts()

# Ten most frequent research fields
print("\nResearch fields distribution:")
print(field_frequency.head(10))

# Ten most recent years (the table is sorted by year, so tail() is the latest)
print("\nYear distribution:")
print(year_frequency.tail(10))

# Ten most frequent venues
print("\nTop venues:")
print(venue_frequency.head(10))


Research fields distribution:
field
Computational Theory                  17214
Computer Vision                       14842
Natural Language Processing           14131
Artificial Intelligence               12084
Human Computer Interaction             7393
Computing in Biomedical Fields         6310
Computer Networks & Communications     3900
Graphics and Computer-Aided Design     3155
Software Engineering                   2482
Computer Security & Cryptography       1341
Name: count, dtype: int64

Year distribution:
year
2017     8120
2018     8704
2019    12642
2020    13130
2021    15491
2022    14851
2023    18461
2024      520
Name: count, dtype: int64

Top venues:
venue
Neural Information Processing Systems                                                     13857
AAAI Conference on Artificial Intelligence                                                10498
Computer Vision and Pattern Recognition                                                    9419
Conference on Empirical Met

In [None]:
# Derive the cleaned frame from a copy so the raw `df` is never modified;
# rows without an abstract are dropped.
rows_before = len(df)
df_clean = df.copy().dropna(subset=['abstract'])

print("Before cleaning:", rows_before)
print("After removing missing abstracts:", len(df_clean))

# Disabled draft: filling the remaining missing values. Kept for reference; if
# it is re-enabled it must assign to df_clean (the earlier draft assigned the
# filled columns to the raw `df`).
# df_clean["authors"] = df_clean["authors"].fillna("Unknown")
# df_clean["field"] = df_clean["field"].fillna("Unknown")
# df_clean["conclusion"] = df_clean["conclusion"].fillna("")

Before cleaning: 91919
After removing missing abstracts: 87276


In [None]:
# Columns carried into the research gap analysis
research_gap_columns = [
    'title',
    'abstract',
    'Future Works',
    'field',
    'Model/Method Proposed',
    'Performance',
    'Effectiveness',
    'future_work_keywords',
    'capped_keywords',
]

# Independent copy restricted to those columns
gap_analysis_df = df_clean.loc[:, research_gap_columns].copy()

# Peek at the first three raw entries of the two future-work columns
for heading, column in (
    ("\nSample Future Works entries:", 'Future Works'),
    ("\nFuture work keywords sample:", 'future_work_keywords'),
):
    print(heading)
    print(df_clean[column].head(3).values)


Sample Future Works entries:
["['Further enhance interpretability and visualization', 'Explore applications to other domains beyond face anti-spoofing']"
 "['Explore further applications of anticipatory behavior in complex social scenarios.', 'Investigate the impact of different network architectures on agent decision-making.']"
 "['Improvement of the medical ontology system with real-world evidence.', 'Addressing issues related to false-negative predictions.']"]

Future work keywords sample:
["[('face anti spoofing', 0.6095), ('anti spoofing', 0.5719), ('spoofing', 0.5294), ('domains face anti', 0.5041), ('applications domains face', 0.4914), ('applications', 0.4733), ('domains face', 0.4467), ('explore applications', 0.3684), ('enhance interpretability visualization', 0.3626), ('face anti', 0.3612), ('applications domains', 0.3538), ('explore applications domains', 0.3494), ('enhance interpretability', 0.3429), ('interpretability visualization', 0.3329), ('interpretability visualiza

In [None]:
import re

def normalize_text(text):
    """Lowercase ``text``, strip citation markers and collapse whitespace.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing values) is treated as empty.

    Returns:
        The cleaned, lowercased string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Converting to lowercase
    text = text.lower()

    # Removing references: numeric markers such as "[12]" and parenthetical
    # citations such as "(smith et al., 2020)"
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\([^)]*et al\.,\s*\d{4}\)', '', text)

    # Collapsing whitespace LAST: \s+ already covers newlines and tabs, and
    # running it after the removals also closes the double spaces they leave.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Each raw text column gets a normalized companion column.
for raw_column, cleaned_column in (
    ('Future Works', 'future_work_cleaned'),
    ('abstract', 'abstract_cleaned'),
    ('Model/Method Proposed', 'method_proposed_cleaned'),
):
    df_clean[cleaned_column] = df_clean[raw_column].apply(normalize_text)


In [None]:
# How many papers actually carry a conclusion?
conclusion_present = df_clean['conclusion'].notna().sum()
conclusion_missing = df_clean['conclusion'].isna().sum()

print("Conclusion availability:")
print(f"Total papers: {len(df_clean)}")
print(f"Papers with conclusion: {conclusion_present}")
print(f"Papers without conclusion: {conclusion_missing}")

# Normalize the conclusion text; normalize_text is applied to every row,
# including those where the conclusion is missing.
df_clean['conclusion_cleaned'] = df_clean['conclusion'].apply(normalize_text)

Conclusion availability:
Total papers: 87276
Papers with conclusion: 39011
Papers without conclusion: 48265


In [None]:
# Combining text sections to make a research gap corpus
def create_gap_corpus(row):
    """Join a paper's cleaned text sections into one tagged string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            'abstract_cleaned', 'method_proposed_cleaned',
            'future_work_cleaned' and 'conclusion_cleaned'.

    Returns:
        The sections longer than 20 characters, each prefixed with its
        bracketed tag and joined by single spaces, in the order abstract,
        method, future work, conclusion. Empty string if none qualifies.
    """
    tagged_columns = (
        ("ABSTRACT", 'abstract_cleaned'),
        ("METHOD", 'method_proposed_cleaned'),
        ("FUTURE_WORK", 'future_work_cleaned'),
        ("CONCLUSION", 'conclusion_cleaned'),
    )
    return " ".join(
        f"[{tag}] {row[column]}"
        for tag, column in tagged_columns
        # Very short fragments (20 characters or fewer) are skipped.
        if row[column] and len(row[column]) > 20
    )

# Creating the gap corpus: create_gap_corpus is called once per row (axis=1)
# and its returned string is stored in the new 'gap_corpus' column.
df_clean['gap_corpus'] = df_clean.apply(create_gap_corpus, axis=1)

In [None]:
# Checking the quality of gap corpus
corpus_lengths = df_clean['gap_corpus'].str.len()

print("Gap Corpus Statistics:")
print(f"Papers with gap corpus: {(corpus_lengths > 0).sum()}")
print(f"Average gap corpus length: {corpus_lengths.mean():.0f} characters")

# Show the first three papers as examples
print("\nSample Gap Corpus Entries")
separator = "-" * 100
for i in range(3):
    paper = df_clean.iloc[i]
    print(f"\nPaper {i+1}:")
    print(f"Title: {paper['title']}")
    print(f"Field: {paper['field']}")
    print(f"Gap Corpus (first 300 chars): {paper['gap_corpus'][:300]}...")
    print(separator)

Gap Corpus Statistics:
Papers with gap corpus: 87276
Average gap corpus length: 2985 characters

Sample Gap Corpus Entries

Paper 1:
Title: Generalizable Representation Learning for Mixture Domain Face Anti-Spoofing
Field: Artificial Intelligence
Gap Corpus (first 300 chars): [ABSTRACT] face anti-spoofing approach based on domain generalization (dg) has drawn growing attention due to its robustness for unseen scenarios. existing dg methods assume that the domain label is known. however, in real-world applications, the collected dataset always contains mixture domains, wh...
----------------------------------------------------------------------------------------------------

Paper 2:
Title: Coordinating Human and Agent Behavior in Collective-Risk Scenarios
Field: Artificial Intelligence
Gap Corpus (first 300 chars): [ABSTRACT] various social situations entail a collective risk. a well-known example is climate change, wherein the risk of a future environmental disaster clashes with the i

In [None]:
# Filtering papers with substantial gap information
min_corpus_length = 100  # Minimum characters for meaningful analysis

# A paper is kept only if its corpus is long enough AND its field is known.
corpus_long_enough = df_clean['gap_corpus'].str.len() >= min_corpus_length
field_known = df_clean['field'].notna()
df_gap_ready = df_clean.loc[corpus_long_enough & field_known].copy()

n_ready = len(df_gap_ready)
n_total = len(df_clean)
print(f"Papers ready for gap analysis: {n_ready}")
print(f"Original dataset: {n_total}")
print(f"Filtered out: {n_total - n_ready} papers")

Papers ready for gap analysis: 80318
Original dataset: 87276
Filtered out: 6958 papers


In [None]:
# Seeing gap corpus distribution by field (ten largest fields)
field_counts = df_gap_ready['field'].value_counts().head(10)
print("Top research fields with gap corpus:")
print(field_counts)

# value_counts() sorts descending, so the first index entry is the largest field
top_field = field_counts.index[0]
in_top_field = df_gap_ready['field'] == top_field
sample_papers = df_gap_ready.loc[in_top_field].head(2)

print(f"\nSample Gap Corpus from {top_field}")
for _idx, paper in sample_papers.iterrows():
    title_preview = paper['title'][:100]
    corpus_preview = paper['gap_corpus'][:400]
    print(f"Title: {title_preview}...")
    print(f"Gap Corpus: {corpus_preview}...")
    print("-" * 100)

Top research fields with gap corpus:
field
Computational Theory                  16103
Computer Vision                       14786
Natural Language Processing           14007
Artificial Intelligence               11930
Human Computer Interaction             6893
Computing in Biomedical Fields         4579
Computer Networks & Communications     3872
Graphics and Computer-Aided Design     3050
Software Engineering                   2336
Computer Hardware & Architecture       1214
Name: count, dtype: int64

Sample Gap Corpus from Computational Theory
Title: Bandit algorithms: Letting go of logarithmic regret for statistical robustness...
Gap Corpus: [ABSTRACT] we study regret minimization in a stochastic multi-armed bandit setting and establish a fundamental trade-off between the regret suffered under an algorithm, and its statistical robustness. considering broad classes of underlying arms' distributions, we show that bandit learning algorithms with logarithmic regret are always inconsis

In [None]:
# Analyzing future work patterns
from collections import Counter
import re

# Combining all future works text into one space-separated string
all_future_works = df_clean['future_work_cleaned'].fillna('')
future_text = ' '.join(all_future_works)

# Some common research gap indicators
gap_indicators = [
    'improve', 'enhance', 'extend', 'future work', 'limitation',
    'challenge', 'unexplored', 'further research', 'investigate',
    'not addressed', 'remain', 'need', 'require'
]

# Counting appearances of each indicator and storing them in a dictionary.
# str.count counts substring occurrences, so e.g. 'need' also counts 'needs'.
gap_counts = {indicator: future_text.count(indicator) for indicator in gap_indicators}

# Viewing results in descending order of count
ranked_indicators = sorted(gap_counts.items(), key=lambda pair: pair[1], reverse=True)
for word, count in ranked_indicators:
    print(f"'{word}' appears {count} times")

# Grouping by research field to find field-specific gaps: one joined string of
# raw 'Future Works' entries per field
field_gaps = (
    df_clean.groupby('field')['Future Works']
    .apply(lambda entries: ' '.join(entries.fillna('')))
    .to_dict()
)

# Showing the five fields with the most papers
top_fields = df_clean['field'].value_counts().head(5)
print("\nTop research fields:")
print(top_fields)

'investigate' appears 14406 times
'enhance' appears 13330 times
'improve' appears 12971 times
'extend' appears 9165 times
'further research' appears 2162 times
'challenge' appears 1163 times
'limitation' appears 816 times
'need' appears 489 times
'require' appears 313 times
'future work' appears 202 times
'remain' appears 70 times
'unexplored' appears 5 times
'not addressed' appears 1 times

Top research fields:
field
Computational Theory           16103
Computer Vision                14786
Natural Language Processing    14007
Artificial Intelligence        11930
Human Computer Interaction      6893
Name: count, dtype: int64


In [None]:
# Some explicit gap-indicating phrases, grouped by the kind of signal they
# carry. All entries are lowercase: extract_gap_sentences lowercases each
# sentence before looking for them.
explicit_gap_phrases = [
    # Direct limitations
    'limitation', 'limited', 'restrict', 'restricted', 'challenge', 'challenging',
    'difficult', 'difficulty', 'problem', 'issue', 'drawback', 'shortcoming',

    # Future work indicators
    'future work', 'future research', 'further research', 'further investigation',
    'future direction', 'next step', 'remain', 'remains', 'still need',

    # Negation patterns
    'not addressed', 'not considered', 'not explored', 'not investigated',
    'lack', 'lacking', 'absent', 'missing', 'incomplete', 'insufficient',

    # Improvement indicators
    'could be improved', 'need improvement', 'should be enhanced',
    'require', 'requires', 'needs', 'demand', 'necessitate'
]

# Searching for these phrases in gap corpus
def extract_gap_sentences(text, gap_phrases):
    """Return the sentences of ``text`` that contain a gap-indicating phrase.

    ``text`` is split on '.', and each piece is stripped and lowercased. A
    piece is kept when any phrase occurs in it starting at the beginning of a
    word, so 'limitation' still matches 'limitations' while 'lack' no longer
    matches inside 'black' and 'issue' no longer matches inside 'tissue'.

    Args:
        text: Text to scan. Non-string or empty input yields an empty list.
        gap_phrases: Iterable of phrases to look for; compared
            case-insensitively. Empty phrases are ignored.

    Returns:
        List of matching sentences (stripped, lowercased) in their original
        order. Each sentence appears at most once per occurrence in ``text``.
    """
    import re  # local import: this cell does not import re itself

    if not isinstance(text, str) or not text:
        return []

    phrases = [phrase.lower() for phrase in gap_phrases if phrase]
    if not phrases:
        return []

    # (?<!\w) requires that the phrase is not preceded by a word character,
    # i.e. it starts a word. re.escape keeps phrases literal.
    matcher = re.compile(r'(?<!\w)(?:' + '|'.join(map(re.escape, phrases)) + r')')

    gap_sentences = []
    for sentence in text.split('.'):
        sentence = sentence.strip().lower()
        if matcher.search(sentence):
            gap_sentences.append(sentence)

    return gap_sentences

# Store, for every paper, the list returned by extract_gap_sentences when its
# gap corpus is scanned for the explicit gap phrases.
df_gap_ready['gap_sentences'] = df_gap_ready['gap_corpus'].apply(
    lambda x: extract_gap_sentences(x, explicit_gap_phrases)
)

In [None]:
# Looking for specific patterns that indicate research gaps
import re

def extract_contextual_gaps(text):
    """Extract labelled snippets that follow typical gap-signalling cues.

    The text is lowercased and scanned for three cues; each hit contributes
    one ``"<label>: <snippet>"`` string, where the snippet runs up to (not
    including) the next '.':

    * ``however_gap``    - the text following "however,"
    * ``future_gap``     - the text following "future work" / "future works"
    * ``limitation_gap`` - the clause starting at a word beginning with
      fail / fails / struggle / struggles / cannot / unable

    Args:
        text: Text to scan. Non-string input yields an empty list.

    Returns:
        List of labelled snippets, grouped by cue in the order listed above.
        Snippets that are empty after stripping whitespace are skipped.
    """
    if not isinstance(text, str):
        return []

    lowered = text.lower()

    cue_patterns = (
        # Pattern 1: "However, [gap description]"
        ("however_gap", r'however,\s*([^.]*)'),
        # Pattern 2: "Future work [specific direction]". The capture group
        # must directly follow the cue; a greedy [^.]* placed before the group
        # would swallow the whole clause and leave the group empty.
        ("future_gap", r'future works?\b([^.]*)'),
        # Pattern 3: "Our method fails/struggles when...". The outer group
        # captures the full clause (re.findall returns group contents only);
        # the leading \b stops 'unable' from matching inside 'tunable'.
        ("limitation_gap", r'\b((?:fails?|struggles?|cannot|unable)[^.]*)'),
    )

    gap_contexts = []
    for label, pattern in cue_patterns:
        for match in re.findall(pattern, lowered):
            snippet = match.strip()
            if snippet:
                gap_contexts.append(f"{label}: {snippet}")

    return gap_contexts

# Store, for every paper, the list returned by extract_contextual_gaps for its gap corpus.
df_gap_ready['contextual_gaps'] = df_gap_ready['gap_corpus'].apply(extract_contextual_gaps)

In [None]:
# Extracting field-specific gap patterns
def get_field_specific_gaps(field_name, corpus_text):
    """List the field-specific gap terms that occur in ``corpus_text``.

    Args:
        field_name: Research field name; compared case-insensitively against
            the three fields that have a term list (artificial intelligence,
            computer vision, natural language processing).
        corpus_text: Text to search; matching is case-insensitive substring
            containment.

    Returns:
        The matching terms in the order they are listed for the field, or an
        empty list when the field has no term list.
    """
    terms_by_field = {
        # AI/ML specific gaps
        'artificial intelligence': [
            'overfitting', 'underfitting', 'generalization', 'scalability',
            'interpretability', 'bias', 'fairness', 'robustness',
        ],
        # Computer Vision gaps
        'computer vision': [
            'occlusion', 'lighting conditions', 'viewpoint variation',
            'real-time processing', 'annotation cost',
        ],
        # NLP gaps
        'natural language processing': [
            'out-of-vocabulary', 'cross-lingual', 'low-resource',
            'domain adaptation', 'context understanding',
        ],
    }

    candidate_terms = terms_by_field.get(field_name.lower())
    if candidate_terms is None:
        return []

    haystack = corpus_text.lower()
    return [term for term in candidate_terms if term in haystack]

# Row-wise apply (axis=1): get_field_specific_gaps needs both the paper's
# field and its gap corpus, so it is called per row rather than per column.
df_gap_ready['domain_gaps'] = df_gap_ready.apply(
    lambda row: get_field_specific_gaps(row['field'], row['gap_corpus']),
    axis=1
)

In [None]:
# Checking extraction results
gap_kinds = (
    ("gap sentences", 'gap_sentences'),
    ("contextual gaps", 'contextual_gaps'),
    ("domain gaps", 'domain_gaps'),
)
# .str.len() is used here to get the number of items in each cell
items_per_paper = {column: df_gap_ready[column].str.len() for _, column in gap_kinds}

print("GAP EXTRACTION SUMMARY:")
print(f"Total papers: {len(df_gap_ready)}")
for label, column in gap_kinds:
    print(f"Papers with {label}: {(items_per_paper[column] > 0).sum()}")

# Average gaps per paper
for label, column in gap_kinds:
    print(f"Average {label} per paper: {items_per_paper[column].mean():.1f}")

GAP EXTRACTION SUMMARY:
Total papers: 80318
Papers with gap sentences: 63537
Papers with contextual gaps: 40220
Papers with domain gaps: 7651
Average gap sentences per paper: 2.8
Average contextual gaps per paper: 1.0
Average domain gaps per paper: 0.1


In [None]:
# Reviewing sample extractions for the first five papers
print("\nSAMPLE GAP EXTRACTIONS:")
for i in range(5):
    paper = df_gap_ready.iloc[i]
    print(f"\n--- Paper {i+1}: {paper['title'][:80]}... ---")
    print(f"Field: {paper['field']}")

    # (heading, column, how many entries to show); each entry is cut to 100 chars
    for heading, column, limit in (
        ("Gap Sentences:", 'gap_sentences', 3),
        ("Contextual Gaps:", 'contextual_gaps', 2),
    ):
        print(heading)
        for entry in paper[column][:limit]:
            print(f"  • {entry[:100]}...")

    print("Domain Gaps:")
    print(f"  • {paper['domain_gaps']}")
    print("-" * 100)


SAMPLE GAP EXTRACTIONS:

--- Paper 1: Generalizable Representation Learning for Mixture Domain Face Anti-Spoofing... ---
Field: Artificial Intelligence
Gap Sentences:
  • to overcome the limitation, we propose domain dynamic adjustment meta-learning (d$^2$am) without usi...
  • hence, we think it is important to simulate difficult and abundant domain shift scenarios for meta-l...
  • a drlm and mmdbased regularization are designed for better dynamic adjustment to simulate more diffi...
Contextual Gaps:
  • however_gap: in real-world applications, the collected dataset always contains mixture domains, wher...
Domain Gaps:
  • ['generalization', 'interpretability', 'robustness']
----------------------------------------------------------------------------------------------------

--- Paper 2: Coordinating Human and Agent Behavior in Collective-Risk Scenarios... ---
Field: Artificial Intelligence
Gap Sentences:
Contextual Gaps:
  • however_gap: only the ones that used it to predict future

In [None]:
# Checking gaps by research field
def _total_items(cells):
    """Sum the lengths of all cells in a group."""
    return sum(len(items) for items in cells)

gap_by_field = (
    df_gap_ready.groupby('field')
    .agg({
        'gap_sentences': _total_items,
        'contextual_gaps': _total_items,
        'domain_gaps': _total_items,
    })
    .sort_values('gap_sentences', ascending=False)
)

print("\nGAPS BY RESEARCH FIELD")
print(gap_by_field.head(10))


GAPS BY RESEARCH FIELD
                                    gap_sentences  contextual_gaps  \
field                                                                
Computer Vision                             46465            15990   
Computational Theory                        45513            12776   
Artificial Intelligence                     42882            13583   
Natural Language Processing                 33287            13412   
Human Computer Interaction                  17673             6010   
Computing in Biomedical Fields              11170             3802   
Computer Networks & Communications           9513             3387   
Software Engineering                         8108             3268   
Graphics and Computer-Aided Design           5430             2008   
Computer Hardware & Architecture             3426             1372   

                                    domain_gaps  
field                                            
Computer Vision                    

In [None]:
from collections import Counter

# Collecting all gap sentences into one flat list
all_gap_sentences = [
    sentence
    for sentences in df_gap_ready['gap_sentences']
    for sentence in sentences
]

# Collecting all contextual gaps into one flat list
all_contextual_gaps = [
    gap
    for gaps in df_gap_ready['contextual_gaps']
    for gap in gaps
]

print("\nMOST COMMON GAP PATTERNS")
print(f"Total gap sentences extracted: {len(all_gap_sentences)}")
print(f"Total contextual gaps extracted: {len(all_contextual_gaps)}")

# Showing the first ten extracted sentences (in dataset order, not ranked by
# frequency), each cut to 120 characters
print("\nSample gap sentences:")
for position, sentence in enumerate(all_gap_sentences[:10], start=1):
    print(f"{position}. {sentence[:120]}...")


MOST COMMON GAP PATTERNS
Total gap sentences extracted: 228191
Total contextual gaps extracted: 77439

Sample gap sentences:
1. to overcome the limitation, we propose domain dynamic adjustment meta-learning (d$^2$am) without using domain labels, wh...
2. hence, we think it is important to simulate difficult and abundant domain shift scenarios for meta-learning...
3. a drlm and mmdbased regularization are designed for better dynamic adjustment to simulate more difficult and abundant do...
4. traditional preclinical in vitro safety profiling and clinical safety trials are restricted in terms of small scale, lon...
5. ', 'addressing issues related to false-negative predictions...
6. furthermore, we analyze the positive and false negative predictions based on a sample drug and the results point to some...
7. temple gains sample efficiency by extracting similarities of the transition dynamics across tasks even when their underl...
8. perhaps the most difficult of these settings is that of 

In [None]:
# Quality metrics
n_gap_sentences = df_gap_ready['gap_sentences'].str.len()
n_contextual_gaps = df_gap_ready['contextual_gaps'].str.len()
n_domain_gaps = df_gap_ready['domain_gaps'].str.len()

# A paper counts as empty only when all three extractors found nothing.
empty_gaps = (n_gap_sentences == 0) & (n_contextual_gaps == 0) & (n_domain_gaps == 0)
has_any_gap = ~empty_gaps

print("\nQUALITY METRICS")
print(f"Papers with NO gaps extracted: {empty_gaps.sum()}")
print(f"Papers with gaps extracted: {has_any_gap.sum()}")
print(f"Success rate: {has_any_gap.mean()*100:.1f}%")

# Papers with rich gap information: both gap sentences and contextual gaps
rich_papers = (n_gap_sentences > 0) & (n_contextual_gaps > 0)
print(f"Papers with rich gap info: {rich_papers.sum()}")


QUALITY METRICS
Papers with NO gaps extracted: 11530
Papers with gaps extracted: 68788
Success rate: 85.6%
Papers with rich gap info: 35684


In [None]:
import json

# Saving the main dataset twice: CSV and pickle.
# NOTE(review): the list-valued gap columns presumably come back as plain
# strings when the CSV is re-read - prefer the pickle for reloading; confirm.
df_gap_ready.to_csv('research_gaps_dataset.csv', index=False)
df_gap_ready.to_pickle('research_gaps_dataset.pkl')

# Saving gap extractions summary for quick reference
gap_summary = dict(
    total_papers=len(df_gap_ready),
    total_gap_sentences=len(all_gap_sentences),
    total_contextual_gaps=len(all_contextual_gaps),
    success_rate=(~empty_gaps).mean(),
    top_fields=gap_by_field.head(5).to_dict(),
)

with open('gap_extraction_summary.json', 'w') as summary_file:
    json.dump(gap_summary, summary_file, indent=2)

for message in (
    "Data saved successfully!",
    "\nFiles created:",
    "\nresearch_gaps_dataset.csv (main dataset)",
    "\nresearch_gaps_dataset.pkl (faster loading)",
    "\ngap_extraction_summary.json (summary stats)",
):
    print(message)

Data saved successfully!

Files created:

research_gaps_dataset.csv (main dataset)

research_gaps_dataset.pkl (faster loading)

gap_extraction_summary.json (summary stats)
