In [41]:
import pandas as pd
import re

# Read the author/paper table from disk (adjust the path as needed).
data = pd.read_csv(filepath_or_buffer='author_papers_2022_2024_part_1.csv')

# Report the column index that was loaded before any column is relied upon.
print("Columns in the dataset:", data.columns)

# Define keywords and weights


# Draft single-term table (originally tagged "nouse"): every term serves as its
# own only context phrase and has no exception phrases. The `keywords`
# assignment that follows this one rebinds the name.
keywords = {
    term: {'weight': weight, 'contexts': [term], 'exceptions': []}
    for term, weight in (
        ('bioremediation', 3),
        ('battery', 3),
        ('energy', 3),
        ('sustainable', 3),
        ('climate', 3),
        ('pollutant', 2),
        ('material', 2),
        ('carbon', 3),
        ('ecosystem', 2),
        ('aquaculture', 2),
        ('pollution', 2),
        ('circular', 2),
        ('health', 2),
        ('risk', 3),
        ('urban', 2),
        ('biodiversity', 3),
    )
}

# Phrase-based scoring table. Each row is (topic, weight, context phrases);
# every topic is stored as {'weight', 'contexts', 'exceptions'} with an empty
# exceptions list.
keywords = {
    topic: {'weight': weight, 'contexts': list(phrases), 'exceptions': []}
    for topic, weight, phrases in (
        ('bioremediation', 3, ('forests', 'bioremediation', 'removal of environmental pollutants')),
        ('battery', 3, ('battery applications', 'alternative fuel vehicles', 'renewable energy storage')),
        ('biosynthesis', 2, ('biosynthesis', 'longevity vitamin', 'cancer proliferation inhibition')),
        ('green materials', 3, ('green materials', 'sustainable materials development')),
        ('climate change', 3, ('climate change', 'climate adaptation', 'climate risk')),
        ('social movements', 2, ('social movements', 'influence of social movements', '#Metoo Movement')),
        ('sustainable development', 3, ('sustainable development', 'inclusive international business', 'green recovery')),
        ('zoonotic pandemics', 3, ('prevention of zoonotic pandemics', 'zoonotic disease prevention')),
        ('land use', 2, ('land use changes', 'land cover changes', 'sustainable land management')),
        ('aquaculture', 2, ('cage aquaculture', 'sustainable aquaculture', 'impact on ecosystems')),
        ('renewable energy', 3, ('renewable energy', 'solar energy', 'wind energy', 'alternative energy storage')),
        ('carbon emissions', 3, ('carbon emissions', 'carbon footprint', 'carbon sequestration')),
        ('urban green space', 2, ('urban green space', 'green infrastructure', 'impact on urban temperature')),
        ('biodiversity', 3, ('biodiversity', 'ecological health', 'species conservation')),
        ('pollution', 3, ('air pollution', 'water pollution', 'pollution reduction')),
        ('waste reduction', 2, ('waste reduction', 'circular economy', 'reuse and recycling')),
        ('health impacts', 2, ('health impacts', 'public health', 'health and climate co-benefits')),
        ('migration', 2, ('rural-urban migration', 'migration and sustainability')),
        ('social equity', 3, ('social equity', 'inclusive growth', 'equitable access')),
        ('sustainable agriculture', 3, ('sustainable agriculture', 'food security', 'agricultural productivity')),
        ('renewable infrastructure', 3, ('Green', 'low-carbon infrastructure', 'renewable infrastructure', 'green buildings')),
    )
}

# Classification function
def classify_paper(title, abstract, keyword_table=None):
    """Label a paper by how strongly its title and abstract match a keyword table.

    Every table entry is scored once against the title and once against the
    abstract. An entry scores when the field contains at least one of its
    context phrases and none of its exception phrases (case-insensitive,
    whole-phrase match). A title hit is worth twice the entry's weight, an
    abstract hit is worth the weight itself.

    Parameters
    ----------
    title, abstract
        Text fields to scan. Any non-string value (for example the float NaN
        pandas uses for a missing cell) contributes a score of 0.
    keyword_table : dict, optional
        Mapping of topic name -> {'weight': number, 'contexts': [phrases],
        'exceptions': [phrases]}. When omitted, the module-level ``keywords``
        table is used, which is what existing two-argument callers rely on.

    Returns
    -------
    str
        "focus on the sustainability problem" for a total score of 10 or more,
        "might mention sustainability but not the main focus" for a score from
        1 up to (not including) 10, otherwise "not sustainability-related".
    """
    if keyword_table is None:
        keyword_table = keywords

    def contains_phrase(text, phrase):
        # Lookarounds rather than \b: \b only matches next to a word character,
        # so a phrase that begins or ends with punctuation (such as
        # '#Metoo Movement') could never match when preceded by whitespace.
        # For phrases with word characters at both ends this is equivalent.
        pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
        return re.search(pattern, text, re.IGNORECASE) is not None

    def keyword_score(text, weight, contexts, exceptions):
        # Missing values arrive as non-strings; they cannot match anything.
        if not isinstance(text, str):
            return 0
        if not any(contains_phrase(text, phrase) for phrase in contexts):
            return 0
        # Any exception phrase in the same field cancels the entry entirely.
        if any(contains_phrase(text, phrase) for phrase in exceptions):
            return 0
        return weight

    score = 0
    for info in keyword_table.values():
        contexts = info['contexts']
        exceptions = info.get('exceptions', ())
        # Title matches count double relative to abstract matches.
        score += keyword_score(title, info['weight'] * 2, contexts, exceptions)
        score += keyword_score(abstract, info['weight'], contexts, exceptions)

    # Classify based on score
    if score >= 10:
        return "focus on the sustainability problem"
    elif 1 <= score < 10:
        return "might mention sustainability but not the main focus"
    else:
        return "not sustainability-related"

# Normalise the CSV headers to the lowercase names used below.
data.rename(
    columns={'Title': 'title', 'Abstract': 'abstract'},
    inplace=True,
)

# Bail out with a message when either text column is absent; otherwise label
# every row and write the labelled table to disk.
if not ('title' in data.columns and 'abstract' in data.columns):
    print("Error: The dataset does not contain the required 'title' and 'abstract' columns.")
else:
    data['classification'] = data.apply(
        lambda row: classify_paper(row['title'], row['abstract']),
        axis=1,
    )
    data.to_csv('classified_papers.csv', index=False)
    print("Classification complete. Results saved to 'classified_papers.csv'")


Columns in the dataset: Index(['Author', 'Title', 'Abstract', 'Year'], dtype='object')
Classification complete. Results saved to 'classified_papers.csv'


In [28]:
# Number of rows in `data` per classification label.
data.value_counts(subset='classification')

classification
not sustainability-related                             239
might mention sustainability but not the main focus     85
focus on the sustainability problem                     13
dtype: int64

In [29]:
# Hand-written sample papers as a list of dicts. The CSV load below rebinds
# `test_data`, so this list is not what the following cells operate on.
test_data = [
    {'Author': author, 'title': paper_title, 'abstract': paper_abstract}
    for author, paper_title, paper_abstract in (
        ('Alice Smith', 'sustainability Sustainable Forestry Practices', 'This paper discusses the impact of sustainable forestry on biodiversity and climate change.'),
        ('Bob Johnson', 'Mathematical Inequalities in Modern Analysis', 'The paper presents new approaches to solving mathematical inequalities.'),
        ('Carol Lee', 'Gender Equality and Economic Growth', 'This study explores the relationship between gender equality and economic growth, highlighting potential benefits.'),
        ('David Brown', 'Professional Growth Strategies', 'The paper provides insights into professional growth and career development.'),
        ('Eve White', 'Renewable Energy and Carbon Footprint Reduction', 'An analysis of how renewable energy sources contribute to reducing the carbon footprint and mitigating climate change.'),
    )
]

# Load the research list and give its text columns the names used by the
# classification call.
test_data = pd.read_csv('research_list_last.csv')
test_data.rename(
    columns={'Research Title': 'title', 'Abstract': 'abstract'},
    inplace=True,
)

In [42]:
# Label each row of `test_data` from its title and abstract.
test_data['classification'] = test_data.apply(
    lambda row: classify_paper(row['title'], row['abstract']),
    axis=1,
)

In [43]:
# Number of rows in `test_data` per classification label.
test_data.value_counts(subset='classification')

classification
might mention sustainability but not the main focus    2
focus on the sustainability problem                    1
not sustainability-related                             1
dtype: int64

In [37]:
# Remove, in place, every row that lacks an abstract or a title.
# NOTE(review): this cell's execution count (37) is lower than that of the
# classification cell above it (42) - confirm the intended run order.
test_data.dropna(subset=['abstract', 'title'], how='any', inplace=True)