In [1]:
# importing all packages
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time
from tqdm import tqdm, tqdm_pandas
import time
tqdm.pandas(mininterval = 3)
from pandas.api.types import infer_dtype

In [2]:
# SCIENCE is read explicitly as str because the abbreviation for computer science ("inf")
# would otherwise be interpreted as infinity (a float), leaving the column with a mixed dtype.
# The columns below are then converted to category dtype to reduce memory usage where that makes sense.

df_combined = pd.read_csv(
    r"E:\Organization_Project\files\Preprocessed_Combined_Meta_Dataset.csv",
    dtype={"SCIENCE": str},
)
for column_name in ("SCIENCE", "KEYWORD", "JOURNAL"):
    df_combined[column_name] = df_combined[column_name].astype("category")

In [3]:
# Preview the first rows of the loaded frame.
df_combined.head()

Unnamed: 0,DATE,AUTHOR,ID,JOURNAL,KEYWORD,SCIENCE,Abstract
0,2022-06-01,Li X,35655310.0,Journal of experimental & clinical cancer rese...,Biochemistry[Mesh Terms],che,background micrornas isomirs play important r...
1,2022-06-01,Kirk P,35655273.0,BMC biology,Biochemistry[Mesh Terms],che,background major route cell-to-cell signallin...
2,2022-01-01,Wong EL,35655449.0,Frontiers in public health,Biochemistry[Mesh Terms],che,background virtually invasive cervical cancer...
3,2022-11-01,Huang Y,35654504.0,Journal of environmental sciences (China),Biochemistry[Mesh Terms],che,nanoscale bismuth oxyiodide widely studied app...
4,2022-06-01,Tayal U,35654493.0,Journal of the American College of Cardiology,Biochemistry[Mesh Terms],che,background dilated cardiomyopathy final commo...


In [4]:
# (rows, columns) of the loaded frame.
df_combined.shape

(3237571, 7)

In [5]:
# Create the sentiment analyzer object, then replace its lexicon with the
# Edlinger dictionary defined below (word -> valence score), so the analyzer's
# lexicon contains exactly these entries and nothing else.
SIA = SentimentIntensityAnalyzer()

edlinger_dict = {
    "beneficial": 1.9,
    "benefit": 2.0,
    "benefits": 1.6,
    "better": 1.9,
    "effectively": 1.9,
    "efficient": 1.8,
    "excellent": 2.7,
    "greater": 1.5,
    "help": 1.7,
    "importance": 1.5,
    "important": 0.8,
    "improve": 1.9,
    "improved": 2.1,
    "improvement": 2.0,
    "improvements": 1.3,
    "improving": 1.8,
    "increase": 1.3,
    "increased": 1.1,
    "interest": 2.0,
    "novel": 1.3,
    "optimal": 1.5,
    "opportunity": 1.8,
    "promise": 1.3,
    "progress": 1.8,
    "significance": 1.1,
    "significantly": 0.8,
    "strong": 2.3,
    "strongly": 1.1,
    "successfully": 2.2,
    "supported": 1.3,
    "supporting": 1.9,
    "supports": 1.5,
    "useful": 1.9,
    "valuable": 2.1,
    "well": 1.1,
}

# Give the analyzer its own fresh dict holding only the entries above.
SIA.lexicon = dict(edlinger_dict)

In [6]:
# Number of entries in the analyzer's lexicon after it was replaced with edlinger_dict.
len(SIA.lexicon)

35

In [7]:
# Sentiment scoring of complete (unsplit) abstracts.

def whole_sentiment(df):
    """Add sentiment scores for the full text of each abstract to ``df``.

    Calls ``SIA.polarity_scores`` on every value of the ``Abstract`` column
    (with a tqdm progress bar) and stores the ``pos``, ``neg``, ``neu`` and
    ``compound`` entries of each result in the columns ``pos_whole``,
    ``neg_whole``, ``neu_whole`` and ``compound_whole``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Abstract`` column (presumably plain-text strings --
        confirm against the preprocessing step).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are added in place
        and no copy is made.
    """
    # Score each abstract once and keep the per-row result dicts locally.
    scores = df["Abstract"].progress_apply(lambda text: SIA.polarity_scores(text))

    # Unpack every dict entry into its own column (pos first, as before).
    for key in ("pos", "neg", "neu", "compound"):
        df[key + "_whole"] = scores.apply(lambda result, key=key: result[key])

    return df


In [10]:
# Score the full abstracts (whole_sentiment adds its columns to df_combined
# itself), then export the date, ID and the four whole-abstract scores as CSV
# to the path below.
df_combined_full_vader = whole_sentiment(df_combined)
df_combined_full_vader.to_csv(
    r"E:\Organization_Project\files\df_vader_whole_meta_combined.csv",
    columns=["DATE", "pos_whole", "neg_whole", "neu_whole", "compound_whole", "ID"],
    index=False,
)

100%|██████████| 3237571/3237571 [56:05<00:00, 961.95it/s] 


In [9]:
# Helpers for splitting each abstract into three parts and
# calculating sentiment scores for each third.
def split_string(s):
    """Cut ``s`` into three consecutive chunks of whitespace-separated words.

    With ``k = word_count // 3``, the first and second chunk hold ``k`` words
    each and the third chunk holds all remaining words, so it can be up to two
    words longer. A text with fewer than three words ends up entirely in the
    third chunk.

    Returns
    -------
    tuple of (str, str, str)
        The three chunks, each re-joined with single spaces.
    """
    words = s.split()
    third = len(words) // 3

    head = words[:third]
    middle = words[third:2 * third]
    tail = words[2 * third:]

    return " ".join(head), " ".join(middle), " ".join(tail)

def _score_text_column(df, text_col, suffix):
    """Score ``df[text_col]`` with ``SIA`` and add pos/neg/neu/compound columns ending in ``_<suffix>``."""
    scores = df[text_col].progress_apply(lambda text: SIA.polarity_scores(text))
    for key in ("pos", "neg", "neu", "compound"):
        df[key + "_" + suffix] = scores.apply(lambda result, key=key: result[key])


def split_sentiment(df):
    """Add per-third sentiment scores for every abstract to ``df``.

    Each value of the ``Abstract`` column is passed through ``split_string``
    and the three resulting chunks are stored in ``split_abstracts_1`` ..
    ``split_abstracts_3``. Every chunk column is then scored with
    ``SIA.polarity_scores`` (chunks that are empty strings are scored like any
    other), producing ``pos_i``, ``neg_i``, ``neu_i`` and ``compound_i`` for
    ``i`` in 1..3.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Abstract`` column of strings. A frame with zero rows
        is accepted and simply receives the (empty) new columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; all columns are added in place.
    """
    # Split every abstract once; each element is a 3-tuple of strings.
    chunks = df["Abstract"].progress_apply(split_string)

    # Assign chunk by chunk, positionally. Unpacking zip(*chunks) into three
    # targets would raise ValueError when the frame has no rows.
    for position in range(3):
        df["split_abstracts_" + str(position + 1)] = [parts[position] for parts in chunks]

    # Score the three chunk columns in order so the column layout is
    # pos/neg/neu/compound for third 1, then third 2, then third 3.
    for label in ("1", "2", "3"):
        _score_text_column(df, "split_abstracts_" + label, label)

    return df

In [11]:
# Score each third of the abstracts (split_sentiment adds its columns to
# df_combined itself), then export the date, ID and the twelve per-third
# scores as CSV to the path below.
df_combined_full_vader = split_sentiment(df_combined)
df_combined_full_vader.to_csv(
    r"E:\Organization_Project\files\df_vader_thirds_meta_combined.csv",
    columns=[
        "DATE",
        "pos_1", "pos_2", "pos_3",
        "neg_1", "neg_2", "neg_3",
        "neu_1", "neu_2", "neu_3",
        "compound_1", "compound_2", "compound_3",
        "ID",
    ],
    index=False,
)

100%|██████████| 3237571/3237571 [00:52<00:00, 61160.67it/s]
100%|██████████| 3237571/3237571 [22:23<00:00, 2409.31it/s]
100%|██████████| 3237571/3237571 [21:28<00:00, 2512.56it/s]
100%|██████████| 3237571/3237571 [20:57<00:00, 2575.52it/s]
