In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
from flair.models import TextClassifier
from flair.data import Sentence

[nltk_data] Downloading package punkt to /home/tourist800/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tourist800/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/tourist800/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/tourist800/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [2]:
def create_df(folder_path):
    """Collect sentence-like fragments from every regular file in ``folder_path``.

    Each file is read as UTF-8 (undecodable bytes are dropped), split on the
    ``.`` character, and every fragment that yields more than two
    space-separated pieces is kept.

    Note: splitting on ``.`` is naive -- periods inside URLs, abbreviations
    and decimal numbers also end a fragment.

    Parameters
    ----------
    folder_path : str
        Directory to scan. Sub-directories are ignored (not recursed into).

    Returns
    -------
    list[list[str]]
        ``[fragment, paper_name]`` rows, where ``paper_name`` is the file name
        with every ``'.txt'`` occurrence removed. Rows are grouped by file in
        sorted file-name order, and keep reading order within a file.
    """
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs produce the same row order.
    files = sorted(f for f in listdir(folder_path) if isfile(join(folder_path, f)))

    df_list = []

    for file in files:
        # Build the path with join() instead of a hard-coded "/" separator.
        with open(join(folder_path, file), encoding="utf8", errors='ignore') as f:
            file_read = f.read()

        # The file is already closed here; only the in-memory text is processed.
        paper_name = file.replace('.txt', '')
        for fragment in file_read.split("."):
            remove_newline = fragment.replace('\n', ' ').replace('\r', '')
            # Skip fragments that are too short to be a sentence.
            if len(remove_newline.split(" ")) > 2:
                df_list.append([remove_newline, paper_name])

    return df_list

In [3]:
# Folder holding the input text files, given relative to the current working directory
folder_path = 'Dataset'

In [4]:
# Read every file in folder_path and collect [sentence, paper_name] rows
pd_list = create_df(folder_path)

In [5]:
# Number of sentence rows collected across all files
len(pd_list)

564

In [6]:
# One row per extracted sentence, labelled with the name of the file it came from
df = pd.DataFrame(pd_list, columns = ['Sentence', 'Paper_name'])

In [7]:
# Display the sentence table
df

Unnamed: 0,Sentence,Paper_name
0,Bard Senior Projects Fall 2020 Bard Coll...,9
1,edu Follow this and additional w...,9
2,edu/senproj_f2020 Part of the Psychology Comm...,9
3,"Recommended Citation Bossard, Grant Sean, ""...",9
4,Senior Projects Fall 2020,9
...,...,...
559,"Specifically, the research wants to see if us...",9
560,and reduces self-esteem,9
561,"Also, the study slants to see if not using an...",9
562,and increase self- esteem,9


In [8]:
# Number of extracted sentences per source paper
df['Paper_name'].value_counts()

9    564
Name: Paper_name, dtype: int64

In [9]:
# Persist the raw sentence table; the DataFrame index is written as the first, unnamed CSV column
df.to_csv('research_paper_sentiment_analysis.csv')

## Data preprocessing

In [10]:
def clean(text):
    """Replace every run of non-letter characters in ``text`` with one space.

    Only ASCII letters (A-Z, a-z) survive; digits, punctuation, whitespace and
    non-ASCII characters are all collapsed into single spaces.
    """
    non_letters = '[^A-Za-z]+'
    return re.sub(non_letters, ' ', text)

In [11]:
# Maps the first letter of a POS tag (as returned by pos_tag) to the matching
# WordNet POS constant; tags starting with any other letter have no entry.
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

def token_stop_pos(text):
    """Tokenize ``text``, drop English stopwords and attach a WordNet POS to each word.

    Parameters
    ----------
    text : str
        Sentence to process.

    Returns
    -------
    list[tuple]
        ``(word, pos)`` pairs in token order. ``pos`` is looked up in
        ``pos_dict`` by the first letter of the token's tag and is ``None``
        when that letter has no entry. Words keep their original casing;
        only the stopword comparison is case-insensitive.
    """
    # Build the stopword set once per call rather than once per token: the
    # set does not change while a sentence is being processed.
    stop_words = set(stopwords.words('english'))
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(text))
        if word.lower() not in stop_words
    ]

In [12]:
def lemmatize(pos_data):
    """Join the words of ``pos_data`` into one string, lemmatizing tagged words.

    ``pos_data`` is an iterable of ``(word, pos)`` pairs. A word whose ``pos``
    is falsy is copied unchanged; otherwise it is passed through
    ``wordnet_lemmatizer.lemmatize`` with that ``pos``. The result starts with
    a single space and every word is preceded by one more space, so a
    non-empty input yields a string with two leading spaces.
    """
    pieces = [" "]
    for token, wn_pos in pos_data:
        resolved = wordnet_lemmatizer.lemmatize(token, pos=wn_pos) if wn_pos else token
        pieces.append(" " + resolved)
    return "".join(pieces)

In [13]:
# Keep letters only: every run of other characters becomes a single space
df['Sentence_clean'] = df['Sentence'].apply(clean)

In [14]:
# Tokenize, drop stopwords and pair each remaining word with a WordNet POS
df['POS_tagged'] = df['Sentence_clean'].apply(token_stop_pos)

In [15]:
# Rebuild each sentence as a string of lemmatized words
df['Lemma'] = df['POS_tagged'].apply(lemmatize)

In [16]:
# Preview the first three rows, including the new preprocessing columns
df.head(3)

Unnamed: 0,Sentence,Paper_name,Sentence_clean,POS_tagged,Lemma
0,Bard Senior Projects Fall 2020 Bard Coll...,9,Bard Senior Projects Fall Bard College Bard D...,"[(Bard, n), (Senior, n), (Projects, n), (Fall,...",Bard Senior Projects Fall Bard College Bard ...
1,edu Follow this and additional w...,9,edu Follow this and additional works at https ...,"[(edu, r), (Follow, n), (additional, a), (work...",edu Follow additional work http digitalcommons
2,edu/senproj_f2020 Part of the Psychology Comm...,9,edu senproj f Part of the Psychology Commons o...,"[(edu, n), (senproj, n), (f, a), (Part, n), (P...",edu senproj f Part Psychology Commons work l...


In [17]:
# Independent working table for the sentiment scores: original sentence,
# source paper and lemmatized text
fin_data = df[['Sentence', 'Paper_name', 'Lemma']].copy()

In [18]:
# Single module-level VADER analyzer, used by vadersentimentanalysis for every sentence
analyzer = SentimentIntensityAnalyzer()

In [19]:
def vadersentimentanalysis(review):
    """Return the ``'compound'`` entry of the polarity scores for ``review``.

    The scores come from the module-level ``analyzer``.
    """
    return analyzer.polarity_scores(review)['compound']

In [20]:
# Map a VADER compound score to a sentiment label
def vader_analysis(compound, threshold=0.5):
    """Classify a compound sentiment score as Positive, Negative or Neutral.

    Parameters
    ----------
    compound : float
        Compound polarity score to classify.
    threshold : float, optional
        Non-negative cut-off. Scores at or above ``threshold`` are positive,
        scores at or below ``-threshold`` are negative. Defaults to ``0.5``,
        the value this notebook has always used. VADER's own documentation
        cites ``0.05`` as the typical cut-off, so pass ``threshold=0.05`` to
        follow that convention.

    Returns
    -------
    str
        ``'Positive'``, ``'Negative'`` or ``'Neutral'``.
    """
    if compound >= threshold:
        return 'Positive'
    if compound <= -threshold:
        return 'Negative'
    return 'Neutral'

In [21]:
# VADER compound score of each lemmatized sentence
fin_data['Vader_Sentiment'] = fin_data['Lemma'].apply(vadersentimentanalysis)

In [22]:
# Bucket each compound score into Positive / Negative / Neutral
fin_data['Vader_Analysis'] = fin_data['Vader_Sentiment'].apply(vader_analysis)

In [23]:
# Display the scored sentence table
fin_data

Unnamed: 0,Sentence,Paper_name,Lemma,Vader_Sentiment,Vader_Analysis
0,Bard Senior Projects Fall 2020 Bard Coll...,9,Bard Senior Projects Fall Bard College Bard ...,0.3612,Neutral
1,edu Follow this and additional w...,9,edu Follow additional work http digitalcommons,0.0000,Neutral
2,edu/senproj_f2020 Part of the Psychology Comm...,9,edu senproj f Part Psychology Commons work l...,0.4404,Neutral
3,"Recommended Citation Bossard, Grant Sean, ""...",9,Recommended Citation Bossard Grant Sean Effe...,0.5106,Positive
4,Senior Projects Fall 2020,9,Senior Projects Fall,0.0000,Neutral
...,...,...,...,...,...
559,"Specifically, the research wants to see if us...",9,Specifically research want see usinp social ...,-0.5267,Negative
560,and reduces self-esteem,9,reduces self esteem,0.0000,Neutral
561,"Also, the study slants to see if not using an...",9,Also study slant see use social medium platf...,-0.5267,Negative
562,and increase self- esteem,9,increase self esteem,0.3182,Neutral


In [24]:
# Number of sentences per sentiment label
vader_counts = fin_data['Vader_Analysis'].value_counts()
vader_counts

Neutral     429
Negative     70
Positive     65
Name: Vader_Analysis, dtype: int64

In [25]:
# Label counts again, this time as a table with the counts in a named column
(
    fin_data['Vader_Analysis']
    .value_counts()
    .reset_index(name='Sum of Vader_Analysis')
)

Unnamed: 0,index,Sum of Vader_Analysis
0,Neutral,429
1,Negative,70
2,Positive,65
