# **NRC Emotion Lexicon**

In [2]:
import pandas as pd

**Pre-process**

In [3]:
full_df = pd.read_csv('cricket_dataset_short.csv')
full_df.head()

# Lower Casing --> creating new column called text_lower
full_df['Headlines_New']  = full_df['tags'].str.lower()
full_df['Headlines_New'].head()

full_df['News_New']  = full_df['description'].str.lower()
full_df['News_New'].head()

#removing punctuation, creating a new column called 'text_punct]'
full_df['Headlines_New'] = full_df['Headlines_New'].str.replace('[^\w\s]','')
full_df['Headlines_New'].head()
full_df['News_New'] = full_df['News_New'].str.replace('[^\w\s]','')
full_df['News_New'].head()

#Importing stopwords from nltk library
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
# Function to remove the stopwords
def stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
# Applying the stopwords to 'text_punct' and store into 'text_stop'
full_df["Headlines_StopWords"] = full_df["Headlines_New"].apply(stopwords)
full_df["Headlines_StopWords"].head()
full_df["News_StopWords"] = full_df["News_New"].apply(stopwords)
full_df["News_StopWords"].head()

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])


full_df["Headlines_lemmatized"] = full_df["Headlines_StopWords"].apply(lambda text: lemmatize_words(text))
full_df["Headlines_lemmatized"].head()
full_df["News_lemmatized"] = full_df["News_StopWords"].apply(lambda text: lemmatize_words(text))
full_df["News_lemmatized"].head()

print(full_df["Headlines_lemmatized"])
print(full_df["News_lemmatized"])

df1 = full_df[["Headlines_lemmatized"]]
df1["News_head"] = df1["Headlines_lemmatized"].astype(str)

df2 = full_df[["News_lemmatized"]]
df2["News"] = df2["News_lemmatized"].astype(str)


df2["News_Combine"] = df1["News_head"] + df2["News"] 


sentences=df2[['News_Combine']]
sentences = sentences[:273]
print("Print after preprocessing")
print("____________________________________________")
print(sentences)
print(sentences.size)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
0      sport mumbai rahane ball bat chase third man l...
1      sport length delivery delivery leg short wrist...
2      sport shreyas gopal drifter cricket india raha...
3      sport cut pietersen shot leg foot cut shot pad...
4      sport foot cut ravi shaz kp pitch short front ...
                             ...                        
196    sport cricket india cricket karn sharma ball s...
197    sport eoin morgan ashish reddy b watson ashish...
198    sport eoin morgan ashish reddy cricket indian ...
199    sport eoin morgan ashish reddy t20 skipper ind...
200    sport eoin morgan ashish reddy cricket india

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
df=pd.DataFrame()
df['full_article'] = sentences['News_Combine']
print(df.columns)
df['full_article'] = df['full_article'].str.replace('\d+', '')
print(df['full_article'])

Index(['full_article'], dtype='object')
0      sport mumbai rahane ball bat chase third man l...
1      sport length delivery delivery leg short wrist...
2      sport shreyas gopal drifter cricket india raha...
3      sport cut pietersen shot leg foot cut shot pad...
4      sport foot cut ravi shaz kp pitch short front ...
                             ...                        
196    sport cricket india cricket karn sharma ball s...
197    sport eoin morgan ashish reddy b watson ashish...
198    sport eoin morgan ashish reddy cricket indian ...
199    sport eoin morgan ashish reddy t skipper india...
200    sport eoin morgan ashish reddy cricket indian ...
Name: full_article, Length: 201, dtype: object


Load

In [5]:
filepath = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
emolex_df = pd.read_csv(filepath,  names=["word", "emotion", "association"], skiprows=45, sep='\t', keep_default_na=False)
emolex_df = emolex_df.pivot(index='word', columns='emotion', values='association').reset_index()
emolex_df = emolex_df.fillna(0)
emolex_df.head(12)

emotion,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,abandonment,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,abate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,abatement,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,abba,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,abbot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,abbreviate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,abbreviation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,abdomen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,abdominal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,abduction,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
matrix = vec.fit_transform(df['full_article'].values.astype('U'))
vocab = vec.get_feature_names()
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)
wordcount_df.head()

Unnamed: 0,aamir,aaron,abdomen,able,absolute,absolutely,acknowledge,across,action,actually,acutely,adapt,add,adjudge,adjudged,advance,afford,africa,african,africana,afrikaner,againp,agarwal,agarwals,age,aggressive,agoishant,aim,aint,air,airaxar,airc,airchawla,airfaulkner,airpardeep,ajay,ajinkya,al,allow,allowp,...,wicketstoinis,wide,wideaxar,widec,widejohnson,wides,widevinay,width,william,willow,win,wisden,without,wonder,wont,wood,work,workrp,world,worse,would,wrist,wrists,wristy,wrong,wrongfooted,xi,yadav,yank,yard,year,yes,yet,yorker,younis,zaheer,zaheers,zero,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
emolex_df.word.head(3)

0    abandonment
1          abate
2      abatement
Name: word, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# I only want you to look for words in the emotional lexicon
# because we don't know what's up with the other words
vec = TfidfVectorizer(vocabulary=emolex_df.word,
                      use_idf=False, 
                      norm='l1') # ELL - ONE
matrix = vec.fit_transform(df.full_article.values.astype('U'))
vocab = vec.get_feature_names()
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)
wordcount_df.head()

Unnamed: 0,abandonment,abate,abatement,abba,abbot,abbreviate,abbreviation,abdomen,abdominal,abduction,aberrant,aberration,abeyance,abhor,abhorrent,abide,ability,abject,ablation,ablaze,abnormal,aboard,abode,abolish,abolition,abominable,abomination,aboriginal,abort,abortion,abortive,abound,abovementioned,abrasion,abroad,abrogate,abrupt,abruptly,abscess,absence,...,yearly,yearn,yearning,years,yeast,yell,yellow,yellows,yelp,yeoman,yesterday,yesteryear,yew,yield,yielding,yogi,yoke,yolk,yon,yonder,young,younger,youth,zany,zap,zeal,zealot,zealous,zebra,zenith,zephyr,zeppelin,zest,zip,zodiac,zone,zoo,zoological,zoology,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#df['negative'] = wordcount_df[['attack', 'force', 'threat', 'assault']].sum(axis=1)
#df.head(5)

# **Unique Emotion List**
'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust'

---
---

# ***Joy*** in custom dataset

In [10]:
# Get your list of angry words
joy = emolex_df[emolex_df.joy == 1]['word']
joy.head()

45       absolution
57        abundance
58         abundant
81         accolade
84    accompaniment
Name: word, dtype: object

In [11]:
df['joy'] = wordcount_df[joy].sum(axis=1)
df.sort_values(by='joy',ascending=False).head(10)

Unnamed: 0,full_article,joy
131,sport pardeep sahu sahu cricket india jadeja c...,0.130435
5,sport sweeper bat short punch pad pull push fu...,0.108108
171,sport maxwell cricket bowling dwayne bravo dis...,0.102564
200,sport eoin morgan ashish reddy cricket indian ...,0.086957
40,sport cricket harbhajan singh harbhajan punch ...,0.085714
13,sweeper plessis surname extra length delivery ...,0.083333
149,sport fielder sport team third umpire dismissa...,0.083333
115,sport russell cricket batting nair dismissal r...,0.083333
187,sport vijay fielder cricket indian film murali...,0.08
0,sport mumbai rahane ball bat chase third man l...,0.08


# ***Positive*** in custom dataset

In [12]:
# Get your list of angry words
positive = emolex_df[emolex_df.positive == 1]['word']
positive.head()

3               abba
16           ability
32    abovementioned
44          absolute
45        absolution
Name: word, dtype: object

In [13]:
df['positive'] = wordcount_df[positive].sum(axis=1)
df.sort_values(by='positive',ascending=False).head(10)

Unnamed: 0,full_article,joy,positive
175,sport dwayne bravo sweeper indian premier leag...,0.0,0.34375
108,sport forward negi length delivery delivery st...,0.052632,0.342105
13,sweeper plessis surname extra length delivery ...,0.083333,0.333333
167,sport saha delivery spell square fifth whip go...,0.04,0.32
99,sport morris b chawla invincibles morris googl...,0.045455,0.318182
14,cricket plessis surnames extra sport length de...,0.057143,0.314286
123,sport ipl dwayne bravo karthik backyardmohit s...,0.068966,0.310345
197,sport eoin morgan ashish reddy b watson ashish...,0.0,0.307692
199,sport eoin morgan ashish reddy t skipper india...,0.0,0.304348
5,sport sweeper bat short punch pad pull push fu...,0.108108,0.297297
