# Named Entity Recognition (NER) on Financial News

## Imports

In [2]:
from nltk.tokenize import word_tokenize
import os
from nltk import *
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tag.stanford import StanfordNERTagger


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stefa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing dataset

In [48]:
# Read the headerless CSV, label its two columns, and keep only the first
# ten rows so the (slow) NER tagging further down stays quick.
df = pd.read_csv(
    'all-data.csv',
    names=['sentiment', 'headline'],
    encoding="latin-1",
).head(10)
df.head()

Unnamed: 0,sentiment,headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [49]:
# Report the frame's size after each cleaning step.
print("original shape: ", df.shape)
df = df.drop_duplicates()
print("after drop duplicates shape: ", df.shape)
# Drop rows with missing values and keep the result in `df`, so that the
# shape printed below and every later cell really work on null-free data.
dd_dn = df.dropna()
df = dd_dn  # `dd_dn` stays bound in case other cells refer to it
print("after drop null shape: ", df.shape)

original shape:  (10, 2)
after drop duplicates shape:  (10, 2)
after drop null shape:  (10, 2)


In [50]:
# Count the missing values in each column of `df`.
df.isnull().sum()

sentiment    0
headline     0
dtype: int64

## Stanford NER

In [51]:
# Java executable used to run Stanford NER, published through the JAVAHOME
# environment variable. A JAVAHOME value already present in the environment
# is kept; otherwise the original local default is used.
java_path = os.environ.setdefault(
    'JAVAHOME', "C:/Program Files/Java/jre-1.8/bin/java.exe"
)

# Root folder of the Stanford NER distribution. Set STANFORD_NER_DIR to run
# the notebook on another machine without editing this cell; when it is not
# set, the paths below are exactly the original hard-coded ones.
stanford_ner_dir = os.environ.get(
    'STANFORD_NER_DIR',
    "C:/Users/stefa/Desktop/financial-news-analysis/stanford-ner",
).rstrip("/\\")

model = stanford_ner_dir + "/classifiers/english.muc.7class.distsim.crf.ser.gz"
jar = stanford_ner_dir + "/stanford-ner-4.2.0.jar"

ner_tagger = StanfordNERTagger(model, jar, encoding="utf-8")

# Smoke test: tag one sample headline and display the (token, tag) pairs.
testo = "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing"

words = word_tokenize(testo)
classified_words = ner_tagger.tag(words)
classified_words

[('According', 'O'),
 ('to', 'O'),
 ('Gran', 'ORGANIZATION'),
 (',', 'O'),
 ('the', 'O'),
 ('company', 'O'),
 ('has', 'O'),
 ('no', 'O'),
 ('plans', 'O'),
 ('to', 'O'),
 ('move', 'O'),
 ('all', 'O'),
 ('production', 'O'),
 ('to', 'O'),
 ('Russia', 'LOCATION'),
 (',', 'O'),
 ('although', 'O'),
 ('that', 'O'),
 ('is', 'O'),
 ('where', 'O'),
 ('the', 'O'),
 ('company', 'O'),
 ('is', 'O'),
 ('growing', 'O')]

In [53]:
from itertools import groupby
def extract_entities(text):
    """Return the named entities found in ``text`` as ``"<words>-<TAG>"`` strings.

    The text is tokenized with ``word_tokenize`` and tagged with the
    module-level ``ner_tagger``. Consecutive tokens that share the same
    non-"O" tag are joined with spaces into one entity, and the tag is
    appended after a hyphen (e.g. ``"Russia-LOCATION"``).

    Note that two distinct entities of the same type that sit directly next
    to each other are merged into a single entry, because only the tag is
    used to delimit runs.

    Parameters
    ----------
    text : str
        The sentence to analyse. Non-string values (``None``, ``NaN``) and
        blank strings are treated as "no text".

    Returns
    -------
    list of str
        One entry per entity, in order of appearance; empty when nothing is
        recognised or when there is no text to analyse.
    """
    # Missing or blank headlines have nothing to tag; returning early also
    # avoids calling the tokenizer/tagger on input they cannot handle.
    if not isinstance(text, str) or not text.strip():
        return []

    tagged_tokens = ner_tagger.tag(word_tokenize(text))

    # groupby yields one run per stretch of consecutive tokens with the same
    # tag, which is what glues multi-word entities together.
    return [
        ' '.join(word for word, _ in run) + "-" + tag
        for tag, run in groupby(tagged_tokens, key=lambda pair: pair[1])
        if tag != "O"
    ]


# Run the NER extraction on every headline and store each row's list of
# "<words>-<TAG>" strings in a new 'entities' column.
df['entities'] = df['headline'].map(extract_entities)

In [55]:
# Display the first ten rows, including the new 'entities' column.
df.head(10)

Unnamed: 0,sentiment,headline,entities
0,neutral,"According to Gran , the company has no plans t...","[Gran-ORGANIZATION, Russia-LOCATION]"
1,neutral,Technopolis plans to develop in stages an area...,[]
2,negative,The international electronic industry company ...,"[Elcoteq-ORGANIZATION, Tallinn-LOCATION]"
3,positive,With the new production plant the company woul...,[]
4,positive,According to the company 's updated strategy f...,"[20 % -40 %-PERCENT, 10 % -20 %-PERCENT]"
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,[]
6,positive,"For the last quarter of 2010 , Componenta 's n...","[last quarter of 2010-DATE, Componenta-ORGANIZ..."
7,positive,"In the third quarter of 2010 , net sales incre...","[third quarter of 2010-DATE, 5.2 %-PERCENT, 34..."
8,positive,Operating profit rose to EUR 13.1 mn from EUR ...,"[EUR-ORGANIZATION, EUR-ORGANIZATION, 2007-DATE..."
9,positive,"Operating profit totalled EUR 21.1 mn , up fro...","[EUR-ORGANIZATION, 2007-DATE, 9.7 %-PERCENT]"
