# Named Entity Recognition

In [1]:
import pandas as pd
import nltk

# note: you can also use any of the other classical ML models
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [2]:
# Toy sentence with one gold entity label and one POS tag per token.
tokens = ['Barack', 'Obama', 'visited', 'the', 'Eiffel', 'Tower']
labels = ['PER', 'PER', 'O', 'O', 'LOC', 'LOC']
pos_tags = ['NNP', 'NNP', 'VBD', 'DT', 'NNP', 'NNP']

# Neighbouring words, shifted by one position; 'O' pads the sentence boundaries.
prev_words = ['O'] + tokens[:-1]
next_words = tokens[1:] + ['O']

# One feature dict per token: the word itself, its POS tag and its left/right context.
features = [
    {'word': word, 'pos': pos, 'prev_word': before, 'next_word': after}
    for word, pos, before, after in zip(tokens, pos_tags, prev_words, next_words)
]

pd.DataFrame(features)

Unnamed: 0,word,pos,prev_word,next_word
0,Barack,NNP,O,Obama
1,Obama,NNP,Barack,visited
2,visited,VBD,Obama,the
3,the,DT,visited,Eiffel
4,Eiffel,NNP,the,Tower
5,Tower,NNP,Eiffel,O


In [3]:
# Convert the feature dicts into sparse vectors. DictVectorizer one-hot encodes
# string-valued features, so every (feature name, value) pair gets its own column.
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(features)

# Fit a multinomial Naive Bayes tagger on the per-token vectors.
nb_classifier = MultinomialNB()
nb_classifier.fit(X, labels)

# Sanity check only: this sample is identical to the last training row ('Tower'),
# so the prediction below says nothing about generalisation.
val = {'word': 'Tower', 'pos': 'NNP', 'prev_word': 'Eiffel', 'next_word': 'O'}
# transform() takes a collection of samples, so wrap the single dict in a list;
# keep the raw dict in `val` and give the encoded matrix its own name.
val_vec = vectorizer.transform([val])
nb_classifier.predict(val_vec)

array(['LOC'], dtype='<U3')

Other statistical approaches to NER include:

1. [Hidden Markov Models](https://www.kaggle.com/code/annsanababy/hidden-markov-model-hmm-on-ner-dataset)


2. [Conditional Random Fields](https://medium.com/data-science-in-your-pocket/named-entity-recognition-ner-using-conditional-random-fields-in-nlp-3660df22e95c)


# Text Classification

Text classification follows the same approach as sentiment analysis.

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Location of the BBC news CSV (the path looks like a Colab Google Drive mount).
BBC_TEXT_CSV = '/content/drive/MyDrive/DSN/data/bbc-text.csv'

# Load the dataset and preview the first rows.
data = pd.read_csv(BBC_TEXT_CSV)
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


### Data cleaning

In [15]:
# Distinct target labels, in order of first appearance.
data['category'].drop_duplicates().tolist()

['tech', 'business', 'sport', 'entertainment', 'politics']

In [16]:
stemmer = PorterStemmer()
# A set gives O(1) membership tests instead of scanning the stopword list per token.
words = set(stopwords.words("english"))


def clean_text(text):
    """Normalise one raw document for TF-IDF vectorisation.

    Steps, in order:
      1. replace every non-letter character with a space,
      2. lowercase (before the stopword check, so capitalised stopwords such as
         "The" are also removed),
      3. drop English stopwords,
      4. Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in words)


data['cleaned_text'] = data['text'].apply(clean_text)

data.head()

Unnamed: 0,category,text,cleaned_text
0,tech,tv future in the hands of viewers with home th...,tv futur hand viewer home theatr system plasma...
1,business,worldcom boss left books alone former worldc...,worldcom boss left book alon former worldcom b...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wari farrel gambl leicest say rush make ...
3,sport,yeading face newcastle in fa cup premiership s...,yead face newcastl fa cup premiership side new...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelv raid box offic ocean twelv crime c...


In [17]:
# Fixed seed so the train/test split, and therefore the accuracy printed below,
# is the same on every Restart & Run All.
SEED = 42

x = data['cleaned_text']
y = data['category']

# Keep the raw-text splits under their own names instead of overwriting them
# with the vectorised matrices.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=SEED
)

# Learn the TF-IDF vocabulary/IDF weights from the training split only, then
# apply the same mapping to the held-out split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

lr = LogisticRegression()
lr.fit(X_train, y_train)

y_preds = lr.predict(X_test)

# Fraction of held-out documents whose predicted category matches the true one.
print(accuracy_score(y_test, y_preds))

0.9802513464991023


In [18]:
# Classify one already-cleaned (stemmed, stopword-free) snippet with the fitted
# TF-IDF vectorizer and logistic-regression model.
single_test = vectorizer.transform(["ocean twelv raid box offic ocean twelv crime"])
lr.predict(single_test)

array(['entertainment'], dtype=object)

# Machine Translation

[paper](https://www.cs.columbia.edu/~mcollins/pb.pdf)

# Speech Recognition

[article](https://jonathan-hui.medium.com/speech-recognition-gmm-hmm-8bb5eff8b196)

## Challenges of statistical modelling

Data Dependency: Statistical methods require large amounts of labeled data for training. For tasks with limited data, performance can be suboptimal.

Interpretability: Unlike rule-based systems, statistical models (especially deep learning models) can be seen as "black boxes," making it harder to understand why the model made a certain decision.

Feature Engineering: For some statistical methods (like SVMs or Naive Bayes), careful feature engineering is required to ensure high performance.