# Sklearn per applicazioni di NLP

In [1]:
from sklearn.datasets import fetch_20newsgroups

## Natural Language Processing

SpaCy: [https://spacy.io/](https://spacy.io/)

NLTK: [https://www.nltk.org/](https://www.nltk.org/)

In [14]:
import nltk

In [15]:
from nltk import word_tokenize

In [18]:
import string

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [7]:
dataset = fetch_20newsgroups(data_home='/Users/flint/Data/sklearn/', remove=('headers', 'footers', 'quotes'))

In [54]:
tokenizer = lambda x: [w for w in word_tokenize(x.lower()) if w not in string.punctuation]

## Term Frequency (TF)

$$
tf(w, d) = count(w) \in d
$$

In [55]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [56]:
cnt = CountVectorizer(tokenizer=tokenizer, token_pattern=None)
cnt.fit(dataset.data[:2000])

CountVectorizer(token_pattern=None,
                tokenizer=<function <lambda> at 0x7f9c5c70d310>)

In [57]:
tf = cnt.transform(dataset.data[:2000])

In [58]:
dataset.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

## Inverse Document Frequency (IDF)

$$
idf(w) = \log \frac{N}{\mid \{d : w \in d \} \mid}
$$

In [59]:
tfidfm = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
tfidfm.fit(dataset.data[:2000])
tfidf = tfidfm.transform(dataset.data[:2000])

In [60]:
tfidf[0, tfidfm.vocabulary_['history']]

0.10892730473248623

### Measuring relevance

In [61]:
from collections import defaultdict

In [62]:
def relevance(document_id, model, matrix):
    tokens = tokenizer(dataset.data[document_id])
    word_score = {}
    for token in set(tokens):
        score = matrix[document_id, model.vocabulary_[token]]
        word_score[token] = score
    return sorted(word_score.items(), key=lambda x: -x[1])

## Data Classification

In [84]:
train = fetch_20newsgroups(
    data_home='/Users/flint/Data/sklearn/', 
    remove=('headers', 'footers', 'quotes'),
    subset='train'
)
test = fetch_20newsgroups(
    data_home='/Users/flint/Data/sklearn/', 
    remove=('headers', 'footers', 'quotes'),
    subset='test'
)

In [71]:
counter = defaultdict(lambda: 0)
for k in train.target:
    class_name = train.target_names[k]
    counter[class_name] += 1

In [85]:
m = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None, min_df=50)
print('Calcolo i parametri di TfIdf e trasformo il train set')
tfidf_train = m.fit_transform(train.data)
print('Applico tfidf sul test set')
tfidf_test = m.transform(test.data)

Calcolo i parametri di TfIdf e trasformo il train set
Applico tfidf sul test set


In [86]:
tfidf_train.shape

(11314, 2890)

In [87]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [88]:
classifier = GaussianNB()
classifier.fit(tfidf_train.toarray(), train.target)
y_prediction = classifier.predict(tfidf_test.toarray())

In [89]:
y_prediction

array([ 4,  5,  2, ..., 15,  6, 15])

In [90]:
from sklearn.metrics import classification_report

In [91]:
print(classification_report(test.target, y_prediction))

              precision    recall  f1-score   support

           0       0.22      0.20      0.21       319
           1       0.29      0.21      0.24       389
           2       0.22      0.32      0.26       394
           3       0.31      0.33      0.32       392
           4       0.23      0.29      0.26       385
           5       0.46      0.33      0.38       395
           6       0.48      0.43      0.45       390
           7       0.41      0.26      0.32       396
           8       0.44      0.38      0.41       398
           9       0.24      0.47      0.32       397
          10       0.62      0.55      0.58       399
          11       0.49      0.57      0.53       396
          12       0.28      0.22      0.24       393
          13       0.48      0.30      0.37       396
          14       0.43      0.41      0.42       394
          15       0.37      0.39      0.38       398
          16       0.31      0.36      0.33       364
          17       0.53    

## Co-occorrenze
- Indice di occorrenze delle parole
-- Indice inverso : key: word value: lista dei documenti in cui compare word
- Misura di rivlevanza delle co-occorrenze che osserviamo

In [139]:
N = 2000
documents = train.data[:N]
inverted_index = defaultdict(lambda: set())
for i, doc in enumerate(documents):
    tokens = tokenizer(doc)
    for token in tokens:
        inverted_index[token].add(i)

In [140]:
words = [word for word, docs in inverted_index.items() if len(docs) > 20]

$$
MI(a, b) = \log \frac{P(a,b)}{P(a)P(b)}
$$

In [141]:
def p(word):
    c = len(inverted_index[word])
    return c / N

def pp(w1, w2):
    c = len(inverted_index[w1].intersection(inverted_index[w2]))
    return c / N

def mi(w1, w2):
    if p(w1)*p(w2) == 0:
        return 0
    else:
        return np.log(pp(w1, w2) / (p(w1)*p(w2)))

In [142]:
collector = []
for i, word1 in enumerate(words):
    for word2 in words[i+1:]:
        collector.append({
            'word1': word1,
            'word2': word2,
            'sim': mi(word1, word2)
        })

  return np.log(pp(w1, w2) / (p(w1)*p(w2)))


In [143]:
D = pd.DataFrame(collector)

In [144]:
D.head()

Unnamed: 0,word1,word2,sim
0,i,was,0.17719
1,i,wondering,0.35882
2,i,if,0.201099
3,i,anyone,0.217355
4,i,out,0.224648


In [145]:
D.shape

(831405, 3)

In [153]:
F = D[D.sim > 2.0]
F.head()

Unnamed: 0,word1,word2,sim
2874,wondering,appreciated,2.055725
3456,wondering,baseball,2.055725
14150,car,model,2.112883
14152,car,engine,2.371578
14672,car,cars,2.664315


In [154]:
F.shape

(44232, 3)

In [157]:
import networkx as nx

In [158]:
G = nx.Graph()
for i, row in D[D.sim > 2.0].iterrows():
    G.add_edge(row.word1, row.word2, sim=row.sim)

In [159]:
len(G.edges())

44232

In [160]:
from networkx.algorithms.community import greedy_modularity_communities

In [161]:
for community in greedy_modularity_communities(G, weight='sim'):
    print(community)

frozenset({'neither', 'night', 'parts', 'assuming', 'changed', 'quickly', 'differences', 'lives', 'months', 'treatment', 'strong', 'kind', 'million', 'numbers', 'giving', 'voice', 'books', 'jewish', 'realize', 'mr.', 'exactly', 'covered', 'cause', 'according', 'second', 'hands', 'administration', 'takes', 'political', 'protect', 'created', 'tell', 'experience', 'recall', 'fire', 'press', 'street', 'comment', 'average', 'christianity', 'cover', 'earth', 'jesus', 'crime', 'matter', 'thinking', 'killed', 'glad', 'women', 'allowed', 'others', 'btw', 'surprised', 'came', 'release', 'king', 'understand', 'presented', 'genocide', 'once', 'behind', 'save', 'talk', 'community', 'law', 'ground', 'lots', 'case', 'sold', 'calling', 'missing', 'whatever', 'difficult', 'living', 'april', 'friends', 'heaven', 'old', 'usually', 'mostly', 'again', 'gave', 'stay', 'eventually', 'letter', 'show', 'policy', 'fair', 'federal', 'clearly', 'possibly', 'across', 'hope', 'authority', 'today', 'responsibility',