# Data Mining CS 619, Spring 2018 - Eleonora Renz

### Week 6 - Chapter 6

## Social Media Insight using Naive Bayes - <br> ch6_extracting-word-counts

#### Extracting word counts

In [1]:
import spacy
from sklearn.base import TransformerMixin

# Create a spaCy parser
nlp = spacy.load('en_core_web_sm') # Download successful but linking failed, so called by its full name

class BagOfWords(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        results = []
        for document in X:
            row = {}
            for word in list(nlp(document, tag=False, parse=False, entity=False)):
                if len(word.text.strip()):
                    row[word.text] = True
            results.append(row)
        return results

#### Putting it all together

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
import os

input_filename = os.path.join(os.path.expanduser("~"), "Desktop", "DataMining_Spring2018", "Data", "twitter", "python_tweets.json")
labels_filename = os.path.join(os.path.expanduser("~"), "Desktop", "DataMining_Spring2018", "Data", "twitter", "python_classes.json")

In [3]:
import json

tweets = []
with open(input_filename) as inf:
    for line in inf:
        if len(line.strip()) == 0: continue
        tweets.append(json.loads(line)['text'])

with open(labels_filename) as inf:
    labels = json.load(inf)

# Ensure only classified tweets are loaded
tweets = tweets[:len(labels)]
assert len(tweets) == len(labels)

In [4]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('bag-of-words', BagOfWords()), ('vectorizer', DictVectorizer()), ('naive-bayes', BernoulliNB()) ])

#### Evaluating using F1-score

The F1-score is defined on a per-class basis and is based on two concepts:
the precision and recall. The precision is the percentage of all the samples
that were predicted as belonging to a specific class, that were actually
from that class. The recall is the percentage of samples in the dataset that
are in a class and actually labeled as belonging to that class.

In [5]:
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(pipeline, tweets, labels, scoring='f1')



TypeError: __call__() got an unexpected keyword argument 'tag'

#### Getting useful features from models

In [None]:
model = pipeline.fit(tweets, labels)

nb = model.named_steps['naive-bayes']
feature_probabilities = nb.feature_log_prob_

top_features = np.argsort(-nb.feature_log_prob_[1])[:50]

dv = model.named_steps['vectorizer']

In [None]:
for i, feature_index in enumerate(top_features):
    print(i, dv.feature_names_[feature_index],
         np.exp(feature_probabilities[1][feature_index]))