# Data Mining CS 619, Spring 2018 - Eleonora Renz

### Week 6 - Chapter 6

## Social Media Insight using Naive Bayes - <br> ch6_extracting-word-counts

#### Extracting word counts

In [1]:
import spacy
import numpy as np
from sklearn.base import TransformerMixin

# Create a spaCy parser; it is the module-level object BagOfWords calls to
# split each document into tokens.
# The model is loaded by its full package name ('en_core_web_sm'): the download
# succeeded, but linking it to a shortcut name failed.
nlp = spacy.load('en_core_web_sm')

class BagOfWords(TransformerMixin):
    """Transformer that converts raw text documents into binary bag-of-words dicts.

    Every document is tokenized with the module-level spaCy parser ``nlp``; the
    result for one document is a dict whose keys are the token texts found in it
    and whose values are all ``True``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Build one ``{token_text: True}`` dict per document in ``X``.

        Parameters
        ----------
        X : iterable
            Documents; each item is passed to ``nlp`` as-is.

        Returns
        -------
        list of dict
            One dict per document, in input order. Tokens whose text is empty
            after stripping whitespace are left out.
        """
        return [
            {token.text: True for token in nlp(text) if token.text.strip()}
            for text in X
        ]

#### Putting it all together

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
import os

# Both input files sit in the same Twitter data folder below the user's home directory.
twitter_data_folder = os.path.join(
    os.path.expanduser("~"), "Desktop", "DataMining_Spring2018", "Data", "twitter"
)
input_filename = os.path.join(twitter_data_folder, "python_tweets.json")
labels_filename = os.path.join(twitter_data_folder, "python_classes.json")

In [3]:
import json

# Read the tweets: the file holds one JSON document per line, and only its
# 'text' field is kept. Blank lines carry no tweet and are skipped.
# The encoding is passed explicitly because JSON text is UTF-8, whereas open()
# otherwise falls back to the platform default (not UTF-8 on e.g. Windows).
with open(input_filename, encoding='utf-8') as inf:
    tweets = [json.loads(line)['text'] for line in inf if line.strip()]

# The labels file is a single JSON document.
with open(labels_filename, encoding='utf-8') as inf:
    labels = json.load(inf)

# Ensure only classified tweets are loaded
tweets = tweets[:len(labels)]
assert len(tweets) == len(labels), \
    "expected one tweet per label, got {} tweets for {} labels".format(len(tweets), len(labels))

In [4]:
from sklearn.pipeline import Pipeline

# Chain the three stages; the step names are the keys later used to look the
# fitted steps up again through named_steps.
pipeline = Pipeline([
    ('bag-of-words', BagOfWords()),      # text -> {token: True} dicts
    ('vectorizer', DictVectorizer()),    # dicts -> feature matrix
    ('naive-bayes', BernoulliNB()),      # classifier on the binary features
])

#### Evaluating using F1-score

The F1-score is defined on a per-class basis and combines two measures:
precision and recall. The precision is the percentage of all the samples
that were predicted as belonging to a specific class that actually were
from that class. The recall is the percentage of samples in the dataset that
belong to a class and were actually predicted as belonging to that class.
The F1-score is the harmonic mean of the two:
$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$.

In [5]:
from sklearn.model_selection import cross_val_score

# Cross-validated F1-score of the whole pipeline (bag-of-words -> vectorizer ->
# Naive Bayes); the fold setup is cross_val_score's default.
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')

# Report the outcome: previously the scores were computed but never shown.
print("Score: {:.3f}".format(np.mean(scores)))

#### Getting useful features from models

In [6]:
# Train the complete pipeline on every labelled tweet.
model = pipeline.fit(tweets, labels)

# Fetch the fitted steps by the names they were registered under in the pipeline.
nb = model.named_steps['naive-bayes']
dv = model.named_steps['vectorizer']

# Per-class log probabilities of the features, as learned by the Naive Bayes step.
feature_probabilities = nb.feature_log_prob_

# Indices of the 50 features with the highest log probability in row 1;
# sorting the negated values puts the largest ones first.
# NOTE(review): row 1 is presumably the "relevant" class - confirm against nb.classes_.
top_features = np.argsort(-feature_probabilities[1])[:50]

In [7]:
# One line per top feature: rank, feature name, and its probability
# (the stored values are log probabilities, hence the exp).
for rank, index in enumerate(top_features):
    name = dv.feature_names_[index]
    probability = np.exp(feature_probabilities[1][index])
    print(rank, name, probability)

0 : 0.853658536585
1 RT 0.682926829268
2 Python 0.536585365854
3 … 0.487804878049
4 # 0.463414634146
5 to 0.292682926829
6 in 0.268292682927
7 , 0.268292682927
8 a 0.268292682927
9 the 0.219512195122
10 and 0.219512195122
11 . 0.219512195122
12 Programming 0.19512195122
13 of 0.170731707317
14 from 0.146341463415
15 & 0.121951219512
16 for 0.121951219512
17 Machine 0.121951219512
18 Data 0.121951219512
19 using 0.121951219512
20 ; 0.121951219512
21 amp 0.121951219512
22 programming 0.121951219512
23 ) 0.121951219512
24 ( 0.121951219512
25 Media 0.0975609756098
26 ! 0.0975609756098
27 DeepLearning 0.0975609756098
28 learning 0.0975609756098
29 @kid_OYO 0.0975609756098
30 Financial 0.0975609756098
31 marketing 0.0975609756098
32 https://t.co/Wecxebz0B1 0.0975609756098
33 Creative 0.0975609756098
34 modeling 0.0975609756098
35 python 0.0975609756098
36 @zainabSULE 0.0975609756098
37 Business 0.0975609756098
38 on 0.0975609756098
39 Go 0.0975609756098
40 Analytics 0.0975609756098
41 @Masha

For week 7 we will store our model:

In [8]:
# Prefer the standalone joblib package: scikit-learn's vendored copy
# (sklearn.externals.joblib) was deprecated in 0.21 and removed in 0.23.
# The fallback keeps the cell working on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

output_filename = os.path.join(os.path.expanduser("~"), "Desktop", "DataMining_Spring2018", "Models", "twitter", "python_context.pkl")

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)

# NOTE: the pipeline contains the notebook-defined BagOfWords class, so that class
# must be defined again in whatever session loads this file.
joblib.dump(model, output_filename)

['C:\\Users\\Ellomarshmallow\\Desktop\\DataMining_Spring2018\\Models\\twitter\\python_context.pkl']