In [1]:
import os
tweet_filename = os.path.join(os.path.expanduser("~"), "Desktop", "Data Mining - Python",
                              "Learning-Data-Mining-with-Python-master", "Chapter 6", "replicable_python_tweets.json")
labels_filename = os.path.join(os.path.expanduser("~"), "Desktop", "Data Mining - Python",
                              "Learning-Data-Mining-with-Python-master", "Chapter 6", "replicable_python_classes.json")

In [2]:
import json
tweets = []
with open(tweet_filename) as inf:
    for line in inf:
        if len(line.strip()) == 0:
            continue
        tweets.append(json.loads(line)['text'])
print("Loaded {} tweets".format(len(tweets)))

Loaded 120 tweets


In [3]:
tweets[:10]

['Is @common_squirrel just an awesome Python script somewhere?',
 'RT @Python_Agent: Top 30 python projects in github http://t.co/VX95Q9bEeh #python',
 'Forget Attack on Titan, Monty Python http://t.co/whc5NpQxUK',
 'Why did the python do national service? He was coiled up!',
 '#python tap.py 1.2: Tools for working with the Test Anything Protocol (TAP) http://t.co/HKpt9Cfud9',
 '今日のウチのPythonさん不機嫌だよ！',
 '#Nuitka is a compiler for #python http://t.co/2BlncADiTt',
 'ブラッドパイソンかわいいわぁ。マジかわいいわぁ。マジホント俺のタ・イ・プ（はあと）',
 'I could have done with this two weeks ago. http://t.co/dkMmuUQbdl The outcome of my own battle is summarised at https://t.co/I02979AyTS',
 'Really getting the hang of all this. So many moving parts but all fit together nicely. #python #flask #nginx #gunicorn #supervisor #AWS']

In [4]:
with open(labels_filename) as inf:
    labels = json.load(inf)
print("Loaded {} labels".format(len(labels)))

Loaded 120 labels


In [5]:
from sklearn.base import TransformerMixin
import nltk
from nltk import word_tokenize

class NLTKBOW(TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [{word: True for word in word_tokenize(document)} for document in X]
    

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('bag of words', NLTKBOW()), ('vectorizer', DictVectorizer()), ('naive-bayes', BernoulliNB())])

In [7]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(pipeline, tweets, labels, scoring = 'f1')



In [8]:
scores

array([0.84210526, 0.86486486, 0.87804878])

In [9]:
print([{word: True for word in word_tokenize(document)} for document in tweets])

[{'Is': True, '@': True, 'common_squirrel': True, 'just': True, 'an': True, 'awesome': True, 'Python': True, 'script': True, 'somewhere': True, '?': True}, {'RT': True, '@': True, 'Python_Agent': True, ':': True, 'Top': True, '30': True, 'python': True, 'projects': True, 'in': True, 'github': True, 'http': True, '//t.co/VX95Q9bEeh': True, '#': True}, {'Forget': True, 'Attack': True, 'on': True, 'Titan': True, ',': True, 'Monty': True, 'Python': True, 'http': True, ':': True, '//t.co/whc5NpQxUK': True}, {'Why': True, 'did': True, 'the': True, 'python': True, 'do': True, 'national': True, 'service': True, '?': True, 'He': True, 'was': True, 'coiled': True, 'up': True, '!': True}, {'#': True, 'python': True, 'tap.py': True, '1.2': True, ':': True, 'Tools': True, 'for': True, 'working': True, 'with': True, 'the': True, 'Test': True, 'Anything': True, 'Protocol': True, '(': True, 'TAP': True, ')': True, 'http': True, '//t.co/HKpt9Cfud9': True}, {'今日のウチのPythonさん不機嫌だよ！': True}, {'#': True, 'N

In [10]:
model = pipeline.fit(tweets, labels)

In [15]:
nb = model.named_steps['naive-bayes']
dv = model.named_steps['vectorizer']

In [23]:
import numpy as np
top_features = np.argsort(nb.feature_log_prob_[1])[:50]

In [25]:
for i, feature_index in enumerate(top_features):
    print(i, dv.feature_names_[feature_index])

0 😍😍
1 Let
2 Leather
3 Lady
4 near
5 LaLiga
6 LET
7 Like
8 LEARN
9 JUST
10 It
11 object
12 IronPython
13 office
14 olvidado
15 needs
16 national
17 Look
18 MessiQuote
19 los
20 PYTHON🐍
21 PYTHON
22 PHUKET
23 mechashiorina
24 OUT
25 ON
26 OMMMGG
27 mkmnsh_
28 monty
29 NYDailyNews
30 Moore
31 Monty
32 moron
33 mura_cin
34 Honestly
35 He
36 HEY
37 Grail
38 Delhi
39 Date
40 DND
41 Create
42 python_octopus
43 qué
44 Canada
45 Camuto
46 real
47 By
48 Burmese
49 Brown
