TweetNLP - Brown Word Clusters
------
**What it does**:
Maps word occurences to a cluster assignment, based on a clustering model built on 56m tweets.  
Source: http://www.cs.cmu.edu/~ark/TweetNLP/#resources

**Strengths**: Reduces sparsity of feature space compared to BOW

**Weaknesses**:

**Hyperparameters**: Since clusters are hierarchical, they can have different cutoffs to get a different number of clusters. The fully extended clustering has 1000 clusters based on a 16 digit binary string, and can be reduced to a 4 digit binary string. This is implemented in the `generate_cluster_string` argument of `cutoff`.

In [28]:
import pandas as pd
import csv
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
sts_gold = pd.read_csv('../data/sts_gold_v03/sts_gold_tweet.csv', index_col='id', sep=';')

In [30]:
sts_gold.head()

Unnamed: 0_level_0,polarity,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467933112,0,the angel is going to miss the athlete this we...
2323395086,0,It looks as though Shaq is getting traded to C...
1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
1990283756,0,drinking a McDonalds coffee and not understand...
1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [31]:
tweets = sts_gold['tweet']

In [32]:
clusterDict = {}
with open('../lexicons/tweetnlp_clusters/50mpaths2.txt', 'r') as f:
    reader=csv.reader(f,delimiter='\t')
    for cluster, word, freq in reader:
        clusterDict[word] = cluster

In [33]:
tt = TweetTokenizer()

In [34]:
def generate_cluster_string(string, tokenizer, cutoff=16):
    if cutoff < 4 or cutoff > 16:
        print("cutoff must be between 4 and 16")
        return None
    clusterList = []
    for token in tokenizer.tokenize(string):
        try:
            token = token.lower()
            cluster = clusterDict[token][:cutoff]
            clusterList.append(cluster)
        except KeyError:
            clusterList.append('NOCLUSTER')
    return ' '.join(clusterList)

In [35]:
clustered = [generate_cluster_string(tweet, tt) for tweet in tweets]

In [36]:
cv = CountVectorizer()
clus = cv.fit_transform(clustered)

# use below for data frame
clus_df = pd.DataFrame(clus.toarray(), index=tweets.index, columns=cv.get_feature_names())

In [37]:
clus_df.head()

Unnamed: 0_level_0,0000,000100,000101,000110,0001110,0001111,001000,001001,0010100,001010100,...,1111111101110,11111111011110,11111111011111,1111111110,111111111100,111111111101,111111111110,1111111111110,1111111111111,nocluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1467933112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2323395086,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1467968979,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1990283756,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1988884918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.ensemble import VotingClassifier

from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report

In [39]:
models = [('DUMMY', DummyClassifier(strategy='most_frequent')),
          ('mNB' , MultinomialNB()),
          ('bNB' , BernoulliNB()),
          ('svc' , SVC(probability=True)),
          ('rf' , RandomForestClassifier()),
          ('lr' , LogisticRegressionCV())
         ]
models.append(('eclf', VotingClassifier(estimators=[models[i] for i in [1, 3, 4, 5]], voting='soft')))

In [None]:
print('{0}\t{1:<1}\t{2:<4}\t{3:<4}'.format("MODEL", "MEAN CV", "MIN CV", "MAX CV"))

for name, model in models:    
    X, Y = clus, (sts_gold['polarity'] == 4).ravel()
    
    if name == 'bNB':
        binarize = Binarizer()
        X = binarize.fit_transform(X)
    elif name == 'svc':
        ss = StandardScaler()
        X = X.toarray()
        X = ss.fit_transform(X)
        
    cv = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    
    print('{0}\t{1:<3}\t{2:<4}\t{3:<4}'.format(name, round(cv.mean(), 4), round(cv.min(), 4), round(cv.max(), 4)))

MODEL	MEAN CV	MIN CV	MAX CV
DUMMY	0.6893	0.6887	0.6897
mNB	0.8618	0.8547	0.8719
bNB	0.8638	0.8554	0.8725
svc	0.7925	0.7647	0.8088
rf	0.7979	0.7882	0.8079
lr	0.852	0.826	0.8916
eclf	0.8589	0.8358	0.8892


