In [0]:
#https://www.analyticsvidhya.com/blog/2017/01/sentiment-analysis-of-twitter-posts-on-chennai-floods-using-python/
import pandas as pd

In [0]:
data = '/Users/davidleifer/Desktop/grant_work/working/twitter-code/user-tweets/TwitterEnergyData/json-data/energy20170801T140040.json'
df = pd.read_json(data, lines=True)

In [0]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re, string
import nltk
tweets_texts = df["text"].tolist()
stopwords=stopwords.words('english')
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
def process_tweet_text(tweet):
    if tweet.startswith('@null'):
        return "[Tweet not available]"
    tweet = re.sub(r'\$\w*','',tweet) # Remove tickers
    tweet = re.sub(r'https?:\/\/.*\/\w*','',tweet) # Remove hyperlinks
    tweet = re.sub(r'['+string.punctuation+']+', ' ',tweet) # Remove puncutations like 's
    twtok = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = twtok.tokenize(tweet)
    tokens = [i.lower() for i in tokens if i not in stopwords and len(i) > 2 and  
                                             i in english_vocab]
    return tokens
words = []
for tw in tweets_texts:
    words += process_tweet_text(tw)

In [0]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words, 5)
finder.apply_freq_filter(5)
print(finder.nbest(bigram_measures.likelihood_ratio, 10))

[('selection', 'ready'), ('ready', 'solar'), ('canvas', 'art'), ('art', 'painting'), ('canvas', 'painting'), ('type', 'contact'), ('fix', 'type'), ('solar', 'selection'), ('ready', 'selection'), ('selection', 'selection')]


In [0]:
cleaned_tweets = []
for tw in tweets_texts:
    words = process_tweet_text(tw)
    cleaned_tweet = " ".join(w for w in words if len(w) > 2 and 
w.isalpha()) #Form sentences of processed words
    cleaned_tweets.append(cleaned_tweet)
df['CleanTweetText'] = cleaned_tweets

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,3))  
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_tweets)  
feature_names = tfidf_vectorizer.get_feature_names() # num phrases  
from sklearn.metrics.pairwise import cosine_similarity  
dist = 1 - cosine_similarity(tfidf_matrix)  
print(dist) 

from sklearn.cluster import KMeans
num_clusters = 3  
km = KMeans(n_clusters=num_clusters)  
km.fit(tfidf_matrix)  
clusters = km.labels_.tolist()  
df['ClusterID'] = clusters  
print(df['ClusterID'].value_counts())

[[ -2.22044605e-16   1.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 [  1.00000000e+00   1.00000000e+00  -2.22044605e-16 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 ..., 
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   0.00000000e+00
    1.00000000e+00   1.00000000e+00]
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
   -2.22044605e-16   1.00000000e+00]
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]]
1    9310
2     449
0     242
Name: ClusterID, dtype: int64


In [0]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster {} : Words :".format(i))
    for ind in order_centroids[i, :10]: 
        print(' %s' % feature_names[ind])

Cluster 0 : Words :
 canvas art painting
 canvas art
 art painting
 canvas
 painting
 art
 fresh edition canada
 friendly sector role
 fresh edition
 fruity perfect
Cluster 1 : Words :
 solar
 oil
 nuclear
 coal
 energy
 via
 latest
 renewable
 power
 fix
Cluster 2 : Words :
 selection ready solar
 ready solar
 selection ready
 selection
 ready
 solar
 zone oil
 friendly little
 friendly little spa
 friendly little yellow


In [0]:
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

texts = [text for text in cleaned_tweets if len(text) > 2]
doc_clean = [clean(doc).split() for doc in texts]
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
ldamodel = models.ldamodel.LdaModel(doc_term_matrix, num_topics=6, id2word = 
dictionary, passes=5)

for topic in ldamodel.show_topics(num_topics=6, formatted=False, num_words=6):
    print("Topic {}: Words: ".format(topic[0]))
    topicwords = [w for (w, val) in topic[1]]
    print(topicwords)

Using Theano backend.


Topic 0: Words: 
['energy', 'renewable', 'via', 'nuclear', 'oil', 'future']
Topic 1: Words: 
['nuclear', 'oil', 'latest', 'energy', 'thorium', 'climate']
Topic 2: Words: 
['fix', 'contact', 'type', 'oil', 'painting', 'art']
Topic 3: Words: 
['solar', 'ready', 'selection', 'path', 'pipeline', 'hydrogen']
Topic 4: Words: 
['solar', 'coal', 'plant', 'power', 'back', 'wind']
Topic 5: Words: 
['coal', 'mine', 'solar', 'oil', 'mining', 'company']


In [0]:
import gensim
from gensim.models.doc2vec import TaggedDocument
taggeddocs = []
tag2tweetmap = {}
for index,i in enumerate(cleaned_tweets):
    if len(i) > 2: # Non empty tweets
        tag = u'SENT_{:d}'.format(index)
        sentence = TaggedDocument(words=gensim.utils.to_unicode(i).split(), 
tags=[tag])
        tag2tweetmap[tag] = i
        taggeddocs.append(sentence)
        

model = gensim.models.Doc2Vec(taggeddocs, dm=0, alpha=0.025, size=20, 
min_alpha=0.025, min_count=0)
for epoch in range(60):
    if epoch % 20 == 0:
        print('Now training epoch %s' % epoch)
    model.train(taggeddocs, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

Now training epoch 0
Now training epoch 20
Now training epoch 40


In [0]:
from sklearn.cluster import KMeans

dataSet = model.wv.syn0
kmeansClustering = KMeans(n_clusters=6)
centroidIndx = kmeansClustering.fit_predict(dataSet)
topic2wordsmap = {}
for i, val in enumerate(dataSet):
    tag = model.docvecs.index_to_doctag(i)
    topic = centroidIndx[i]
    if topic in topic2wordsmap.keys():
        for w in (tag2tweetmap[tag].split()):
            topic2wordsmap[topic].append(w)
    else:
        topic2wordsmap[topic] = []
for i in topic2wordsmap:
    words = topic2wordsmap[i]
    print("Topic {} has words {}".format(i, words[:5]))

Topic 0 has words ['nuclear', 'oil', 'brent', 'first', 'try']
Topic 1 has words ['solar', 'would', 'buyer', 'state', 'vermont']
Topic 2 has words ['state', 'except', 'least', 'one', 'plant']
Topic 3 has words ['try', 'like', 'happen', 'every', 'time']
Topic 4 has words ['semiconducting', 'ink', 'capable', 'hydrogen', 'water']
Topic 5 has words ['world', 'potential', 'power', 'energy', 'nuclear']
