In [1]:
import pandas as pd 
import numpy as np 
from copy import deepcopy
from string import punctuation
from random import shuffle
import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
from tqdm import tqdm
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Notebook-wide configuration.

# Row count used for the train/test split and the word-vector dimensionality.
# NOTE(review): postprocess() defaults to n=2659 while this is 2569 — confirm
# which of the two values is intended.
n, n_dim = 2569, 200

# Turn off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None

# Register the progress_* methods on pandas objects (progress_map is used later).
tqdm.pandas(desc="progress-bar")

# Shared tweet tokenizer, and a short alias for gensim's labeled-sentence class.
tokenizer = TweetTokenizer()
LabeledSentence = gensim.models.doc2vec.LabeledSentence

In [3]:
# Import Data
# Load the raw tweets and give the two columns fixed names.
data = pd.read_csv("tweetdata.csv")
data.columns = ['SentimentText', 'Sentiment']
# Keep every second row, starting with the second one.
# NOTE(review): presumably the CSV interleaves real rows with filler rows — confirm.
data = data[1::2]
# Drop rows that have no tweet text. The previous statement assigned a whole
# filtered DataFrame to the 'SentimentText' column, which cannot remove rows
# (and raises ValueError on recent pandas versions); tokenize() calls .lower()
# on every value and would fail on a missing one.
data = data.dropna(subset=['SentimentText'])

In [6]:
def tokenize(tweet):
    """Normalize a tweet and split it into tokens.

    The text is lower-cased, links (``www.…`` or ``http(s)://…``) are replaced
    with the placeholder ``URL``, @-mentions with ``AT_USER``, and the leading
    ``#`` is stripped from hashtags. The normalized text is then split by the
    module-level ``tokenizer``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    The value returned by ``tokenizer.tokenize`` for the normalized text.
    """
    tweet = tweet.lower()
    # Raw string literals: '\.' and '\s' are regex escapes, not Python string
    # escapes; the non-raw spelling triggers invalid-escape-sequence warnings.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # Keep the hashtag word, drop only the '#'.
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tokenizer.tokenize(tweet)


def postprocess(data, n=2659):
    """Tokenize the first ``n`` tweets of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'SentimentText' column holding the tweet text.
    n : int, optional
        Number of leading rows to keep (default 2659).

    Returns
    -------
    pandas.DataFrame
        A new frame with the first ``n`` rows, an added 'tokens' column
        (``tokenize`` applied to each 'SentimentText' value) and a fresh
        0-based index. The input frame is not modified.

    Notes
    -----
    ``progress_map`` only exists after ``tqdm.pandas()`` has been called.
    """
    # Explicit copy: the new column is written to our own frame, not to a
    # slice of the caller's data.
    data = data.head(n).copy()
    # progress_map is map() plus a tqdm progress bar.
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    # drop=True discards the old index directly; this also works when the index
    # is named or a column called 'index' already exists.
    return data.reset_index(drop=True)

In [7]:
# Tokenize the tweets (postprocess keeps the first 2659 rows by default) and
# rebind `data` to the returned frame, which carries the extra 'tokens' column.
data = postprocess(data)
# Print the frame as plain text for a quick visual check.
print(data)

progress-bar: 100%|█████████████████████████████████████████████████████████████| 2659/2659 [00:00<00:00, 15500.86it/s]

                                          SentimentText Sentiment  \
0     Hilarious @youtube video - guy does a duet wit...  positive   
1     @RIM you made it too easy for me to switch to ...  positive   
2     The 16 strangest things Siri has said so far. ...  positive   
3     Great up close & personal event @Apple tonight...  positive   
4     From which companies do you experience the bes...  positive   
...                                                 ...       ...   
2654   #Tweetdeck working but not #Twitter for #Android   neutral   
2655           gud mrng #twitter & all my frnds too. ;)   neutral   
2656  Increase your #twitter followers and or your #...   neutral   
2657  62 Ways to Use #Twitter for Business: http://t...   neutral   
2658  It's almost 4:20. Where is your bong? Is it pa...   neutral   

                                                 tokens  
0     [hilarious, AT_USER, video, -, guy, does, a, d...  
1     [AT_USER, you, made, it, too, easy, for, me, t...




In [8]:
#Dividing into training and testing sets
# Slice the first n rows once and reuse the slice for both features and labels.
labeled_rows = data.head(n)
# 75% train / 25% test. random_state pins the shuffle so the split (and
# everything trained on it) is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(labeled_rows.tokens),
    np.array(labeled_rows.Sentiment),
    test_size=0.25,
    random_state=0,
)

In [9]:
# Give every tweet a unique tag.
def labelizeTweets(tweets, label_type):
    """Wrap each tweet in a ``LabeledSentence`` tagged by its position.

    Parameters
    ----------
    tweets : iterable
        The tweets to wrap; each item is passed through unchanged as the
        first argument of ``LabeledSentence``.
    label_type : str
        Prefix for the tag; the tag of the i-th tweet is ``'<label_type>_<i>'``.

    Returns
    -------
    list
        One ``LabeledSentence(tweet, [tag])`` per input tweet, in input order.
    """
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, position)])
        for position, words in tqdm(enumerate(tweets))
    ]

# Show the first training tweet before labeling.
print(x_train[0])
# Rebind both splits to lists of LabeledSentence objects tagged
# 'TRAIN_<i>' / 'TEST_<i>'.
# NOTE(review): x_train/x_test are overwritten, so running this cell a second
# time would wrap the already-labeled objects again — re-run the split cell first.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
# Show the same tweet after labeling.
print(x_train[0])

  
1926it [00:00, 71526.10it/s]
643it [00:00, 80599.43it/s]

["i'm", 'loving', 'this', 'new', 'ios', '5', 'update', ':)', 'AT_USER']
LabeledSentence(["i'm", 'loving', 'this', 'new', 'ios', '5', 'update', ':)', 'AT_USER'], ['TRAIN_0'])





In [10]:
#training Word2Vecs
# Untrained model producing n_dim-dimensional vectors; min_count=10 is passed
# as the minimum word frequency.
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
# Pull the token list (.words) out of every labeled training tweet.
xw = [tweet.words for tweet in tqdm(x_train)]
tweet_w2v.build_vocab(xw)
# Kept as the cell's last expression so its return value is displayed.
tweet_w2v.train(xw, total_examples=len(xw), epochs=10)

100%|█████████████████████████████████████████████████████████████████████████| 1926/1926 [00:00<00:00, 1932128.56it/s]


(141586, 326000)

In [11]:
#defining the chart
# Send bokeh output to the notebook, then create the empty figure that the
# scatter plot is drawn on later.
output_notebook()
# NOTE(review): the title hard-codes 2659, but the number of plotted points is
# decided later by the vocabulary slice — confirm the wording.
plot_tfidf = bp.figure(
    title="A map of 2659 word vectors",
    plot_width=700,
    plot_height=600,
    tools="pan,wheel_zoom,box_zoom,reset,hover",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

In [12]:
# Collect the vectors of (at most) the first 2000 vocabulary words. Vectors are
# read through `.wv`; indexing the model object itself is deprecated in gensim
# and emits a DeprecationWarning.
word_vectors = [tweet_w2v.wv[w] for w in list(tweet_w2v.wv.vocab.keys())[:2000]]

  """Entry point for launching an IPython kernel.


In [13]:
# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE

tsne_model = TSNE(
    random_state=0,  # fixed seed for the projection
    verbose=1,       # print progress messages while fitting
    n_components=2,  # project down to two dimensions
)
# One 2-D point per entry of word_vectors, in the same order.
tsne_w2v = tsne_model.fit_transform(word_vectors)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 419 samples in 0.008s...
[t-SNE] Computed neighbors for 419 samples in 0.057s...
[t-SNE] Computed conditional probabilities for sample 419 / 419
[t-SNE] Mean sigma: 0.062334
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.030647
[t-SNE] KL divergence after 1000 iterations: 0.270081


In [14]:
# putting everything in a dataframe
# One row per projected word: its 2-D coordinates ('x', 'y') and the word itself,
# taken from the same vocabulary slice that produced the vectors.
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y']).assign(
    words=list(tweet_w2v.wv.vocab.keys())[:2000]
)

In [15]:
# plotting. the corresponding word appears when you hover on the data point.
# Draw one marker per row of tsne_df at its 'x' / 'y' coordinates.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
# Fetch the hover tool from the figure (it was created with "hover" in its tools).
hover = plot_tfidf.select(dict(type=HoverTool))
# "@words" refers to the 'words' column of the data source.
hover.tooltips={"word": "@words"}
# Render the figure.
show(plot_tfidf)