## EDA:

Today we're looking at tweet data to find sentiment towards a product or brand.

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize

In [2]:
#Import Twitter Data
data = pd.read_csv('Data/Twitter_sentiment.csv', encoding='latin1')
data

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [3]:
data = data.rename(columns={"tweet_text": "Tweet", "emotion_in_tweet_is_directed_at": "Subject_of_tweet",
                     "is_there_an_emotion_directed_at_a_brand_or_product": "Emotion"})
data

Unnamed: 0,Tweet,Subject_of_tweet,Emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [4]:
#Looking for null values
data.isna().sum()

Tweet                  1
Subject_of_tweet    5802
Emotion                0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
Tweet               9092 non-null object
Subject_of_tweet    3291 non-null object
Emotion             9093 non-null object
dtypes: object(3)
memory usage: 213.2+ KB


In [6]:
data.describe()

Unnamed: 0,Tweet,Subject_of_tweet,Emotion
count,9092,3291,9093
unique,9065,9,4
top,RT @mention Marissa Mayer: Google Will Connect...,iPad,No emotion toward brand or product
freq,5,946,5389


In [7]:
#Taking a look at the null values to see the content
for i in range(9089, 9092):
    print('.............')
    print(data['Tweet'].iloc[i])

.............
Wave, buzz... RT @mention We interrupt your regularly scheduled #sxsw geek programming with big news {link}  #google #circles
.............
Google's Zeiger, a physician never reported potential AE. Yet FDA relies on physicians. &quot;We're operating w/out data.&quot; #sxsw #health2dev
.............
Some Verizon iPhone customers complained their time fell back an hour this weekend.  Of course they were the New Yorkers who attended #SXSW.


In [8]:
#Creating a function for cleaning the tweets
def clean_tweets(tweets):
    cleaned_tweets = []
    for text in tweets:
        for symbol in ",.?!''\n":
            text = text.replace(symbol, '').lower()
        cleaned_tweets.append(text)
    return cleaned_tweets       


In [9]:
#Changing the data type to String so it can be passed through the cleaning function
data['Tweet'] = data['Tweet'].astype('str')

In [10]:
data['Tweet']

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9088                        Ipad everywhere. #SXSW {link}
9089    Wave, buzz... RT @mention We interrupt your re...
9090    Google's Zeiger, a physician never reported po...
9091    Some Verizon iPhone customers complained their...
9092    Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...
Name: Tweet, Length: 9093, dtype: object

In [11]:
#Applied the cleaning function and created a new variable
cleaned_tweets = clean_tweets(data['Tweet'])
cleaned_tweets

['@wesley83 i have a 3g iphone after 3 hrs tweeting at #rise_austin it was dead  i need to upgrade plugin stations at #sxsw',
 '@jessedee know about @fludapp  awesome ipad/iphone app that youll likely appreciate for its design also theyre giving free ts at #sxsw',
 '@swonderlin can not wait for #ipad 2 also they should sale them down at #sxsw',
 '@sxsw i hope this years festival isnt as crashy as this years iphone app #sxsw',
 '@sxtxstate great stuff on fri #sxsw: marissa mayer (google) tim oreilly (tech books/conferences) &amp; matt mullenweg (wordpress)',
 '@teachntech00 new ipad apps for #speechtherapy and communication are showcased at the #sxsw conference http://htly/49n4m #iear #edchat #asd',
 'nan',
 '#sxsw is just starting #ctia is around the corner and #googleio is only a hop skip and a jump from there good time to be an #android fan',
 'beautifully smart and simple idea rt @madebymany @thenextweb wrote about our #hollergram ipad app for #sxsw http://bitly/ieavob',
 'counting 

In [12]:
#Created a tokenizing function 
def tokenize(tweet):
    joined_tweet = ' '.join(tweet)
    tokenized_tweet = word_tokenize(joined_tweet)
    
    return tokenized_tweet

In [13]:
#Passed the cleaning varible through the new tokenizing function
tokenized_tweet = tokenize(cleaned_tweets)
tokenized_tweet

['@',
 'wesley83',
 'i',
 'have',
 'a',
 '3g',
 'iphone',
 'after',
 '3',
 'hrs',
 'tweeting',
 'at',
 '#',
 'rise_austin',
 'it',
 'was',
 'dead',
 'i',
 'need',
 'to',
 'upgrade',
 'plugin',
 'stations',
 'at',
 '#',
 'sxsw',
 '@',
 'jessedee',
 'know',
 'about',
 '@',
 'fludapp',
 'awesome',
 'ipad/iphone',
 'app',
 'that',
 'youll',
 'likely',
 'appreciate',
 'for',
 'its',
 'design',
 'also',
 'theyre',
 'giving',
 'free',
 'ts',
 'at',
 '#',
 'sxsw',
 '@',
 'swonderlin',
 'can',
 'not',
 'wait',
 'for',
 '#',
 'ipad',
 '2',
 'also',
 'they',
 'should',
 'sale',
 'them',
 'down',
 'at',
 '#',
 'sxsw',
 '@',
 'sxsw',
 'i',
 'hope',
 'this',
 'years',
 'festival',
 'isnt',
 'as',
 'crashy',
 'as',
 'this',
 'years',
 'iphone',
 'app',
 '#',
 'sxsw',
 '@',
 'sxtxstate',
 'great',
 'stuff',
 'on',
 'fri',
 '#',
 'sxsw',
 ':',
 'marissa',
 'mayer',
 '(',
 'google',
 ')',
 'tim',
 'oreilly',
 '(',
 'tech',
 'books/conferences',
 ')',
 '&',
 'amp',
 ';',
 'matt',
 'mullenweg',
 '(',
 'wo

In [14]:
#Created a count vectorization function 
def count_vectorize(tweet, vocab=None):
    if vocab:
        unique_words = vocab
    else:
        unique_words = list(set(tweet))
    
    tweet_dict = {i:0 for i in unique_words}
    
    for word in tweet:
        tweet_dict[word] += 1
    
    return tweet_dict

test_vectorized = count_vectorize(tokenized_tweet)
print(test_vectorized)



In [15]:
def term_frequency(BoW_dict):
    total_word_count = sum(BoW_dict.values())
    
    for ind, val in BoW_dict.items():
        BoW_dict[ind] = val/ total_word_count
    
    return BoW_dict

test = term_frequency(test_vectorized)

In [16]:
print(list(test)[10:20])

['85', 'witness', 'workspace', 'hitlantiscom', 'jacinto', 'desk', 'cant', 'telework', 'call-girl', 'available']


In [17]:
def inverse_document_frequency(list_of_dicts):
    vocab_set = set()
    # Iterate through list of dfs and add index to vocab_set
    for d in list_of_dicts:
        for word in d.keys():
            vocab_set.add(word)
    
    # Once vocab set is complete, create an empty dictionary with a key for each word and value of 0.
    full_vocab_dict = {i:0 for i in vocab_set}
    
    # Loop through each word in full_vocab_dict
    for word, val in full_vocab_dict.items():
        docs = 0
        
        # Loop through list of dicts.  Each time a dictionary contains the word, increment docs by 1
        for d in list_of_dicts:
            if word in d:
                docs += 1
        
        # Now that we know denominator for equation, compute and set IDF value for word
        
        full_vocab_dict[word] = np.log((len(list_of_dicts)/ float(docs)))
    
    return full_vocab_dict

In [18]:
def tf_idf(list_of_dicts):
    # Create empty dictionary containing full vocabulary of entire corpus
    doc_tf_idf = {}
    idf = inverse_document_frequency(list_of_dicts)
    full_vocab_list = {i:0 for i in list(idf.keys())}
    
    # Create tf-idf list of dictionaries, containing a dictionary that will be updated for each document
    tf_idf_list_of_dicts = []
    
    # Now, compute tf and then use this to compute and set tf-idf values for each document
    for doc in list_of_dicts:
        doc_tf = term_frequency(doc)
        for word in doc_tf:
            doc_tf_idf[word] = doc_tf[word] * idf[word]
        tf_idf_list_of_dicts.append(doc_tf_idf)
    
    return tf_idf_list_of_dicts

In [34]:
def main(data):
    # Iterate through list of filenames and read each in
    count_vectorized_all_documents = []
        # Clean and tokenize raw text
    cleaned = clean_tweets(data)
    tokenized = tokenize(cleaned)
        
        # Get count vectorized representation and store in count_vectorized_all_documents  
    count_vectorized_document = count_vectorize(tokenized)
    count_vectorized_all_documents.append(count_vectorized_document)
    
    # Now that we have a list of BoW respresentations of each song, create a tf-idf representation of everything
    tf_idf_all_docs = tf_idf(count_vectorized_all_documents)
    
    return tf_idf_all_docs

tf_idf_all_docs = main(data['Tweet'][0])
print(list(tf_idf_all_docs[0])[:10])

['p', '_', 't', 'v', 'l', 'o', 'h', 'x', 'u', 'd']


In [20]:
#counting the number of dimensions
num_dims = len(tf_idf_all_docs[0])
print("Number of Dimensions: {}".format(num_dims))

Number of Dimensions: 10763


In [33]:
len(tf_idf_all_docs)

1

In [27]:
#Pulling the words out of the document and turning the remaining values into a list.
tf_idf_vals_list = []

for i in tf_idf_all_docs:
    tf_idf_vals_list.append(list(i.values()))
    
tf_idf_vals_list[0][:10]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]