In [4]:
import pandas as pd
import numpy as np
import random

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

import re
from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer 


In [11]:
 def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def product_target(string):
    s = string.lower()
    if s == 'no target':
        return ''
    elif s == 'ipad':
        return 'ipad'
    elif s == 'apple':
        return 'apple'
    elif s == 'ipad or iphone app':
        return 'app'
    elif s == 'iphone':
        return 'iphone'
    elif s == 'other apple product or service':
        return ''
    elif s == 'google':
        return 'google'
    elif s == 'other google product or service':
        return ''
    elif s == 'android':
        return 'android'
    elif s == 'android app':
        return 'android'
    else:
        return 'Unknown target'

def txt_clean(txt):
    # takes in a string and returns a cleaned up string ready for count or tfidf vectorizing
    sw = stopwords.words('english')
    sw.extend(['link', 'rt', 'get'])
    punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~“!#'
    no_accents_re = re.compile('^[a-z]+$')
    accents = ['á', 'â', 'ã', 'à', 'å', 'ª', 'ç', 'è', '¼', '¾', 'î', 'ï', 'ì', 'ó', 'ö', 'ð', 'ü', 'ù', 'û', 'ý']
    twitter_re = re.compile('[@][a-zA-Z]*')
    num_re = re.compile('^\d+$')
    
    # splitting the text up into words
    t = txt[0].split(' ')
    # turning the words lowercase
    t = [w.lower() for w in t]
    # removing punctuation
    t = [w.translate(w.maketrans('','', punctuation)) for w in t]
    # removing @'s which are twitter jargon
    t = [w for w in t if not twitter_re.match(w)]
    # removing leftover numbers
    t = [w for w in t if not num_re.match(w)]
    # removing words with accents
    t = [w for w in t if no_accents_re.match(w)]
    # removing stop words and more twitter jargon
    t = [w for w in t if w not in sw]
    # change targets in string to 'product_target'
    t = ['product_target' if w == product_target(txt[1]) else w for w in t]
    # removing empty strings
    t = [w for w in t if w]
    # word lemmatizing
    t = pos_tag(t)
    t = [(w[0], get_wordnet_pos(w[1])) for w in t]
    lem = WordNetLemmatizer()
    if lem: t = [lem.lemmatize(w[0], w[1]) for w in t]
    # joining all the strings together into one
    return ' '.join(t)

def emotion_label(string):
    s = string
    if s == 'Positive emotion':
        return 2
    elif s == 'No emotion toward brand or product':
        return 1
    elif s == 'Negative emotion':
        return 0
    else:
        print('Unknown emotion')

def df_clean(lem = True):
    df = pd.read_csv('../../data/judge-1377884607_tweet_product_company.csv', encoding = 'latin1')
    df.columns = ['text', 'product', 'emotion']
    df = df[df['emotion'] != 'I can\'t tell']
    df.dropna(inplace = True)
    print(df['product'].value_counts())
    df['text_product'] = df.apply(lambda x: list([x['text'], x['product']]), axis = 1)
    df['emotion'] = df['emotion'].map(emotion_label)
    df['txt_cleaned'] = df['text_product'].apply(txt_clean)
    df.drop(columns = ['text', 'product', 'text_product'], inplace = True)
    return df

In [12]:
df_clean()

iPad                               942
Apple                              659
iPad or iPhone App                 470
Google                             429
iPhone                             296
Other Google product or service    292
Android App                         81
Android                             78
Other Apple product or service      35
Name: product, dtype: int64


Unnamed: 0,emotion,txt_cleaned
0,0,product_target hr tweet riseaustin dead need u...
1,2,know awesome ipadiphone product_target youll l...
2,2,wait product_target also sale sxsw
3,0,hope year festival isnt crashy year iphone pro...
4,2,great stuff fri sxsw marissa mayer product_tar...
...,...,...
9077,2,pr guy convince switch back product_target gre...
9079,2,quotpapyrussort like ipadquot nice lol sxsw la...
9080,0,diller say google tv quotmight run playstation...
9085,2,ive always use camera iphone bc image stabiliz...


In [22]:
# divides data into X and y, and then turns the model target labels into numerical format

X = df['txt_cleaned']
y = df['emotion'].replace(to_replace = {'Positive emotion' : 0, 'Negative emotion' : 1, 'No emotion toward brand or product': 2})

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)
X_t, X_val, y_t, y_val = train_test_split(X, y, random_state = 42, test_size = 0.25)

In [59]:
class Vectorizer:
    def __init__(self, vec_type, ngram = (1,1)):
        if type(ngram) is not tuple:
            print('Unknown tuple, format should be (minimum n-gram, maximum n-gram)')
            return False
        
        if vec_type == 'cv':
            self.vec = CountVectorizer(ngram_range = ngram)
        elif vec_type == 'tfidf':
            self.vec = TfidfVectorizer(ngram_range = ngram)
        else:
            print('Unknown vectorizer type')
            return False
        
    def fit(self, X, y = None):
        self.vec.fit(X)

    def transform(self, X, y):
        X_vec = self.vec.transform(X)
        X_vec = pd.DataFrame.sparse.from_spmatrix(X_vec)
        X_vec.columns = sorted(self.vec.vocabulary_)
        X_vec.set_index(y.index, inplace = True)
        return X_vec
    
    def fit_transform(self, X, y):
        self.vec.fit(X)
        X_vec = self.vec.transform(X)
        X_vec = pd.DataFrame.sparse.from_spmatrix(X_vec)
        X_vec.columns = sorted(self.vec.vocabulary_)
        X_vec.set_index(y.index, inplace = True)
        return X_vec

## Count Vectorizer

In [56]:
cv = CountVectorizer(ngram_range = (1,1))
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

store      1113
new         802
austin      688
launch      620
app         598
amp         550
social      481
circle      468
popup       427
today       422
android     419
open        359
network     355
go          350
line        335
via         321
dtype: int64

In [14]:
cv = CountVectorizer(ngram_range = (2,2))
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

social network     338
popup store        325
new social         308
network call       239
call circle        222
major new          220
launch major       213
temporary store    187
possibly today     179
circle possibly    170
downtown austin    138
ûï mention         134
marissa mayer      132
store downtown     132
store austin       130
open popup         121
dtype: int64

In [201]:
cv = CountVectorizer(ngram_range = (3,3))
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

new social network          288
social network call         238
major new social            219
network call circle         215
launch major new            213
call circle possibly        168
circle possibly today       168
store downtown austin       117
open temporary store        108
temporary store downtown     77
popup store austin           64
open popup store             63
open popup shop              56
downtown austin launch       55
launch new social            49
rumor open temporary         48
dtype: int64

## Tfidf Vectorizer

In [202]:
tfidf = TfidfVectorizer(ngram_range = (1,1))
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

store      232.754934
new        164.683823
launch     151.630980
austin     139.152442
app        124.349935
social     121.266819
popup      117.843415
circle     114.139122
today      107.380222
open       107.017833
amp        104.570787
network    103.692296
via         90.282673
line        87.238168
call        85.940698
go          80.454228
dtype: float64

In [203]:
tfidf = TfidfVectorizer(ngram_range = (2,2))
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.mean(axis = 0).sort_values(ascending = False)[:16]

social network     0.012871
new social         0.012075
popup store        0.011281
network call       0.010188
call circle        0.009623
major new          0.009447
launch major       0.009276
possibly today     0.008182
temporary store    0.008133
circle possibly    0.007913
store austin       0.006158
downtown austin    0.005948
open popup         0.005938
store downtown     0.005905
open temporary     0.005599
ûï mention         0.004758
dtype: float64

In [204]:
tfidf = TfidfVectorizer(ngram_range = (3,3))
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

new social network          80.137694
social network call         69.935329
network call circle         65.313997
major new social            64.712864
launch major new            63.743781
call circle possibly        54.022252
circle possibly today       54.005559
open temporary store        38.100215
store downtown austin       36.312710
temporary store downtown    27.969892
popup store austin          24.687708
open popup store            24.318210
downtown austin launch      23.002409
open popup shop             21.364177
launch new social           20.361676
rumor open temporary        19.746143
dtype: float64

In [25]:
X_t_vec

Unnamed: 0,aapl,aaron,ab,abacus,abba,aber,ability,able,abnormal,abound,...,zlf,zms,zomb,zombie,zomg,zone,zoom,zuckerberg,zynga,zzzs
3363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3204,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6298,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
873,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# count or tfidf
# n-gram range
# data