In [10]:
import pandas as pd
import numpy as np
import random
import pathlib


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import re
from nltk import pos_tag
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer 

In [14]:
def get_wordnet_pos(treebank_tag):
    '''
    Function takes in a string and assigns it a part of speech tag.
    Used for lemmatizing, no need to use elsewhere
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def product_target(string):
    '''
    Description:
    - Method used for finding a string that the clean function will replace with 'product_target' if it sees any instances of it.
    
    Inputs:
    - Takes in the data from the product column through the string parameter.
    
    Steps:
    1. sets the target variable to lower case
    2. searches what the variable is and return a preset value that we will replace with 'product_target'
    
    Outputs:
    - Returns a string
    '''
    s = string.lower()
    if s == 'no target':
        return ''
    elif s == 'ipad':
        return 'ipad'
    elif s == 'apple':
        return 'apple'
    elif s == 'ipad or iphone app':
        return 'app'
    elif s == 'iphone':
        return 'iphone'
    elif s == 'other apple product or service':
        return ''
    elif s == 'google':
        return 'google'
    elif s == 'other google product or service':
        return ''
    elif s == 'android':
        return 'android'
    elif s == 'android app':
        return 'android'
    else:
        return 'Unknown target'

def txt_clean(txt, lem):
    '''
    Description
    - A method we use to clean every inputed string and prepare it for model processing
    
    Inputs:
    - Takes in a string and returns a cleaned up version of it.
    - Takes in a  boolean variable to determine whether or not the lemmatizing function will be used.
    
    Steps:
    1. Split the tweet into tokens
    2. Convert all capitalized letters into lower-case
    3. Remove punctuation
    4. Remove twitter jargon such as @ mentions
    5. Remove leftover numbers
    6. Remove words with accents
    7. Remove stop words
    8. Replace instances of the target in the text with 'product_target'
    9. Remove empty strings
    10. Lemmatize the words
    11. Rejoin all the tokens into one string
    
    Outputs:
    - A cleaned up string of words ready for model processing.
    '''
    sw = stopwords.words('english')
    sw.extend(['link', 'rt', 'get'])
    punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~“!#'
    no_accents_re = re.compile('^[a-z]+$')
    accents = ['á', 'â', 'ã', 'à', 'å', 'ª', 'ç', 'è', '¼', '¾', 'î', 'ï', 'ì', 'ó', 'ö', 'ð', 'ü', 'ù', 'û', 'ý']
    twitter_re = re.compile('[@][a-zA-Z]*')
    num_re = re.compile('^\d+$')
    
    # splitting the text up into words
    if isinstance(txt, list):
        t = txt[0].split(' ')
    else:
        t = txt.split(' ')
    # turning the words lowercase
    t = [w.lower() for w in t]
    # removing punctuation
    t = [w.translate(w.maketrans('','', punctuation)) for w in t]
    # removing @'s which are twitter jargon
    t = [w for w in t if not twitter_re.match(w)]
    # removing leftover numbers
    t = [w for w in t if not num_re.match(w)]
    # removing words with accents
    t = [w for w in t if no_accents_re.match(w)]
    # removing stop words and more twitter jargon
    t = [w for w in t if w not in sw]
    # change targets in string to 'product_target' if a target exists
    if isinstance(txt, list):
        t = ['product_target' if w in product_target(txt[1]) else w for w in t]
        if txt[1].lower() in ['android app', 'ipad or iphone app']:
            t = [w for w in t if w is not 'app']
    # removing empty strings
    t = [w for w in t if w]
    # word lemmatizing
    if lem: 
        lemm = WordNetLemmatizer()
        t = pos_tag(t)
        t = [(w[0], get_wordnet_pos(w[1])) for w in t]
        t = [lemm.lemmatize(w[0], w[1]) for w in t]
    # joining all the strings together into one
    return ' '.join(t)

def emotion_label(string):
    '''
    Description:
    - Simple mapping function used to turn our targets from words into a numerical value
    
    Inputs:
    - A string
    
    Steps:
    1. Matches the input string with a preset value, positive is a 2, negative is a 0, and neutral is a 1
    
    Output:
    - The corresponding integer value
    '''
    s = string
    if s == 'Positive emotion':
        return 2
    elif s == 'No emotion toward brand or product':
        return 1
    elif s == 'Negative emotion':
        return 0
    else:
        print('Unknown emotion')

def df_clean(df = None, lem = True):
    '''
    Description:
    - A function that returns a cleaned up dataframe.
    
    Inputs:
    - It can take a dataframe, if no dataframe is passed through it will use our default one.
    - It can also set a boolean value, it is used as a flag to determine if lemmatizing will be used.
    
    Steps:
    1. A dataframe will be either created or brought in from an external source determined by the parameter
    2. The dataframe will then be adjusted to make reading easier
    3. NaN values and values we don't want will be removed
    4. We will use the emotion_label to change our emotion values into numerical
    5. We will use the txt_clean function on the tweet text for every row 
    6. Lastly we will drop unused columns
    
    Outputs:
    - A cleaned dataframe that we will use for our models
    '''
    if df is None:
        df = pd.read_csv('../../data/judge-1377884607_tweet_product_company.csv', encoding = 'latin1')
    df.columns = ['text', 'product', 'emotion']
    df = df[df['emotion'] != 'I can\'t tell']
    df.dropna(inplace = True)
    df['text_product'] = df.apply(lambda x: list([x['text'], x['product']]), axis = 1)
    df['emotion'] = df['emotion'].map(emotion_label)
    df['txt_cleaned'] = df['text_product'].apply(txt_clean, args = (lem,))
    df.drop(columns = ['text', 'product', 'text_product'], inplace = True)
    return df


def external_data(lem = True):
    '''
    Description:
    - A function that gives us the cleaned up dataframes of our three external datasets
    - It will be cleaned in a way similar to our base dataset
    
    Inputs:
    - A boolean variable that will determine whether or not a lemmatizer is used or not
    
    Steps:
    1. Reads in the data from our datasets to create three dfs
    2. For each df, grab the columns that we care about and rename them to match our base data
    3. We then map our emotions column in a way that matches our base data (0, 1, and 2 variables)
    4. Use our txt_clean function to clean up the texts in the dataset
    5. Drop all unused columns
    6. Concat all three together
    
    Outputs:
    - A big dataframe containing cleaned up version of all the tweets from external sources
    - Ready to be added directly to the end of our base data
    '''

    df_1 = pd.read_csv('../../data/Apple-Twitter-Sentiment-DFE.csv', encoding = 'latin1')
    df_1 = df_1[['sentiment', 'text']]
    df_1.columns = ['emotion', 'text']
    dic_1 = {'5': 2, '3' : 1, '1': 0}
    df_1.replace({'emotion': dic_1}, inplace = True)
    df_1['txt_cleaned'] = df_1['text'].apply(txt_clean, args = (lem,))
    df_1.drop('text', axis = 1, inplace = True)

    df_2 = pd.read_csv('../../data/Deflategate-DFE.csv', encoding = 'latin1')
    df_2 = df_2[['deflate_sentiment', 'text']]
    df_2.columns = ['emotion', 'text']
    dic_2 = {'positive': 2, 'slightly positive': 2, ('neutral') : 1, 'negative': 0, 'slightly negative': 0}
    df_2.replace({'emotion': dic_2}, inplace = True)
    df_2['txt_cleaned'] = df_2['text'].apply(txt_clean, args = (lem,))
    df_2.drop('text', axis = 1, inplace = True)

    df_3 = pd.read_csv('../../data/Coachella-2015-2-DFE.csv', encoding = 'latin1')
    df_3 = df_3[['coachella_sentiment', 'text']]
    df_3.columns = ['emotion', 'text']
    df_3 = df_3[df_3['emotion'] != 'cant tell']
    dic_3 = {'positive': 2, 'neutral' : 1, 'negative': 0}
    df_3.replace({'emotion': dic_3}, inplace = True)
    df_3['txt_cleaned'] = df_3['text'].apply(txt_clean, args = (lem,))
    df_3.drop('text', axis = 1, inplace = True)

    return pd.concat([df_1, df_2, df_3])

In [16]:
df = df_clean()

In [15]:
ef = external_data()

In [17]:
df = pd.concat([df, ef])

In [18]:
df.shape

(22746, 2)

In [22]:
# divides data into X and y, and then turns the model target labels into numerical format

X = df['txt_cleaned']
y = df['emotion'].replace(to_replace = {'Positive emotion' : 0, 'Negative emotion' : 1, 'No emotion toward brand or product': 2})

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.25)
X_t, X_val, y_t, y_val = train_test_split(X, y, random_state = 42, test_size = 0.25)

In [59]:
class Vectorizer:
    def __init__(self, vec_type, ngram = (1,1)):
        if type(ngram) is not tuple:
            print('Unknown tuple, format should be (minimum n-gram, maximum n-gram)')
            return False
        
        if vec_type == 'cv':
            self.vec = CountVectorizer(ngram_range = ngram)
        elif vec_type == 'tfidf':
            self.vec = TfidfVectorizer(ngram_range = ngram)
        else:
            print('Unknown vectorizer type')
            return False
        
    def fit(self, X, y = None):
        self.vec.fit(X)

    def transform(self, X, y):
        X_vec = self.vec.transform(X)
        X_vec = pd.DataFrame.sparse.from_spmatrix(X_vec)
        X_vec.columns = sorted(self.vec.vocabulary_)
        X_vec.set_index(y.index, inplace = True)
        return X_vec
    
    def fit_transform(self, X, y):
        self.vec.fit(X)
        X_vec = self.vec.transform(X)
        X_vec = pd.DataFrame.sparse.from_spmatrix(X_vec)
        X_vec.columns = sorted(self.vec.vocabulary_)
        X_vec.set_index(y.index, inplace = True)
        return X_vec

## Count Vectorizer

In [56]:
cv = CountVectorizer(ngram_range = (1,1))
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

store      1113
new         802
austin      688
launch      620
app         598
amp         550
social      481
circle      468
popup       427
today       422
android     419
open        359
network     355
go          350
line        335
via         321
dtype: int64

In [14]:
cv = CountVectorizer(ngram_range = (2,2))
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

social network     338
popup store        325
new social         308
network call       239
call circle        222
major new          220
launch major       213
temporary store    187
possibly today     179
circle possibly    170
downtown austin    138
ûï mention         134
marissa mayer      132
store downtown     132
store austin       130
open popup         121
dtype: int64

In [201]:
cv = CountVectorizer(ngram_range = (3,3))
X_t_vec = cv.fit_transform(X_t)
X_t_vec  = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_t.index, inplace=True)

X_val_vec = cv.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

new social network          288
social network call         238
major new social            219
network call circle         215
launch major new            213
call circle possibly        168
circle possibly today       168
store downtown austin       117
open temporary store        108
temporary store downtown     77
popup store austin           64
open popup store             63
open popup shop              56
downtown austin launch       55
launch new social            49
rumor open temporary         48
dtype: int64

## Tfidf Vectorizer

In [202]:
tfidf = TfidfVectorizer(ngram_range = (1,1))
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

store      232.754934
new        164.683823
launch     151.630980
austin     139.152442
app        124.349935
social     121.266819
popup      117.843415
circle     114.139122
today      107.380222
open       107.017833
amp        104.570787
network    103.692296
via         90.282673
line        87.238168
call        85.940698
go          80.454228
dtype: float64

In [203]:
tfidf = TfidfVectorizer(ngram_range = (2,2))
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.mean(axis = 0).sort_values(ascending = False)[:16]

social network     0.012871
new social         0.012075
popup store        0.011281
network call       0.010188
call circle        0.009623
major new          0.009447
launch major       0.009276
possibly today     0.008182
temporary store    0.008133
circle possibly    0.007913
store austin       0.006158
downtown austin    0.005948
open popup         0.005938
store downtown     0.005905
open temporary     0.005599
ûï mention         0.004758
dtype: float64

In [204]:
tfidf = TfidfVectorizer(ngram_range = (3,3))
X_t_vec = tfidf.fit_transform(X_t)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(tfidf.vocabulary_)
X_t_vec.set_index(y_t.index, inplace = True)

X_val_vec = tfidf.transform(X_val)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(tfidf.vocabulary_)
X_val_vec.set_index(y_val.index, inplace=True)

X_t_vec.sum(axis = 0).sort_values(ascending = False)[:16]

new social network          80.137694
social network call         69.935329
network call circle         65.313997
major new social            64.712864
launch major new            63.743781
call circle possibly        54.022252
circle possibly today       54.005559
open temporary store        38.100215
store downtown austin       36.312710
temporary store downtown    27.969892
popup store austin          24.687708
open popup store            24.318210
downtown austin launch      23.002409
open popup shop             21.364177
launch new social           20.361676
rumor open temporary        19.746143
dtype: float64

In [25]:
X_t_vec

Unnamed: 0,aapl,aaron,ab,abacus,abba,aber,ability,able,abnormal,abound,...,zlf,zms,zomb,zombie,zomg,zone,zoom,zuckerberg,zynga,zzzs
3363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3204,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6298,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
873,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# count or tfidf
# n-gram range
# data