In [167]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import re
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [7]:
# Load the raw labelled tweets; the path is relative, so the CSV is looked up from the
# current working directory.
raw_df = pd.read_csv(filepath_or_buffer="judge-1377884607_tweet_product_company.csv")

In [8]:
# Preview the first rows of the raw data
raw_df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [24]:
# Summarize column dtypes and non-null counts to spot missing values
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [35]:
# Keep only the tweet text and its label, giving the label column a short name
df = (
    raw_df
    .drop(columns='emotion_in_tweet_is_directed_at')
    .rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'})
)

In [36]:
# Preview the trimmed frame (tweet text plus sentiment label)
df.head()

Unnamed: 0,tweet_text,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [37]:
# Numeric code for each raw sentiment label; the ambiguous label maps to NaN
sentiment_value_codes = {
    'No emotion toward brand or product': 0,
    'Positive emotion': 1,
    'Negative emotion': -1,
    "I can't tell": np.nan,
}

In [38]:
# Encode the labels from the untouched raw column instead of from df['sentiment'] itself.
# Mapping df['sentiment'] onto itself is not re-runnable: on a second execution the column
# already holds numbers, none of which are keys of sentiment_value_codes, so Series.map
# would turn every row into NaN. Assignment aligns on the index, which df shares with raw_df.
df['sentiment'] = raw_df['is_there_an_emotion_directed_at_a_brand_or_product'].map(sentiment_value_codes)

In [41]:
# Discard every row that has a missing value in any column, then store the
# sentiment codes compactly as 8-bit integers (possible once no NaN remains).
df = df.dropna(axis=0, how='any')
df = df.astype({'sentiment': 'int8'})

In [42]:
# Confirm that no nulls remain and that sentiment is stored as int8
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8936 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  8936 non-null   object
 1   sentiment   8936 non-null   int8  
dtypes: int8(1), object(1)
memory usage: 148.4+ KB


## Text Pre-Processing

In [44]:
# Fetch the NLTK data that tweet_preprocessing relies on: the English stop word list and
# the WordNet corpus behind WordNetLemmatizer. Keeping these calls active lets the notebook
# run on a fresh environment instead of failing with a LookupError.
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [177]:
# Matches every run of characters that are not ASCII letters; compiled once, not per call.
_NON_LETTER_RUN = re.compile('[^a-zA-Z]+')

# Cache for the default NLTK resources so they are loaded at most once, on first use.
_TWEET_DEFAULTS = {}


def _default_tweet_stops():
    '''Return the NLTK English stop word list, reading the corpus only on first use.'''
    if 'stops' not in _TWEET_DEFAULTS:
        _TWEET_DEFAULTS['stops'] = stopwords.words('english')
    return _TWEET_DEFAULTS['stops']


def _default_tweet_lemmatizer():
    '''Return a shared WordNetLemmatizer, created only on first use.'''
    if 'lemmatizer' not in _TWEET_DEFAULTS:
        _TWEET_DEFAULTS['lemmatizer'] = nltk.stem.WordNetLemmatizer()
    return _TWEET_DEFAULTS['lemmatizer']


def tweet_preprocessing(tweet, stops=None, lemmatizer=None):
    '''
    Preprocesses a tweet for sentiment analysis using a given list of stop words and lemmatizer

    Steps: lowercase, split on whitespace and apostrophes, drop tokens containing '@'
    (usernames), strip every non-letter character, drop stop words and words of two
    characters or fewer, lemmatize what is left.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Anything that is not a string (e.g. None or NaN) yields "".
    stops : iterable of str, optional
        Stop words to remove. Defaults to NLTK's English stop words. 'sxsw' is always
        added to whatever is supplied.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to ``nltk.stem.WordNetLemmatizer()``.

    Returns
    -------
    str
        The surviving lemmatized words joined by single spaces.
    '''
    # Missing text has nothing to contribute; return an empty document instead of crashing
    if not isinstance(tweet, str):
        return ""

    # Defaults are resolved lazily so defining this function does not require the NLTK
    # corpora to be present, and so they are not rebuilt on every call
    if stops is None:
        stops = _default_tweet_stops()
    if lemmatizer is None:
        lemmatizer = _default_tweet_lemmatizer()

    # A set gives constant-time membership tests and accepts any iterable of stop words
    stop_set = set(stops)
    stop_set.add('sxsw')

    # Remove case, then tokenize on apostrophes and on ANY whitespace. Splitting on " " alone
    # would leave words separated by newlines/tabs glued together once non-letters are stripped.
    tokens = tweet.lower().replace("'", " ").split()

    # Create processed tweet
    proc_tweet = []

    for token in tokens:
        # Remove usernames
        if '@' in token:
            continue

        # Keep only characters
        clean_word = _NON_LETTER_RUN.sub('', token)

        # Remove stopwords and words 2 chars or less
        if len(clean_word) <= 2 or clean_word in stop_set:
            continue

        # Lemmatize (applied after the filters, so a lemma may be shorter than 3 characters)
        clean_word = lemmatizer.lemmatize(clean_word)

        # Keep the clean word if it's not empty
        if clean_word:
            proc_tweet.append(clean_word)

    # Return string version of tweet
    return " ".join(proc_tweet)

In [178]:
# Run the preprocessor on the first tweet as a quick sanity check
sample_doc = df['tweet_text'].iloc[0]

print(tweet_preprocessing(sample_doc))

iphone hr tweeting riseaustin dead need upgrade plugin station


## TF-IDF Vectorization

In [179]:
# TF-IDF vectorizer with scikit-learn's default settings
vec = TfidfVectorizer()

In [180]:
# Clean every tweet, producing one space-joined string of tokens per row
tweet_corpus = df['tweet_text'].map(tweet_preprocessing)

In [181]:
# Learn the vocabulary and IDF weights from the cleaned tweets and build the feature matrix.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the train/test
# split performed later in the notebook, so rows that end up in the test set also shape the
# vocabulary and IDF weights. Consider splitting first and fitting on the training rows only.
X = vec.fit_transform(tweet_corpus)
# Target: the integer sentiment codes (keeps df's index)
y = df['sentiment']

In [182]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2. Use its
# replacement when this scikit-learn version provides it, and fall back otherwise so the
# cell runs on both old and new releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# NOTE: X.toarray() materializes the whole sparse matrix as a dense array, which can be
# memory-hungry for a large vocabulary.
vec_df = pd.DataFrame(X.toarray(), columns=feature_names)
vec_df.head()

Unnamed: 0,aapl,aaron,abacus,abandoned,abba,abc,aber,ability,able,abnormal,...,zms,zomb,zombie,zomg,zone,zoom,zuckerberg,zuckerberglink,zynga,zzzs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling

In [168]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,
)