In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import re
from pre_processing import tweet_preprocessing

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.dummy import DummyClassifier

import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
# Probe cell: displays the module name this notebook's code runs under.
__name__

'__main__'

In [2]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
RAW_DATA_PATH = "judge-1377884607_tweet_product_company.csv"
raw_df = pd.read_csv(RAW_DATA_PATH)

In [3]:
# Peek at the first rows to see the raw column names and label values.
raw_df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
# Check dtypes and non-null counts per column before cleaning.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [5]:
# Keep only the tweet text and its label, and give the label column a short name.
df = (
    raw_df
    .drop(columns=['emotion_in_tweet_is_directed_at'])
    .rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'sentiment'})
)

In [6]:
# Confirm the rename/drop produced the two expected columns.
df.head()

Unnamed: 0,tweet_text,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [7]:
# Numeric encoding for the text labels: neutral -> 0, positive -> 1, negative -> -1.
# "I can't tell" is encoded as NaN so that those rows can be removed with dropna().
sentiment_value_codes = dict([
    ('No emotion toward brand or product', 0),
    ('Positive emotion', 1),
    ('Negative emotion', -1),
    ("I can't tell", np.nan),
])

In [8]:
# Encode the text labels as numbers using `sentiment_value_codes`; labels mapped
# to NaN (and any label missing from the dict) end up as NaN.
# The source is the untouched raw column rather than df['sentiment'] itself, so
# re-running this cell is harmless: mapping already-encoded values would match
# no dictionary key and turn every label into NaN. Assignment aligns on the index.
df['sentiment'] = (
    raw_df['is_there_an_emotion_directed_at_a_brand_or_product']
    .map(sentiment_value_codes)
)

In [9]:
# Drop every row that has a missing value in any column, then store the
# label compactly as an 8-bit integer.
df = df.dropna(axis=0, how='any').astype({'sentiment': np.int8})

In [10]:
# Verify that no nulls remain and that `sentiment` is now int8.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8936 entries, 0 to 9092
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  8936 non-null   object
 1   sentiment   8936 non-null   int8  
dtypes: int8(1), object(1)
memory usage: 148.4+ KB


## Text Pre-Processing

In [11]:
# One-time setup: uncomment and run if the NLTK stopwords / wordnet data
# are not already installed on this machine.
# nltk.download("stopwords")
# nltk.download('wordnet')

In [12]:
# Preview the cleaning step on the first tweet of the filtered data
sample_doc = df['tweet_text'].iloc[0]

print(tweet_preprocessing(sample_doc))

iphone hr tweeting riseaustin dead need upgrade plugin station


In [13]:
# Run the project's tweet_preprocessing helper on every tweet; the result keeps df's index.
tweet_corpus = df['tweet_text'].apply(tweet_preprocessing)

## Count Vectorization

In [14]:
# Bag-of-words vectorizer; min_df=20 ignores terms found in fewer than 20 tweets.
vec = CountVectorizer(min_df=20)

In [15]:
# Learn the vocabulary from the cleaned tweets and build the sparse document-term matrix.
# NOTE(review): the vectorizer is fitted on the whole corpus, before train_test_split is
# called, so the vocabulary and min_df filtering also see the held-out tweets. Consider
# fitting on the training portion only (e.g. via a Pipeline) — TODO confirm impact.
X = vec.fit_transform(tweet_corpus)
# Target vector: the integer-encoded sentiment labels.
y = df['sentiment']

In [16]:
# Dense copy of the document-term matrix, one column per vocabulary term (inspection only).
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so use its
# replacement when the installed version provides it and fall back otherwise.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
vec_df = pd.DataFrame(X.toarray(), columns=feature_names)
vec_df.head()

Unnamed: 0,able,access,aclu,action,actually,air,almost,already,also,always,...,work,working,world,would,wow,year,yes,yesterday,yet,zazzlesxsw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Persist the fitted vectorizer so new text can be encoded with the same vocabulary later.
filename = 'pickled_vectorizer.sav'
# The context manager guarantees the handle is flushed and closed, even if dump() raises.
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vec, vectorizer_file)

# Modeling

In [17]:
# Hold out 20% of the rows as the final test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12345
)

In [18]:
# Split the training portion again: 80% to fit candidate models, 20% to validate them.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=6789
)

In [19]:
# Metrics collected by every cross_validate call below.
scoring = [
    'precision_macro',
    'recall_macro',
    'accuracy',
]

## Baseline Dummy

In [20]:
# Baseline that ignores the features and always predicts the most frequent training class.
# The strategy is spelled out because the library default differs between scikit-learn
# releases; 'prior' matches the results recorded in this notebook.
dummy = DummyClassifier(strategy='prior')

In [21]:
# 3-fold cross-validation of the baseline on the training portion.
cv_dummy = cross_validate(
    estimator=dummy,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=3,
)
print(cv_dummy)

{'fit_time': array([0.00099635, 0.00099707, 0.0009973 ]), 'score_time': array([0.00269938, 0.00199461, 0.00099707]), 'test_precision_macro': array([0.19960834, 0.19974822, 0.19983207]), 'test_recall_macro': array([0.33333333, 0.33333333, 0.33333333]), 'test_accuracy': array([0.59882501, 0.59924465, 0.59949622])}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Fit the baseline on the inner training split and report per-class metrics on validation.
# fit() returns the estimator itself, so predict() can be chained onto it.
y_val_pred = dummy.fit(X_tr, y_tr).predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        92
           0       0.59      1.00      0.74       845
           1       0.00      0.00      0.00       493

    accuracy                           0.59      1430
   macro avg       0.20      0.33      0.25      1430
weighted avg       0.35      0.59      0.44      1430



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Multinomial Naive Bayes

In [29]:
# Multinomial Naive Bayes on the term counts; alpha=1.0 is the library's default smoothing.
mnb = MultinomialNB(alpha=1.0)

In [30]:
# 3-fold cross-validation of Naive Bayes on the training portion, same metrics as the baseline.
cv_mnb = cross_validate(
    estimator=mnb,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=3,
)
print(cv_mnb)

{'fit_time': array([0.00199699, 0.00099707, 0.0009973 ]), 'score_time': array([0.00199485, 0.00202918, 0.00199461]), 'test_precision_macro': array([0.51962236, 0.48610076, 0.5093112 ]), 'test_recall_macro': array([0.51578429, 0.4830763 , 0.50384096]), 'test_accuracy': array([0.64456567, 0.61225346, 0.62384551])}


In [31]:
# Fit Naive Bayes on the inner training split and report per-class metrics on validation.
# fit() returns the estimator itself, so predict() can be chained onto it.
y_val_pred = mnb.fit(X_tr, y_tr).predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.30      0.36      0.33        92
           0       0.72      0.73      0.72       845
           1       0.57      0.54      0.55       493

    accuracy                           0.64      1430
   macro avg       0.53      0.54      0.53      1430
weighted avg       0.64      0.64      0.64      1430



# Final Validation

In [35]:
# Final evaluation: train on the full training portion and score once on the test set.
# A fresh estimator is created instead of aliasing `mnb`. With an alias, fitting here
# would silently overwrite the model fitted on X_tr, and re-running the validation
# cell afterwards would in turn change the model that gets pickled.
final_model = MultinomialNB()
final_model.fit(X_train, y_train)
y_test_pred = final_model.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.20      0.26      0.22        80
           0       0.73      0.74      0.73      1105
           1       0.55      0.50      0.52       603

    accuracy                           0.64      1788
   macro avg       0.49      0.50      0.49      1788
weighted avg       0.64      0.64      0.64      1788



## Pickle Model

In [36]:
# Persist the fitted classifier so it can be reused without retraining.
filename = 'pickled_model.sav'
# The context manager guarantees the handle is flushed and closed, even if dump() raises.
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)

# To reload later (only unpickle files you trust; pickle can execute arbitrary code):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

## Testing

In [44]:
# Smoke test: push one hand-written tweet through preprocessing -> vectorizer -> model.
raw_tweet = "I love cheese"
prepped_tweet = tweet_preprocessing(raw_tweet)
# transform (not fit_transform), so the tweet is encoded with the already-fitted vocabulary.
vectorized_tweet = vec.transform([prepped_tweet])
prediction = final_model.predict(vectorized_tweet)
# Displays the predicted label code (keys are defined in sentiment_value_codes).
prediction

array([1], dtype=int8)