# CSML1010 Project Working Copy
# Sentiment Analysis with the Sentiment140 dataset
## Pete Gray

---------------------------------------------------------

# Import libraries

In [36]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=2, linewidth=80)
import warnings
warnings.filterwarnings("ignore")
import model_evaluation_utils as meu


# Adjust pandas display

In [2]:
pd.options.display.max_columns = 30
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.precision = 2
pd.options.display.max_colwidth = -1

# Import matplotlib and seaborn and adjust defaults

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
plt.rcParams['figure.dpi'] = 100

import seaborn as sns
sns.set_style("whitegrid")

## Read data from local filesystem and csv source 

In [4]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1")

Check data with quick visual inspection

In [5]:
df

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


## Give dataframe columns

In [6]:
df.columns = ['sentiment', 'ID', 'Time', 'none', 'username', 'Text']

In [7]:
df.count()

sentiment    1599999
ID           1599999
Time         1599999
none         1599999
username     1599999
Text         1599999
dtype: int64

## Now it has columns, this seems better.
#
## We have to cut this down to size, for iterative development.
##
## Don't forget to get rid of this!!! When crunching whole huge dataset.

In [8]:
df = df[750000:850000]
df.count()

sentiment    100000
ID           100000
Time         100000
none         100000
username     100000
Text         100000
dtype: int64

In [9]:
columns = [col for col in df.columns if not col.startswith('self')]
columns

['sentiment', 'ID', 'Time', 'none', 'username', 'Text']

In [10]:
raw_text = np.array(df['Text'])
sentiments = np.array(df['sentiment'])
raw_text[5:15]

array(['Back to office to empty aircon water tank  empty office gives too much time for reflection',
       '@ToxicMelvin Too late  However it works now. Am really happy!',
       "@exljbris it can't connect ",
       'Missing my 20yr old baby-moved to WA. ',
       "@SaulaSmurf How old's ur bro?? mine was 15 when it happened... he looked horrible after the accident ",
       'I miss caitlin already ',
       "I'm bored at the dr's office for my mommy. And I miss my jeremy. He works far away now. ",
       'nothing on fucking tv to watch. i hate not having my fucking iPod or iMac or my god damn fucking phone. im falling the fuck apart. ',
       "I'm gonna have to give away my dog.  &lt;3",
       'I forgot to eat cookies at the barn today  Travesty!'],
      dtype=object)

## Connect to database

In [11]:
#import sqlite3
#con = sqlite3.connect('documents.db')

## Save subreddit category info

In [12]:
#pd.read_csv("testdata.manual.2009.06.14.csv").to_sql("sentiment-labels", con)

# Data Cleaning

## Cleaning function

In [14]:
import re
def clean(s):
    s = s.replace(r'<lb>', "\n")
    s = s.replace(r'<tab>', "\i")
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("&amp;", "&")
    # markdown urls
    s = re.sub(r'\(https*://[^\)]*\)', "", s)
    # normal urls
    s = re.sub(r'https*://[^\s]*', "", s)
    s = re.sub(r'_+', ' ', s)
    s = re.sub(r'"+', '"', s)
    return str(s)

## Create new column in dataframe

In [15]:
df["selftext_clean"] = ''

# Iterate and clean

In [16]:
for i, row in df.iterrows():
    if i % 10000 == 0:
            print('processed:'.format(i), i)
    df.at[i, "selftext_clean"] = clean(row.Text)

processed: 750000
processed: 760000
processed: 770000
processed: 780000
processed: 790000
processed: 800000
processed: 810000
processed: 820000
processed: 830000
processed: 840000


Check results

In [17]:
df.head()

Unnamed: 0,sentiment,ID,Time,none,username,Text,selftext_clean
750000,0,2285370823,Mon Jun 22 15:02:49 PDT 2009,NO_QUERY,xbeautifulmessx,@Idristwilight You can post HAN when you want. It's great! I am still working on TLD though. I got a little distracted so sorry.,@Idristwilight You can post HAN when you want. It's great! I am still working on TLD though. I got a little distracted so sorry.
750001,0,2285371185,Mon Jun 22 15:02:51 PDT 2009,NO_QUERY,thefirstsight,"@rose_7 Ohh poor jan please tell her that if she cans, send us an email!!","@rose 7 Ohh poor jan please tell her that if she cans, send us an email!!"
750002,0,2285371495,Mon Jun 22 15:02:52 PDT 2009,NO_QUERY,Sarah2713,Finally home from work...It was a looong day!! And it's only Monday,Finally home from work...It was a looong day!! And it's only Monday
750003,0,2285371762,Mon Jun 22 15:02:54 PDT 2009,NO_QUERY,dierockerfrau,im very sad 4 chantelle and tom,im very sad 4 chantelle and tom
750004,0,2285372377,Mon Jun 22 15:02:57 PDT 2009,NO_QUERY,alexbates,I chatted with someone on the online Apple store and they said it would be better to buy a new one. I don't have $200 to waste,I chatted with someone on the online Apple store and they said it would be better to buy a new one. I don't have $200 to waste


In [21]:
# Sentiment Analysis with Afinn 

In [22]:
from afinn import Afinn

afn = Afinn(emoticons=True)

In [47]:
texts = np.array(df['Text'])
sentiments = np.array(df['sentiment'])

# extract data for model evaluation
train_texts = texts[:10000]
train_sentiments = sentiments[:10000]

test_texts = texts[45000:55000]
test_sentiments = sentiments[45000:55000]
sample_review_ids = [5626, 3533, 6010, 123, 2654, 4000]

In [48]:
for Text, sentiment in zip(test_texts[sample_review_ids], test_sentiments[sample_review_ids]):
    print('TEXT:', Text)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', afn.score(Text))
    print('-'*60)

TEXT: @Lee_Knight lmao! thanks Lee XD, would u like to join in our craziness as well lolol  ROFL come &amp; join we don't bite.. not hard anyway! ;)
Actual Sentiment: 4
Predicted Sentiment polarity: 19.0
------------------------------------------------------------
TEXT: @ItsJayRabBaby Oh no!!  RIP
Actual Sentiment: 0
Predicted Sentiment polarity: -1.0
------------------------------------------------------------
TEXT: I got sunburn today haha. i was outside the wholeee day SKATINGGG 
Actual Sentiment: 4
Predicted Sentiment polarity: 3.0
------------------------------------------------------------
TEXT: he stuck on my mind arrgh! please dont treat me like that 
Actual Sentiment: 0
Predicted Sentiment polarity: 1.0
------------------------------------------------------------
TEXT: I want my Mini 10v now....waiting takes too long. 
Actual Sentiment: 0
Predicted Sentiment polarity: 1.0
------------------------------------------------------------
TEXT: Charlie lost an angel today   Very sad 

In [51]:
# Predict sentiment with Afinn


sentiment_polarity = [afn.score(Text) for Text in test_texts]
#predicted_sentiments = ['positive' if score >= 1.0 else 'negative' for score in sentiment_polarity]
predicted_sentiments = [4 if score >= 1.0 else 0 for score in sentiment_polarity]


In [52]:
#meu.display_model_performance_metrics(true_labels=test_texts, predicted_labels=predicted_sentiments, 
#                                  classes=['positive', 'negative'])
meu.display_model_performance_metrics(true_labels=test_texts, predicted_labels=predicted_sentiments, 
                                  classes=[4, 0])

Model Performance metrics:
------------------------------
Accuracy: 0.0


ValueError: Mix of label input types (string and number)

#### NLP

## Load spaCy

In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')

## Iterate over all rows and perform NLP

In [19]:
for i, row in df.iterrows():
    if i % 1000 == 0:
        print(i)
    if(row["selftext_clean"] and len(str(row["selftext_clean"])) < 1000000):
        doc = nlp(str(row["selftext_clean"]))
        adjectives = []
        nouns = []
        verbs = []
        lemmas = []

        for token in doc:
            lemmas.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                nouns.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
                
        df.at[i, "selftext_lemma"] = " ".join(lemmas)                
        df.at[i, "selftext_nouns"] = " ".join(nouns)
        df.at[i, "selftext_adjectives"] = " ".join(adjectives)
        df.at[i, "selftext_verbs"] = " ".join(verbs)
        df.at[i, "selftext_nav"] = " ".join(nouns+adjectives+verbs)
        df.at[i, "no_tokens"] = len(lemmas)

750000
751000
752000
753000
754000
755000
756000
757000
758000
759000
760000
761000
762000
763000


KeyboardInterrupt: 

## Check results

In [20]:
df.head()

Unnamed: 0,sentiment,ID,Time,none,username,Text,selftext_clean,selftext_lemma,selftext_nouns,selftext_adjectives,selftext_verbs,selftext_nav,no_tokens
750000,0,2285370823,Mon Jun 22 15:02:49 PDT 2009,NO_QUERY,xbeautifulmessx,@Idristwilight You can post HAN when you want. It's great! I am still working on TLD though. I got a little distracted so sorry.,@Idristwilight You can post HAN when you want. It's great! I am still working on TLD though. I got a little distracted so sorry.,@Idristwilight -PRON- can post HAN when -PRON- want . -PRON- be great ! -PRON- be still work on TLD though . -PRON- get a little distracted so sorry .,@Idristwilight HAN TLD,great distracted sorry,can post want work get,@Idristwilight HAN TLD great distracted sorry can post want work get,29.0
750001,0,2285371185,Mon Jun 22 15:02:51 PDT 2009,NO_QUERY,thefirstsight,"@rose_7 Ohh poor jan please tell her that if she cans, send us an email!!","@rose 7 Ohh poor jan please tell her that if she cans, send us an email!!","@rose 7 Ohh poor jan please tell -PRON- that if -PRON- can , send -PRON- an email ! !",Ohh jan email,poor,tell can send,Ohh jan email poor tell can send,20.0
750002,0,2285371495,Mon Jun 22 15:02:52 PDT 2009,NO_QUERY,Sarah2713,Finally home from work...It was a looong day!! And it's only Monday,Finally home from work...It was a looong day!! And it's only Monday,finally home from work ... -PRON- be a looong day ! ! and -PRON- be only Monday,work day Monday,looong,,work day Monday looong,17.0
750003,0,2285371762,Mon Jun 22 15:02:54 PDT 2009,NO_QUERY,dierockerfrau,im very sad 4 chantelle and tom,im very sad 4 chantelle and tom,-PRON- be very sad 4 chantelle and tom,chantelle tom,sad,be,chantelle tom sad be,8.0
750004,0,2285372377,Mon Jun 22 15:02:57 PDT 2009,NO_QUERY,alexbates,I chatted with someone on the online Apple store and they said it would be better to buy a new one. I don't have $200 to waste,I chatted with someone on the online Apple store and they said it would be better to buy a new one. I don't have $200 to waste,-PRON- chat with someone on the online Apple store and -PRON- say -PRON- would be well to buy a new one . -PRON- do not have $ 200 to waste,Apple store,online well new,chat say would buy waste,Apple store online well new chat say would buy waste,31.0


## Save to database

In [None]:
df.to_sql('posts_nlp', con)