# CSML1010 Project Working Copy
# Sentiment Analysis with the Sentiment140 dataset
## Pete Gray

---------------------------------------------------------

# Import libraries

In [0]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Adjust pandas display

In [0]:
pd.options.display.max_columns = 30
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.precision = 2
pd.options.display.max_colwidth = -1

# Import matplotlib and seaborn and adjust defaults

In [0]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
plt.rcParams['figure.dpi'] = 100

import seaborn as sns
sns.set_style("whitegrid")

## Read data from local filesystem and csv source 

In [0]:
df = pd.read_csv("training.50000.processed.noemoticon.csv")

Check data with quick visual inspection

In [62]:
df

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
49995,4,1556381057,Sat Apr 18 22:37:08 PDT 2009,NO_QUERY,ethanattack,Yup. Going to sleep. Night guys.
49996,4,1556381076,Sat Apr 18 22:37:06 PDT 2009,NO_QUERY,GinaLaGuardia,"@MsUnitedStates Thank you so, so, so, so much for supporting us tonight. Looking forward to a new friendship! #gnonyc"
49997,4,1556381131,Sat Apr 18 22:37:09 PDT 2009,NO_QUERY,CarolCalazans,@tommcfly Hey Tom How are you? http://twitpic.com/3eumv Comment please
49998,4,1556381147,Sat Apr 18 22:37:07 PDT 2009,NO_QUERY,KKVegas,Teasing and Mack 1-0 are taking Vegas by storm with another night of troublesome fun


In [63]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
0                                                                                                                      50000 non-null int64
1467810369                                                                                                             50000 non-null int64
Mon Apr 06 22:19:45 PDT 2009                                                                                           50000 non-null object
NO_QUERY                                                                                                               49999 non-null object
_TheSpecialOne_                                                                                                        49999 non-null object
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D    49999 non-null object
dtypes: int64(2), object(4)
memory usage: 17.4 M

In [64]:
df.count()

0                                                                                                                      50000
1467810369                                                                                                             50000
Mon Apr 06 22:19:45 PDT 2009                                                                                           50000
NO_QUERY                                                                                                               49999
_TheSpecialOne_                                                                                                        49999
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D    49999
dtype: int64

## This dataframe doesn't seem to have columns. Let's give it columns.

In [0]:
df.columns = ['face', 'ID', 'Time', 'none', 'username', 'Text']

In [67]:
df.count()

face        50000
ID          50000
Time        50000
none        49999
username    49999
Text        49999
dtype: int64

## Now it has columns, this seems better.

In [68]:
columns = [col for col in df.columns if not col.startswith('self')]
columns

['face', 'ID', 'Time', 'none', 'username', 'Text']

## Connect to database

In [0]:
#import sqlite3
#con = sqlite3.connect('documents.db')

## Save subreddit category info

In [0]:
#pd.read_csv("testdata.manual.2009.06.14.csv").to_sql("sentiment-labels", con)

  method=method,


# Data Cleaning

In [69]:
df.dtypes

face        int64 
ID          int64 
Time        object
none        object
username    object
Text        object
dtype: object

In [70]:
df.columns

Index(['face', 'ID', 'Time', 'none', 'username', 'Text'], dtype='object')

In [71]:
df.axes

[RangeIndex(start=0, stop=50000, step=1),
 Index(['face', 'ID', 'Time', 'none', 'username', 'Text'], dtype='object')]

In [72]:
df.index

RangeIndex(start=0, stop=50000, step=1)

In [73]:
df.size

300000

In [74]:
df.shape

(50000, 6)

In [75]:
print(df.iloc[312][5])

Poor Joshy is sick???   those damn tejanos!


In [76]:
print(df.iat[312,5])

Poor Joshy is sick???   those damn tejanos!


In [79]:
print(df.iloc['Text'][5])

TypeError: ignored

## Cleaning function

In [0]:
import re
def clean(s):
    s = s.replace(r'<lb>', "\n")
    s = s.replace(r'<tab>', "\i")
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("&amp;", "&")
    # markdown urls
    s = re.sub(r'\(https*://[^\)]*\)', "", s)
    # normal urls
    s = re.sub(r'https*://[^\s]*', "", s)
    s = re.sub(r'_+', ' ', s)
    s = re.sub(r'"+', '"', s)
    return str(s)

## Create new column in dataframe

In [0]:
df["selftext_clean"] = ''

# Iterate and clean

In [0]:
for i, row in df.iterrows():
    df.at[i, "selftext_clean"] = clean(row.selftext)

AttributeError: ignored

Check results

In [0]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",selftext_clean
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,


# NLP

## Load spaCy

In [0]:
import spacy
nlp = spacy.load('en')

## Iterate over all rows and perform NLP

In [0]:
for i, row in df.iterrows():
    if i % 1000 == 0:
        print(i)
    if(row["selftext_clean"] and len(str(row["selftext_clean"])) < 1000000):
        doc = nlp(str(row["selftext_clean"]))
        adjectives = []
        nouns = []
        verbs = []
        lemmas = []

        for token in doc:
            lemmas.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                nouns.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
                
        df.at[i, "selftext_lemma"] = " ".join(lemmas)                
        df.at[i, "selftext_nouns"] = " ".join(nouns)
        df.at[i, "selftext_adjectives"] = " ".join(adjectives)
        df.at[i, "selftext_verbs"] = " ".join(verbs)
        df.at[i, "selftext_nav"] = " ".join(nouns+adjectives+verbs)
        df.at[i, "no_tokens"] = len(lemmas)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000


## Check results

In [0]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",selftext_clean
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,


## Save to database

In [0]:
df.to_sql('posts_nlp', con)