In [1]:
import pandas as pd

In [3]:
# Column names for the CSV. Supplying them through `names=` makes pandas treat
# every line of the file, including the first one, as data.
header_list = [
    'target',
    "twitter_id",
    'date',
    'flag',
    'user',
    'text',
]
df = pd.read_csv(
    '../sentiment140.csv',
    names=header_list,
    encoding='iso-8859-1',
)

Content
It contains the following 6 fields:

target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

ids: the id of the tweet (2087); this field is named `twitter_id` in this notebook

date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

flag: The query (lyx). If there is no query, then this value is NO_QUERY.

user: the user that tweeted (robotickilldozr)

text: the text of the tweet (Lyx is cool)

In [17]:
df.head()

Unnamed: 0,target,twitter_id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [18]:
# Sanity check: select the rows whose tweet text is exactly the empty string.
df[df['text']==""]

Unnamed: 0,target,twitter_id,date,flag,user,text


In [19]:
# Per-column flag: True when that column contains at least one missing value.
df.isnull().any()

target        False
twitter_id    False
date          False
flag          False
user          False
text          False
dtype: bool

In [20]:
df.shape

(1600000, 6)

## Separate the depressed tweets from the rest of the data after the LDA allocation

In [5]:
# extra_index: its `index_depressed` column is used further down as row labels into `df`.
extra_index = pd.read_csv('extra_depressed.csv')
# real_depressed: tweets with `message` and `label` columns; only label == 1 rows are kept later.
real_depressed = pd.read_csv('sentiment_tweets3.csv')

In [8]:
# Remove the row labelled 10313 from the depressed-tweets frame.
# NOTE(review): the reason this particular row is excluded is not recorded here - confirm.
# Rebinding the name (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError for the missing label.
real_depressed = real_depressed.drop(index=10313, errors='ignore')

In [11]:
# Keep only the rows labelled 1 and restrict the frame to the message and label columns,
# selecting rows and columns in a single .loc call.
real_depressed = real_depressed.loc[real_depressed['label'] == 1, ['message', 'label']]

In [12]:
real_depressed

Unnamed: 0,message,label
8000,The lack of this understanding is a small but ...,1
8001,i just told my parents about my depression and...,1
8002,depression is something i don't speak about ev...,1
8003,Made myself a tortilla filled with pb&j. My de...,1
8004,@WorldofOutlaws I am gonna need depression med...,1
...,...,...
10308,Many sufferers of depression aren't sad; they ...,1
10309,No Depression by G Herbo is my mood from now o...,1
10310,What do you do when depression succumbs the br...,1
10311,Ketamine Nasal Spray Shows Promise Against Dep...,1


In [13]:
extra_index.head()

Unnamed: 0.1,Unnamed: 0,index_depressed
0,0,798
1,1,4285
2,2,5868
3,3,14375
4,4,24364


In [15]:
# Pull the Sentiment140 rows whose labels are listed in extra_index['index_depressed'],
# keeping only the text and polarity columns.
# A single .loc[rows, cols] replaces the chained df.loc[rows][cols], and .copy() makes the
# result an independent frame, so assigning to one of its columns later does not trigger
# pandas' SettingWithCopyWarning.
sentiment_140_depressed = df.loc[extra_index['index_depressed'], ['text', 'target']].copy()

In [20]:
# Overwrite the Sentiment140 polarity with 1 for every selected row, the same value
# that real_depressed uses as its label for the rows kept there.
sentiment_140_depressed['target'] = 1

In [18]:
# Give real_depressed the same column names as sentiment_140_depressed (text, target)
# by mapping each old name to its new one explicitly.
real_depressed = real_depressed.rename(columns={'message': 'text', 'label': 'target'})

In [21]:
real_depressed.head()

Unnamed: 0,text,target
8000,The lack of this understanding is a small but ...,1
8001,i just told my parents about my depression and...,1
8002,depression is something i don't speak about ev...,1
8003,Made myself a tortilla filled with pb&j. My de...,1
8004,@WorldofOutlaws I am gonna need depression med...,1


In [22]:
sentiment_140_depressed.head()

Unnamed: 0,text,target
798,My stress always culminates with physical pain.,1
4285,@bekibutton Its very tough isn't it I'm virtua...,1
5868,@mistressmatisse That link isn't working http...,1
14375,Oh no.. post camp depression is kicking in,1
24364,I need a pot noodle. Random but true! Also i'm...,1


In [23]:
# Stack both sources of depressed tweets into one frame.
# NOTE(review): the original row labels are kept (no ignore_index), so labels coming from
# the two sources may overlap - confirm nothing downstream relies on a unique index.
depressed_data = pd.concat([real_depressed, sentiment_140_depressed])

In [25]:
# Remove from df every row that was moved into sentiment_140_depressed.
# A boolean mask built with Index.isin keeps the remaining rows in their existing order
# (a Python set difference gives no ordering guarantee) and avoids building two large
# Python sets of row labels.
keep_mask = ~df.index.isin(sentiment_140_depressed.index)
# no_depressed remains a plain list of the surviving row labels.
no_depressed = df.index[keep_mask].tolist()
df = df.loc[keep_mask, :]

In [27]:
# Sentiment140 with the depressed posts removed
df.head()

Unnamed: 0,target,twitter_id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [28]:
# All depressed Twitter posts: real_depressed plus the rows taken from Sentiment140
depressed_data.head()

Unnamed: 0,text,target
8000,The lack of this understanding is a small but ...,1
8001,i just told my parents about my depression and...,1
8002,depression is something i don't speak about ev...,1
8003,Made myself a tortilla filled with pb&j. My de...,1
8004,@WorldofOutlaws I am gonna need depression med...,1


## Tokenize tweets with NLTK

In [21]:
from nltk.tokenize import word_tokenize

In [22]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/student.unimelb.edu.au/jprasetiyo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
# Initialise the tokenized_text column from the raw tweet text.
df['tokenized_text'] = df['text']

0          @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          is upset that he can't update his Facebook by ...
2          @Kenichan I dived many times for the ball. Man...
3            my whole body feels itchy and like its on fire 
4          @nationwideclass no, it's not behaving at all....
                                 ...                        
1599995    Just woke up. Having no school is the best fee...
1599996    TheWDB.com - Very cool to hear old Walt interv...
1599997    Are you ready for your MoJo Makeover? Ask me f...
1599998    Happy 38th Birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @theNSPCC @SparksCharity...
Name: tokenized_text, Length: 1600000, dtype: object

In [25]:
# Lower-case each tweet and split it into tokens with NLTK's word_tokenize.
# The input is read from df['text'] (tokenized_text holds that same text at this point),
# which makes the cell safe to re-run: reading from its own output column meant a second
# execution called .lower() on lists of tokens and raised AttributeError.
df['tokenized_text'] = df['text'].apply(lambda tweet: word_tokenize(tweet.lower()))

In [26]:
df.head()

Unnamed: 0,target,twitter_id,date,flag,user,text,tokenized_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@, switchfoot, http, :, //twitpic.com/2y1zl, ..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, ca, n't, update, his, fa..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@, kenichan, i, dived, many, times, for, the,..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@, nationwideclass, no, ,, it, 's, not, behav..."


In [27]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# Hand-written stop-word list (plus "http") applied to the tokenized tweets.
# NOTE(review): word_tokenize splits contractions (the df.head() output shows "can't" as
# 'ca', "n't"), so apostrophe entries such as "don't" appear unable to match any token -
# confirm whether the split forms should be listed instead.
stop_words = [ 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",'these', 'those', 'am', 'is', 'was', 'were',
 'be','been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',
 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
 'through', 'during', 'before','after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on','off', 'over',
 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
 'few', 'more', 'most', 'other', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn',
  "isn't", 'ma', 'mightn',"mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
 "wasn't", 'weren', "weren't", 'won',"won't", 'wouldn',"wouldn't", "http"]

# Membership is tested once per token of every tweet; a frozenset makes each test
# constant-time instead of a scan through the whole list. stop_words itself stays a list.
stop_word_set = frozenset(stop_words)

# remove stop words
df['tokenized_text'] = df["tokenized_text"].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student.unimelb.edu.au/jprasetiyo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# keep only purely alphabetic tokens: str.isalpha() is False for anything containing
# digits or punctuation, so tokens such as "n't" or "//twitpic.com/2y1zl" are dropped
# NOTE(review): this comment previously said "non alphanumeric", but isalpha() also
# removes tokens that contain digits; isalnum() would keep them - confirm the intent.
df['tokenized_text'] = df["tokenized_text"].apply(lambda x: [word for word in x if word.isalpha()])

In [14]:
# Write the current frame to a pickle file one directory above the notebook.
df.to_pickle("../sentiment140_tokenized.pickle")

In [10]:
#df.to_csv("sentiment140_tokenized.csv", index=False)

In [11]:
#df_token = pd.read_csv('sentiment140_tokenized.csv')

In [55]:
df

Unnamed: 0,target,twitter_id,date,flag,user,text,tokenized_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, http, awww, bummer, you, shoulda,..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, he, ca, update, his, facebook, texting..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, i, dived, many, times, ball, manage..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, like, its, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, it, behaving, i, mad, i, i, ..."
...,...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...,"[woke, school, best, feeling, ever]"
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,"[cool, hear, old, walt, interviews, http]"
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,"[are, you, ready, your, mojo, makeover, ask, m..."
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, my, boo, alll, time, t..."
