In [1]:
import re, math
import operator
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedShuffleSplit
from random import randint
import ipynb.fs.defs.PeopleInfo as peopleInfo
from nltk.corpus import stopwords
from nltk import TweetTokenizer
from nltk import PorterStemmer
# import ipynb.fs.defs.TweetTextHandler as tweetTextHandler
import ipynb.fs.defs.FilterMethods as filterMethods
import sys, os
sys.path.append('../2_feature')
import ipynb.fs.defs.GetFeatures as getFeatures
global stop_words
stop_words = stopwords.words('english')
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from IPython.display import display, Image
plotly.tools.set_credentials_file(username='Adeline', api_key='Z5eltNtBQXqvI05ZFQtz')
# import plotly.offline as offline
# offline.init_notebook_mode(connected=True)



# Text Preprocessing

In [2]:
def replace_by_symbols(txt):
    txt = re.sub(r"https\S+", '', txt)
    txt = re.sub(r"http\S+", '', txt)
    txt = re.sub(r"pic.twitter.com\S+", '', txt)
    txt = re.sub(r"twitter.com/\S+", '', txt)
    txt = re.sub(r"\S+/\S+", '', txt)
    txt = re.sub(r"@\S+", '', txt)
    txt = re.sub(r"#\S+", '', txt)
    txt = re.sub(r"idk", 'i do not know', txt)   # idk: i don't know
    txt = re.sub(r"tho", 'though', txt)   # tho
    txt = re.sub(r"i\'m", 'i am', txt)
    txt = re.sub(r"you\'re", 'you are', txt)
    txt = re.sub(r"he\'s", 'he is', txt)
    txt = re.sub(r"she\'s", 'she is', txt)
    txt = re.sub(r"it\'s", 'it is', txt)
    txt = re.sub(r"we\'re", 'we are', txt)
    txt = re.sub(r"they\'re", 'they are', txt)
    txt = re.sub(r"isn\'t", 'is not', txt)
    txt = re.sub(r"don\'t", 'do not', txt)
    txt = re.sub(r"doesn\'t", 'does not', txt)
    txt = re.sub(r"didn\'t", 'did not', txt)
    txt = re.sub(r"wasn\'t", 'was not', txt)
    txt = re.sub(r"weren\'t", 'were not', txt)
    txt = re.sub(r"haven\'t", 'have not', txt)
    txt = re.sub(r"can\'t", 'can not', txt)
    txt = re.sub(r"couldn\'t", 'could not', txt)
    txt = re.sub(r"wouldn\'t", 'would not', txt)
    txt = re.sub(r"shouldn\'t", 'should not', txt)
    txt = re.sub(r"&amp", '', txt)
    return txt

In [3]:
def tokenize(text):
    text_list = re.findall('(?u)\\b[a-zA-Z]\\w{0,}\\b', text)
    return text_list

# Read LIWC Dictionary

In [78]:
liwc_category_dict = dict()
liwc_word_dict = dict()
with open('/home/adeline/Documents/Depression_Research/LIWC2007 Documents/Dictionaries/LIWC2007_English080730.dic') as open_file:
    raw_data = open_file.readlines()
    for row in raw_data[1:65]:
        row = row.strip().split('\t')
        liwc_category_dict[row[0]] = row[1]
    for row in raw_data[66:4553]:
        row = row.strip().split('\t')
        if row[0] in stop_words:
            liwc_word_dict[row[0]] = row[1:]
        elif '*' in row[0] and row[0][:-1] in stop_words:
            liwc_word_dict[row[0]] = row[1:]
        else:
            continue

In [79]:
liwc_word_dict

{'a': ['1', '10'],
 'about': ['1', '16', '17'],
 'above': ['1', '17', '252', '250'],
 'after': ['1', '17', '253', '250'],
 'again': ['1', '16', '253', '250'],
 'against': ['1', '17'],
 'all': ['1', '20', '131', '136'],
 'am': ['11', '1', '12', '14'],
 'an': ['1', '10'],
 'and': ['1', '18', '131', '138'],
 'any': ['1', '20', '131', '135'],
 'are': ['11', '1', '12', '14'],
 'as': ['1', '17', '18'],
 'at': ['1', '17', '252', '250'],
 'be': ['11', '1', '12'],
 'because': ['1', '18', '131', '133'],
 'been': ['11', '1', '12', '13'],
 'before': ['1', '17', '253', '250'],
 'being': ['11', '1', '12'],
 'below': ['1', '17', '252', '250'],
 'between': ['1', '17'],
 'both': ['1', '20', '131', '138', '252', '250'],
 'but': ['1', '18', '131', '139'],
 'by': ['1', '17'],
 'can': ['11', '1', '12', '14'],
 'did': ['11', '1', '12', '13'],
 'do': ['11', '1', '12', '14'],
 'does': ['11', '1', '12', '14'],
 'doing': ['11', '1', '12'],
 'down': ['1', '17', '252', '250'],
 'during': ['1', '17', '253', '250']

# Prepare Data 

In [4]:
patients = dict()
ordinarys = dict()
with open('../0_dataset/patient_ids') as r:
    for patient in r.readlines()[:100]:
        patient = patient.strip()
        patients[patient] = peopleInfo.Patient(patient)
with open('../0_dataset/ordinary_ids') as r:
    for ordinary in r.readlines()[:100]:
        ordinary = ordinary.strip()
        ordinarys[ordinary] = peopleInfo.Ordinary(ordinary)

In [6]:
patients = filterMethods.filter_user_by_tweet_number(patients)
ordinarys = filterMethods.filter_user_by_tweet_number(ordinarys)

Remove users:[]
Remove users:[]


In [7]:
base_texts = []
group_texts = []

for key in patients.keys():
    group_texts.append(replace_by_symbols('\n'.join(patients[key].getText())))

for key in ordinarys.keys():
    base_texts.append(replace_by_symbols('\n'.join(ordinarys[key].getText())))

corpus = base_texts + group_texts

# Simple Data Statistic

In [43]:
def df_filter(df):
    # 過濾空列
    filter = df['Text'] != ''
    df = df[filter]
    return df

In [44]:
base_tweets = []    # one element presented one tweet
group_tweets = []
for line in group_texts:
    group_tweets.extend(line.split('\n'))
for line in base_texts:
    base_tweets.extend(line.split('\n'))

In [60]:
# tweet_group_len = [len(tokenize(x)) for x in group_tweets]
# tweet_base_len = [len(tokenize(x)) for x in base_tweets]
tweet_group_len = [len(x.split()) for x in group_tweets]
tweet_base_len = [len(x.split()) for x in base_tweets]

In [61]:
dfGroupTweets = df_filter(pd.DataFrame({'Text':group_tweets,'len':tweet_group_len}))
dfBaseTweets = df_filter(pd.DataFrame({'Text':base_tweets,'len':tweet_base_len}))

In [69]:
print('=== Brief Info of Group Tweets ===')
dfGroupTweets['len'].describe()

=== Brief Info of Group Tweets ===


count    99668.000000
mean        11.514328
std          8.246411
min          0.000000
25%          4.000000
50%         10.000000
75%         18.000000
max         42.000000
Name: len, dtype: float64

In [70]:
print('=== Brief Info of Base Tweets ===')
dfBaseTweets['len'].describe()

=== Brief Info of Base Tweets ===


count    11908.000000
mean        10.626386
std          6.390020
min          0.000000
25%          5.000000
50%          9.000000
75%         15.000000
max         33.000000
Name: len, dtype: float64

# Analyse Sentence

In [8]:
def construct_stopwrods_dict(text_list, mydict):

    position = []
    for i, text in enumerate(text_list):
        if text in stop_words:    # 是個 stopword
            position.append(i)
        else:
            continue
    
    for i in range(len(position)):
        try:
            cur_pos = position[i]
            next_pos = position[i+1]
            key = '{0}-{1}-{2}'.format(text_list[cur_pos], text_list[next_pos], str(next_pos-cur_pos))
            mydict[key] = mydict.get(key, 0) + 1
        except IndexError:
            continue
    
    
    return mydict

In [9]:
mydict = dict()
for texts in group_texts:
    for text in texts.split('\n'):
        construct_stopwrods_dict(tokenize(text), mydict)

In [10]:
mydict2 = dict()
for texts in base_texts:
    for text in texts.split('\n'):
        construct_stopwrods_dict(tokenize(text), mydict2)

In [15]:
%store mydict >> stopwords_dict_depression

Writing 'mydict' (dict) to file 'stopwords_dict_depression'.


In [16]:
%store mydict2 >> stopwords_dict_ordinary

Writing 'mydict2' (dict) to file 'stopwords_dict_ordinary'.


# Intersect

In [80]:
intersect = mydict.keys() & mydict2.keys()

In [83]:
diffset = mydict.keys() - mydict2.keys()

In [92]:
total = 0
avg = 0
for key in diffset:
    total += mydict[key]
avg = total/len(diffset)

for key in diffset:
    if mydict[key] > avg:
        print('{0}:{1}'.format(key, mydict[key]))

how-at-3:3
s-what-3:3
some-so-2:6
how-and-5:3
you-was-4:4
t-s-2:5
up-i-1:13
these-and-3:6
with-are-2:9
very-my-2:3
only-i-2:11
she-so-2:9
can-of-3:4
in-the-5:3
was-it-4:10
this-about-1:9
up-his-1:8
his-so-3:3
other-at-1:3
have-s-2:19
how-their-1:4
up-in-2:14
all-in-3:4
off-their-1:3
she-should-1:11
you-or-5:7
what-m-3:9
over-them-1:8
over-and-3:7
now-no-2:4
her-because-1:10
on-which-2:5
because-are-2:11
up-for-3:5
t-or-3:4
m-on-5:3
as-or-3:4
their-is-4:5
during-and-2:3
and-down-3:4
have-when-3:7
once-to-2:3
their-be-3:6
it-me-4:4
both-my-1:6
the-d-5:4
and-too-4:3
s-when-4:4
we-you-3:5
of-to-6:9
or-with-4:3
is-or-4:3
what-your-1:8
to-against-2:8
my-do-7:4
just-no-1:5
to-we-3:12
s-was-4:8
am-am-3:4
again-s-2:3
an-but-3:11
she-on-3:6
the-she-5:3
who-how-2:5
him-he-1:19
her-i-2:3
it-from-4:3
down-to-3:3
so-into-3:3
what-d-2:10
i-a-5:3
themselves-from-1:5
m-doing-2:3
at-with-1:4
you-all-4:3
so-is-5:4
the-ve-5:8
are-be-3:7
s-from-1:9
can-if-3:3
m-how-3:4
them-to-3:5
but-them-3:5
i-the-1:5
be

own-have-2:3
more-and-3:11
you-through-3:3
too-your-2:3
by-my-3:3
when-a-3:10
t-and-1:6
for-m-4:7
as-that-2:4
had-with-2:4
again-you-1:3
their-were-2:3
have-from-3:4
just-the-5:4
can-i-2:5
not-in-4:18
a-are-8:3
was-do-2:3
some-m-4:3
same-in-3:3
ll-for-3:9
of-that-4:18
my-into-5:3
m-s-1:3
had-very-1:3
from-having-1:5
so-your-3:15
is-our-2:4
are-we-2:8
me-while-1:15
into-our-1:8
was-through-2:11
i-just-4:3
both-to-3:4
the-both-1:3
s-down-2:14
our-s-3:4
doing-my-1:22
own-and-1:5
a-during-3:4
my-about-5:4
m-by-3:134
who-had-1:10
who-we-1:9
me-me-3:21
or-what-2:4
and-and-7:6
as-was-2:11
there-when-2:3
this-more-2:9
with-each-1:10
was-myself-2:3
will-through-2:4
by-not-1:3
no-are-3:3
why-won-1:7
more-there-2:3
my-them-3:5
after-she-1:3
after-that-2:3
before-me-1:3
of-yourself-1:27
our-just-3:3
on-or-2:19
you-have-3:10
what-you-5:3
there-was-3:3
am-very-1:25
is-a-6:5
other-that-1:3
are-too-3:5
this-if-3:5
just-who-3:5
same-i-2:6
are-their-1:4
and-ve-4:13
ve-but-4:3
can-to-5:3
more-me-3:3
myse

be-through-2:3
having-a-2:3
i-of-3:10
or-more-3:3
up-did-3:3
its-in-1:8
we-and-4:5
to-here-4:3
to-only-2:5
i-you-4:8
of-all-5:3
i-for-2:16
it-that-3:14
some-other-1:7
so-the-2:14
the-for-7:8
a-re-3:6
when-me-4:3
this-should-1:6
was-there-3:3
m-but-4:4
you-in-6:3
in-because-2:15
have-what-1:7
but-do-3:17
at-for-4:4
through-s-3:3
down-s-2:3
about-and-4:8
there-on-2:9
re-of-3:3
should-i-1:15
it-can-2:15
some-they-3:4
ve-about-2:5
other-i-2:10
but-up-4:3
doing-for-3:3
a-to-7:16
this-how-3:3
re-very-1:3
with-this-5:4
up-having-1:4
before-just-1:3
and-there-4:3
an-and-6:5
some-will-2:4
you-is-5:7
at-this-3:4
own-you-2:4
who-and-1:4
but-who-1:9
for-if-2:9
me-now-2:17
by-herself-1:3
my-into-3:11
should-do-2:7
me-myself-2:4
she-this-2:3
at-them-3:3
after-her-1:7
myself-what-1:4
this-am-3:7
t-but-3:5
be-a-4:9
have-me-3:11
am-again-1:4
most-ve-5:3
up-that-2:7
just-up-3:10
too-have-2:3
him-on-2:4
my-m-7:3
no-t-1:4
a-d-4:5
just-over-2:13
we-both-1:23
from-have-2:4
was-ve-4:3
an-it-3:10
her-about-1: