In [1]:
import re, math
import operator
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedShuffleSplit
from random import randint
import ipynb.fs.defs.PeopleInfo as peopleInfo
from nltk.corpus import stopwords
from nltk import TweetTokenizer
from nltk import PorterStemmer
# import ipynb.fs.defs.TweetTextHandler as tweetTextHandler
import ipynb.fs.defs.FilterMethods as filterMethods
import sys, os
sys.path.append('../2_feature')
import ipynb.fs.defs.GetFeatures as getFeatures
global stop_words
stop_words = stopwords.words('english')
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from IPython.display import display, Image
plotly.tools.set_credentials_file(username='Adeline', api_key='Z5eltNtBQXqvI05ZFQtz')
# import plotly.offline as offline
# offline.init_notebook_mode(connected=True)



# Text Preprocessing

In [2]:
def replace_by_symbols(txt):
    txt = re.sub(r"https\S+", '', txt)
    txt = re.sub(r"http\S+", '', txt)
    txt = re.sub(r"pic.twitter.com\S+", '', txt)
    txt = re.sub(r"twitter.com/\S+", '', txt)
    txt = re.sub(r"\S+/\S+", '', txt)
    txt = re.sub(r"@\S+", '', txt)
    txt = re.sub(r"#\S+", '', txt)
    txt = re.sub(r"idk", 'i do not know', txt)   # idk: i don't know
    txt = re.sub(r"tho", 'though', txt)   # tho
    txt = re.sub(r"i\'m", 'i am', txt)
    txt = re.sub(r"you\'re", 'you are', txt)
    txt = re.sub(r"he\'s", 'he is', txt)
    txt = re.sub(r"she\'s", 'she is', txt)
    txt = re.sub(r"it\'s", 'it is', txt)
    txt = re.sub(r"we\'re", 'we are', txt)
    txt = re.sub(r"they\'re", 'they are', txt)
    txt = re.sub(r"isn\'t", 'is not', txt)
    txt = re.sub(r"don\'t", 'do not', txt)
    txt = re.sub(r"doesn\'t", 'does not', txt)
    txt = re.sub(r"didn\'t", 'did not', txt)
    txt = re.sub(r"wasn\'t", 'was not', txt)
    txt = re.sub(r"weren\'t", 'were not', txt)
    txt = re.sub(r"haven\'t", 'have not', txt)
    txt = re.sub(r"can\'t", 'can not', txt)
    txt = re.sub(r"couldn\'t", 'could not', txt)
    txt = re.sub(r"wouldn\'t", 'would not', txt)
    txt = re.sub(r"shouldn\'t", 'should not', txt)
    txt = re.sub(r"&amp", '', txt)
    return txt

In [3]:
def tokenize(text):
    text_list = re.findall('(?u)\\b[a-zA-Z]\\w{0,}\\b', text)
    return text_list

# Read LIWC Dictionary

In [78]:
liwc_category_dict = dict()
liwc_word_dict = dict()
with open('/home/adeline/Documents/Depression_Research/LIWC2007 Documents/Dictionaries/LIWC2007_English080730.dic') as open_file:
    raw_data = open_file.readlines()
    for row in raw_data[1:65]:
        row = row.strip().split('\t')
        liwc_category_dict[row[0]] = row[1]
    for row in raw_data[66:4553]:
        row = row.strip().split('\t')
        if row[0] in stop_words:
            liwc_word_dict[row[0]] = row[1:]
        elif '*' in row[0] and row[0][:-1] in stop_words:
            liwc_word_dict[row[0]] = row[1:]
        else:
            continue

In [None]:
liwc_word_dict

# Prepare Data 

In [4]:
patients = dict()
ordinarys = dict()
with open('../0_dataset/patient_ids') as r:
    for patient in r.readlines()[:100]:
        patient = patient.strip()
        patients[patient] = peopleInfo.Patient(patient)
with open('../0_dataset/ordinary_ids') as r:
    for ordinary in r.readlines()[:100]:
        ordinary = ordinary.strip()
        ordinarys[ordinary] = peopleInfo.Ordinary(ordinary)

In [6]:
patients = filterMethods.filter_user_by_tweet_number(patients)
ordinarys = filterMethods.filter_user_by_tweet_number(ordinarys)

Remove users:[]
Remove users:[]


In [7]:
base_texts = []
group_texts = []

for key in patients.keys():
    group_texts.append(replace_by_symbols('\n'.join(patients[key].getText())))

for key in ordinarys.keys():
    base_texts.append(replace_by_symbols('\n'.join(ordinarys[key].getText())))

corpus = base_texts + group_texts

# Simple Data Statistic

In [43]:
def df_filter(df):
    # 過濾空列
    filter = df['Text'] != ''
    df = df[filter]
    return df

In [44]:
base_tweets = []    # one element presented one tweet
group_tweets = []
for line in group_texts:
    group_tweets.extend(line.split('\n'))
for line in base_texts:
    base_tweets.extend(line.split('\n'))

In [60]:
# tweet_group_len = [len(tokenize(x)) for x in group_tweets]
# tweet_base_len = [len(tokenize(x)) for x in base_tweets]
tweet_group_len = [len(x.split()) for x in group_tweets]
tweet_base_len = [len(x.split()) for x in base_tweets]

In [61]:
dfGroupTweets = df_filter(pd.DataFrame({'Text':group_tweets,'len':tweet_group_len}))
dfBaseTweets = df_filter(pd.DataFrame({'Text':base_tweets,'len':tweet_base_len}))

In [69]:
print('=== Brief Info of Group Tweets ===')
dfGroupTweets['len'].describe()

=== Brief Info of Group Tweets ===


count    99668.000000
mean        11.514328
std          8.246411
min          0.000000
25%          4.000000
50%         10.000000
75%         18.000000
max         42.000000
Name: len, dtype: float64

In [70]:
print('=== Brief Info of Base Tweets ===')
dfBaseTweets['len'].describe()

=== Brief Info of Base Tweets ===


count    11908.000000
mean        10.626386
std          6.390020
min          0.000000
25%          5.000000
50%          9.000000
75%         15.000000
max         33.000000
Name: len, dtype: float64

# Analyse Sentence

In [8]:
def construct_stopwrods_dict(text_list, mydict):

    position = []
    for i, text in enumerate(text_list):
        if text in stop_words:    # 是個 stopword
            position.append(i)
        else:
            continue
    
    for i in range(len(position)):
        try:
            cur_pos = position[i]
            next_pos = position[i+1]
            key = '{0}-{1}-{2}'.format(text_list[cur_pos], text_list[next_pos], str(next_pos-cur_pos))
            mydict[key] = mydict.get(key, 0) + 1
        except IndexError:
            continue
    
    
    return mydict

In [9]:
mydict = dict()
for texts in group_texts:
    for text in texts.split('\n'):
        construct_stopwrods_dict(tokenize(text), mydict)

In [10]:
mydict2 = dict()
for texts in base_texts:
    for text in texts.split('\n'):
        construct_stopwrods_dict(tokenize(text), mydict2)

In [15]:
%store mydict >> stopwords_dict_depression

Writing 'mydict' (dict) to file 'stopwords_dict_depression'.


In [16]:
%store mydict2 >> stopwords_dict_ordinary

Writing 'mydict2' (dict) to file 'stopwords_dict_ordinary'.


# Intersect of Set

In [80]:
intersect = mydict.keys() & mydict2.keys()   # 聯集的 Key

# Difference of Set

In [180]:
diffsetGroup = mydict.keys() - mydict2.keys()    # 差集（Group 的 key）

In [181]:
diffsetBase = mydict2.keys() - mydict2.keys()    # 差集（Base 的 key）

In [188]:
print('The length of set belongs to Group: {0}\tBase: {1}'.format(len(diffsetGroup),len(diffsetBase)))

The length of set belongs to Group: 25014	Base: 0


In [170]:
def plot_num_distribution(mydict):
    keys = list(mydict.keys())
    vals = list(mydict.values())
    trace1 = go.Bar(
        x=keys,
        y=vals,
        name='',
        marker=dict(color='rgb(49,130,189)'))
    data = [trace1]
    layout = go.Layout(
        title = 'Stopwords Difference Set',
#         shapes = [{
#             'type': 'line',
#             'x0': 5,
#             'y0': 0,
#             'x1': 5,
#             'y1': 6000,
#             'line': {
#                 'color': 'red',
#                 'width': 1,
#                 'dash':'dashdot'
#             }
#         },
#         {
#             'type': 'line',
#             'x0': 8,
#             'y0': 0,
#             'x1': 8,
#             'y1': 6000,
#             'line': {
#                 'color': 'red',
#                 'width': 1,
#                 'dash':'dashdot'
#             },
#         }]
    )
    fig = go.Figure(data=data, layout=layout)
    return fig
    #     py.iplot(fig, filename='stwdiff')
    
    
def create_num_distribution(key_list, original_dict):
    new_dict = dict()
    for key in key_list:
        new_dict[key] = original_dict[key]
    
    num_dict = dict()
    for key in key_list:
        num = new_dict[key]
        num_dict[num] = dist.get(num, 0) + 1
    
    return num_dict

In [182]:
py.iplot(plot_num_distribution(create_num_distribution(diffsetGroup, mydict)), filename='StopwordsDiffGroup')

In [173]:
groupFeatureOrigin = dict()
for key in diffset:
    if 4 < mydict[key] > 8:
        groupFeatureOrigin[key] = mydict[key]
    else:
        continue

In [218]:
def writeToFileBasic(data, filename):
    print(type(data))
    if type(data) == dict:
        print('Writing dictionary to {0}...'.format(filename))
        with open(filename, 'w') as open_file:
            for key, val in dictionary.items():
                open_file.write('{0}\t{1}\n'.format(key, val))
    elif type(data) == list:
        print('Writing list to {0}...'.format(filename))
        with open(filename, 'w') as open_file:
            for val in data:
                open_file.write('{0}\n'.format(val))
    else:
        print('InputFileTypeError')

In [178]:
writeToFileBasic(groupFeatureOrigin, 'groupFeatureOrigin')

Writing dictionary to groupFeatureOrigin...


In [189]:
py.iplot(plot_num_distribution(create_num_distribution(diffsetBase, mydict2)), filename='StopwordsDiffBase')

# Create Pattern List

In [230]:
def convertToPatternList(feature_list):
    %%time
    regStrs = []
    for line in feature_list:
        items = line.strip().split('\t')
        keys = items[0].split('-')
        regStr = ''
        tmpRegStr1 = ''
        tmpRegStr2 = ''
        tmpRegStr3 = ''
        tmpRegStr4 = ''
        regGeneralStr = ''
        # General part
        for i in range(int(keys[2])-1):
            regGeneralStr += '\s\w+'
        regGeneralStr += '\s'
        # C1: 'a b'
        tmpRegStr1 = '^' + keys[0] +  regGeneralStr + keys[1] + '$'
        # C2: 'a b '
        tmpRegStr2 = '^' + keys[0] + regGeneralStr + keys[1] + '\s'
        # C3: ' a b'
        tmpRegStr3 = '\s' + keys[0] + regGeneralStr + keys[1] + '$'
        # C4: ' a b '
        tmpRegStr4 = '\s' + keys[0] + regGeneralStr + keys[1] + '\s'
        regStr = '('+tmpRegStr1+')|('+tmpRegStr2+')|('+tmpRegStr3+')|('+tmpRegStr4+')'
        regStrs.append(regStr)
    return regStrs

In [228]:
writeToFileBasic(convertToPatternList(groupFeatureOrigin), 'groupFeatureOriginPatternList')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.63 µs
<class 'list'>
Writing list to groupFeatureOriginPatternList...


In [231]:
regStrs = convertToPatternList(groupFeatureOrigin)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.63 µs


In [249]:
def getHitNumber(text):
    
    hit = 0
    hitTweetNum = 0
    tweetLengthNum = 0
    tweetNum = 0
    
    
    if type(text) == list:
        text_list = text.split('\n')  # 把一個user原本以\n串連的發文分開
        tweetNum = len(text_list)
        for tweet in text_list:
            hitFlag = False
            tweet = " ".join(tokenize(tweet))
            for regStr in regStrs:
                if re.search(re.compile(regStr), tweet) != None:
                    hit += 1
                    if hitFlag == False:
                        hitTweetNum += 1
                        hitFlag = True
                    else:
                        pass
        return hit/hitTweetNum, 
    
    elif type(text) == str: # 丟進來就是一段普通的字串
        hit = 0
        tweet = " ".join(tokenize(text))
        for regStr in regStrs:
            if re.search(re.compile(regStr), tweet) != None:
                hit += 1
        
        
    else:
        print('Please stop putting weird stuff!')
        return 'Not acceptable'

In [253]:
%%time
groupHitNum = []
for person in group_texts:
    groupHitNum.append(getHitNumber(person))

KeyboardInterrupt: 

In [254]:
groupHitNum

[24, 501, 439, 193, 10, 390, 137, 46]