In [19]:
import os
import sys

# Put the sibling ../code directory on the import path so that modules stored
# there can be imported from this notebook (presumably `utilities` and
# `language_utilities`, imported further down -- confirm their location).
module_path = os.path.abspath('../code')
if sys.path.count(module_path) == 0:
    # Only append once, so re-running the cell does not grow sys.path.
    sys.path.append(module_path)

import pandas as pd
from string import punctuation

In [24]:
%load_ext line_profiler

In [21]:
from nltk.tag import StanfordNERTagger

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from nltk.corpus import stopwords

In [28]:
from utilities import give_source_name, isQuote, clean_up_text
from language_utilities import handle_single_tweet

## 1. Upload and clean the data
The .csv file loaded below is produced by running `api_script.py` for `realDonaldTrump`.
#### Load data

In [10]:
# Location of the tweet export, relative to this notebook.
trump_csv_path = '../data/realDonaldTrump.csv'
# Raw table; the cells below add derived columns to it.
df_trump = pd.read_csv(trump_csv_path)

#### Feature engineering: sources and clean text

In [11]:
# Distinct raw values of the `source` column, computed once and reused for
# both the count and the labelling below.
source_types = df_trump.source.unique()
# Single-argument print() emits the same text on Python 2 and Python 3.
print('number of unique sources:  %d' % len(source_types))

# Human-readable (shorter) labels.
# NOTE(review): presumably give_source_name pairs source_types and
# source_names by position, in which case this list must follow the order
# returned by unique() for this particular CSV -- confirm, and re-check the
# list whenever the data file is regenerated.
source_names = ['Web Client', 'iPhone', 'Android', 'iPad',
                'Media Studio', 'Instagram', 'Mobile Web']

# Add a column with the human-readable label. A list comprehension replaces
# map(lambda ...) so the column receives a concrete list even where map()
# returns a lazy iterator (Python 3).
df_trump['type_source'] = [
    give_source_name(raw_source, source_types, source_names)
    for raw_source in df_trump.source
]

# Boolean masks identifying the source of each tweet.
mask_android = (df_trump.type_source == 'Android')
mask_iphone = (df_trump.type_source == 'iPhone')

number of unique sources:  7


In [12]:
# A retweet is signaled with quotation marks; isQuote flags such tweets.
# Built as a list (not map()) so the column assignment also works where
# map() returns a lazy iterator (Python 3).
df_trump["isQuote"] = [isQuote(tweet_text) for tweet_text in df_trump.text]

# Masks to separate Android tweets that are retweets of others from the rest.
# The explicit == False / == True comparisons are kept on purpose: the return
# type of isQuote is not visible here, and `~column` would only be equivalent
# for a strictly boolean column.
from_android = (df_trump.type_source == 'Android')
mask_android_noquote = (from_android & (df_trump.isQuote == False))
mask_android_isquote = (from_android & (df_trump.isQuote == True))

In [13]:
# Store the output of clean_up_text for every tweet in a new column.
# A list comprehension is used instead of map() so the column receives a
# concrete list even where map() returns a lazy iterator (Python 3).
df_trump['clean_text'] = [clean_up_text(tweet_text) for tweet_text in df_trump.text]

## 2. Tokenize and tag

In [14]:
# Change the paths to point to the directory where you downloaded the Stanford tagger.
# Both variables are set before the tagger is constructed, since the tagger is
# given only the model's file name.
os.environ.update({
    'CLASSPATH': "../../Twitter/stanford-ner-2014-06-16",
    'STANFORD_MODELS': "../../Twitter/stanford-ner-2014-06-16/classifiers",
})
ner_model_name = 'english.all.3class.distsim.crf.ser.gz'
st = StanfordNERTagger(ner_model_name)

In [15]:
# Word normalisers built once and reused.
# Porter stemmer (not referenced by the cells visible here).
porter_stemmer = PorterStemmer()
# WordNet lemmatizer; passed to handle_single_tweet in the processing loop.
wn_lemmatizer = WordNetLemmatizer()

In [22]:
# Tokens that are not indicative of vocabulary: every punctuation character
# plus NLTK's English stop-word list.
punctuation_chars = set(punctuation)
english_stopwords = set(stopwords.words('english'))
remove_set = punctuation_chars | english_stopwords

In [30]:
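# Line-profile handle_single_tweet on one sample tweet (uncomment the two lines below to run; %lprun needs the line_profiler extension)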
#tweet = df_trump.clean_text[3035]
#%lprun -f handle_single_tweet handle_single_tweet(tweet, remove_set, st, wn_lemmatizer)

In [32]:
# Keep only tweets that we think are really from Donald Trump: the rows
# selected by mask_android_noquote (sent from Android and not flagged as a quote).
df_real_trump = df_trump[mask_android_noquote]
# Single-argument print() emits the same text on Python 2 and Python 3.
print(len(df_real_trump))

1159


In [None]:
# Run handle_single_tweet over every retained tweet and collect its two
# results in parallel lists (one entry per tweet, in DataFrame order):
#   vocabulary_list -- the first returned value (processed tokens)
#   dict_list       -- the second returned value (dict_by_type)
dict_list = []
vocabulary_list = []
for index, tweet in enumerate(df_real_trump.clean_text):
    # Progress marker every 100 tweets.
    if index % 100 == 0:
        print(index)
    processed_tokens, dict_by_type = handle_single_tweet(tweet, remove_set, st, wn_lemmatizer)
    vocabulary_list.append(processed_tokens)
    dict_list.append(dict_by_type)

0
100
