In [None]:
import tensorflow as tf

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import TransformerMixin, FeatureUnion, make_pipeline, make_union, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, train_test_split, KFold
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve
%matplotlib inline
#from xgboost import XGBClassifier as xgb



In [None]:
# Shell escape: run the NVIDIA `nvidia-smi` tool to see which GPUs this machine exposes.
!nvidia-smi

In [6]:
# Base directory for the tweet CSV exports; the read_csv paths are built from it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
myHome = '/home/ubuntu/toolkit'

In [7]:
# Load each account's tweet export and tag every row with its author.
trump_tweets = pd.read_csv(myHome+'/realdonaldtrump_tweets.csv')
clinton_tweets = pd.read_csv(myHome+'/hillaryclinton_tweets.csv')

trump_tweets["author"] = "Trump"
clinton_tweets["author"] = "Clinton"

# Stack both accounts and keep only the columns used downstream.
# .copy() makes the slice an independent frame so the column assignment below
# does not write into a view of the concatenated data.
tweets = pd.concat([trump_tweets, clinton_tweets])
tweets = tweets[[ u'id', u'text', u'created_at', u'favorite_count', u'retweet_count', u'author']].copy()

# Treat tweet ids as opaque string identifiers rather than numbers.
tweets["id"] = tweets["id"].astype("str")

# Both source frames carry their own 0..n-1 index; replace the duplicated labels
# with a fresh RangeIndex (drop=True discards the old index instead of adding
# it as a column that then has to be removed again).
tweets = tweets.reset_index(drop=True)
tweets.head()

Unnamed: 0,id,text,created_at,favorite_count,retweet_count,author
0,783436108176629760,Thank you ARIZONA! This is a MOVEMENT like nob...,Tue Oct 04 22:38:18 +0000 2016,4229.0,1729.0,Trump
1,783393314309484544,My childcare plan makes a difference for worki...,Tue Oct 04 19:48:15 +0000 2016,11110.0,4719.0,Trump
2,783391423663964160,I will be watching the great Governor @Mike_Pe...,Tue Oct 04 19:40:44 +0000 2016,14211.0,4809.0,Trump
3,783390310969651200,"Join me in Reno, Nevada tomorrow at 3:30pm! #A...",Tue Oct 04 19:36:19 +0000 2016,5743.0,2053.0,Trump
4,783149570721144832,"Join me in Reno, Nevada on Wednesday at 3:30pm...",Tue Oct 04 03:39:42 +0000 2016,13895.0,6189.0,Trump


In [8]:
def percentInCaps(s):
    """Return the fraction of cased letters in `s` that are uppercase.

    Only characters for which `isupper()` or `islower()` is true are counted;
    digits, whitespace and punctuation are ignored. Returns 0.0 when `s`
    contains no cased letters at all (e.g. an empty string or a bare URL-less
    emoji/number tweet) instead of dividing by zero.
    """
    inCaps = sum(1 for c in s if c.isupper())
    inLowerCase = sum(1 for c in s if c.islower())
    cased = inCaps + inLowerCase
    if cased == 0:
        return 0.0
    # float() keeps this a true division under Python 2 as well.
    return inCaps/float(cased)

def num_hashtags(s):
    """Count the '#' characters appearing in the text `s`."""
    return s.count("#")

def num_mentions(s):
    """Count the '@' characters appearing in the text `s`."""
    return s.count("@")

# Characters counted as punctuation. Note that '#' and '@' are not in this set.
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
def percentPunctuation(s):
    """Return the fraction of characters in `s` that appear in `punctuation`.

    The denominator is the full length of `s` (letters, digits and whitespace
    included). Returns 0.0 for an empty string instead of dividing by zero.
    """
    if len(s) == 0:
        return 0.0
    return sum(1 for c in s if c in punctuation)/float(len(s))

def manrt(s):
    """Return 1 if `s` looks like a quoted mention, else 0.

    The pattern checked is: first character is a single or double quote,
    second character is '@', and the last character is a single or double
    quote. Strings shorter than two characters cannot match and yield 0
    (previously they raised IndexError).
    """
    if len(s) < 2:
        return 0
    quotes = ('"', "'")
    return int(s[0] in quotes and s[1] == '@' and s[-1] in quotes)

def sh(s):
    """Flag (1/0) whether the text `s` contains the substring '-H'."""
    if '-H' in s:
        return 1
    return 0

def percentesc(s):
    """Return the fraction of characters in `s` that are '!' or '?'.

    The denominator is the full length of `s`. Returns 0.0 for an empty
    string instead of dividing by zero.
    """
    if len(s) == 0:
        return 0.0
    return sum(1 for c in s if c in '!?')/float(len(s))

    
# Derive per-tweet stylistic features from the raw text.
# Series.apply only needs the function: there is no axis on a Series, and the
# stray positional `1` previously passed here was bound to `convert_dtype`.
tweets["percentPunctuation"] = tweets["text"].apply(percentPunctuation)
tweets["percentEscQ"] = tweets["text"].apply(percentesc)
tweets["percentInCaps"] = tweets["text"].apply(percentInCaps)
tweets["numHastags"] = tweets["text"].apply(num_hashtags)
tweets["numMentions"] = tweets["text"].apply(num_mentions)
tweets["manrt"] = tweets["text"].apply(manrt)
tweets['-H'] = tweets["text"].apply(sh)


# Get a summary of Clinton vs. Trump Tweet attributes
# NOTE(review): on pandas >= 2.0 non-numeric columns are no longer dropped
# silently here; pass numeric_only=True if this notebook is ever upgraded.
tweets.groupby("author").mean()

Unnamed: 0_level_0,favorite_count,retweet_count,percentPunctuation,percentEscQ,percentInCaps,numHastags,numMentions,manrt,-H
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Clinton,5991.002167,3144.962229,0.063471,0.000879,0.093032,0.129102,0.491022,0.0,0.029102
Trump,16600.378303,5974.324837,0.05906,0.00795,0.139253,0.478707,0.661486,0.096985,0.000622


In [9]:
# Preview the engineered columns on the first rows instead of rendering the whole frame.
tweets.head(10)

Unnamed: 0,id,text,created_at,favorite_count,retweet_count,author,percentPunctuation,percentEscQ,percentInCaps,numHastags,numMentions,manrt,-H
0,783436108176629760,Thank you ARIZONA! This is a MOVEMENT like nob...,Tue Oct 04 22:38:18 +0000 2016,4229.0,1729.0,Trump,0.056338,0.007042,0.330275,0,0,0,0
1,783393314309484544,My childcare plan makes a difference for worki...,Tue Oct 04 19:48:15 +0000 2016,11110.0,4719.0,Trump,0.060606,0.000000,0.068627,1,0,0,0
2,783391423663964160,I will be watching the great Governor @Mike_Pe...,Tue Oct 04 19:40:44 +0000 2016,14211.0,4809.0,Trump,0.033333,0.016667,0.075269,0,1,0,0
3,783390310969651200,"Join me in Reno, Nevada tomorrow at 3:30pm! #A...",Tue Oct 04 19:36:19 +0000 2016,5743.0,2053.0,Trump,0.093750,0.010417,0.217391,2,0,0,0
4,783149570721144832,"Join me in Reno, Nevada on Wednesday at 3:30pm...",Tue Oct 04 03:39:42 +0000 2016,13895.0,6189.0,Trump,0.079365,0.007937,0.217391,1,0,0,0
5,783129603363659776,Thank you Colorado! #MAGA\nhttps://t.co/3KWOl2...,Tue Oct 04 02:20:21 +0000 2016,20271.0,8390.0,Trump,0.164948,0.010309,0.304348,1,0,0,0
6,783065029172088832,We must bring the truth directly to hard-worki...,Mon Oct 03 22:03:46 +0000 2016,22735.0,10633.0,Trump,0.051471,0.000000,0.093458,1,0,0,0
7,783059449128759296,"Thank you Pueblo, Colorado! \n#TrumpRally #Ame...",Mon Oct 03 21:41:35 +0000 2016,15042.0,6178.0,Trump,0.117647,0.009804,0.197368,2,0,0,0
8,782946021957599232,"Join me in Henderson, Nevada on Wednesday at 1...",Mon Oct 03 14:10:52 +0000 2016,14300.0,5837.0,Trump,0.096774,0.010753,0.227273,1,0,0,0
9,782728287718150144,Just announced that Iraq (U.S.) is preparing f...,Sun Oct 02 23:45:40 +0000 2016,37654.0,13857.0,Trump,0.051852,0.014815,0.066038,0,0,0,0


In [10]:
# Extract month, day of week, and hour from the timestamp and append them to the data frame

# NOTE(review): not used in this cell. Its keys are strings, while the "day"
# column below holds integers (0 = Monday .. 6 = Sunday), so a direct lookup
# would need str(day) — confirm against wherever this mapping is consumed.
weekdayDict = {"0": "M", "1": "Tu", "2": "W", "3": "Th", "4": "F", "5": "Sa", "6": "Su"}

tweets["created_at"] = pd.to_datetime(tweets["created_at"])

# Vectorized datetime accessors instead of a Python-level lambda per row.
created = tweets["created_at"].dt
tweets["hour"] = created.hour
tweets["day"] = created.weekday
tweets["month"] = created.month

# Take a look at the new frame
tweets.head()

Unnamed: 0,id,text,created_at,favorite_count,retweet_count,author,percentPunctuation,percentEscQ,percentInCaps,numHastags,numMentions,manrt,-H,hour,day,month
0,783436108176629760,Thank you ARIZONA! This is a MOVEMENT like nob...,2016-10-04 22:38:18,4229.0,1729.0,Trump,0.056338,0.007042,0.330275,0,0,0,0,22,1,10
1,783393314309484544,My childcare plan makes a difference for worki...,2016-10-04 19:48:15,11110.0,4719.0,Trump,0.060606,0.0,0.068627,1,0,0,0,19,1,10
2,783391423663964160,I will be watching the great Governor @Mike_Pe...,2016-10-04 19:40:44,14211.0,4809.0,Trump,0.033333,0.016667,0.075269,0,1,0,0,19,1,10
3,783390310969651200,"Join me in Reno, Nevada tomorrow at 3:30pm! #A...",2016-10-04 19:36:19,5743.0,2053.0,Trump,0.09375,0.010417,0.217391,2,0,0,0,19,1,10
4,783149570721144832,"Join me in Reno, Nevada on Wednesday at 3:30pm...",2016-10-04 03:39:42,13895.0,6189.0,Trump,0.079365,0.007937,0.217391,1,0,0,0,3,1,10


In [12]:
# Feature columns are everything except the label, the raw text, identifiers and the timestamp.
# 'month' is excluded as well because it's too predictive; artifact of different times.
excluded = {'author', 'text', 'id', 'created_at', 'month'}
xcols = [col for col in tweets.columns if col not in excluded]

# Numeric/meta features, raw tweet text, and the binary target (True = Trump).
X_others = tweets[xcols]
X_text = tweets['text']
y = tweets.author == 'Trump'

In [13]:
# Hold out 30% of the rows. Splitting all three objects in a single call applies
# the same shuffled partition to each, so features, text and labels stay aligned.
(X_others_train, X_others_test,
 X_text_train, X_text_test,
 y_train, y_test) = train_test_split(
    X_others, X_text, y,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Shuffled 5-fold splitter sized to the training partition; the fixed seed keeps the folds repeatable.
n_train = len(X_others_train)
kfold = KFold(n=n_train, n_folds=5, shuffle=True, random_state=0)

In [16]:
# Echo the splitter's repr to confirm its configuration (sample count, folds, shuffle, seed).
kfold

sklearn.cross_validation.KFold(n=4512, n_folds=5, shuffle=True, random_state=0)

In [17]:
# Preview the non-text feature matrix on the first rows instead of rendering the whole frame.
X_others.head(10)

Unnamed: 0,favorite_count,retweet_count,percentPunctuation,percentEscQ,percentInCaps,numHastags,numMentions,manrt,-H,hour,day
0,4229.0,1729.0,0.056338,0.007042,0.330275,0,0,0,0,22,1
1,11110.0,4719.0,0.060606,0.000000,0.068627,1,0,0,0,19,1
2,14211.0,4809.0,0.033333,0.016667,0.075269,0,1,0,0,19,1
3,5743.0,2053.0,0.093750,0.010417,0.217391,2,0,0,0,19,1
4,13895.0,6189.0,0.079365,0.007937,0.217391,1,0,0,0,3,1
5,20271.0,8390.0,0.164948,0.010309,0.304348,1,0,0,0,2,1
6,22735.0,10633.0,0.051471,0.000000,0.093458,1,0,0,0,22,0
7,15042.0,6178.0,0.117647,0.009804,0.197368,2,0,0,0,21,0
8,14300.0,5837.0,0.096774,0.010753,0.227273,1,0,0,0,14,0
9,37654.0,13857.0,0.051852,0.014815,0.066038,0,0,0,0,23,6


In [18]:
# Sweep the upper n-gram bound of the text vectorizer (1..10) and record the
# held-out ROC AUC of a bag-of-ngrams -> TF-IDF -> SVD -> logistic regression
# pipeline for each setting.
ngram_caps = list(range(1,11))
aucs = []
for param in ngram_caps:
    textmodel = Pipeline([
        ('vect', CountVectorizer(stop_words = 'english', ngram_range=(1, param), lowercase = True,
                                 max_df= 0.99, min_df = 0.001)),
        ('tfidf', TfidfTransformer()),
        # Seeded so repeated runs of this cell produce the same AUC curve.
        ('svd', TruncatedSVD(n_components = 1000, random_state = 0)),
        ('model', LogisticRegression()),
    ])

    # Cross-validated scores on the training split (kept for inspection; only
    # the last iteration's values survive the loop).
    scores = cross_val_score(textmodel, X_text_train, y_train, cv=kfold)

    # Build the Model
    textmodel.fit(X_text_train, y_train)

    preds = textmodel.predict(X_text_test)
    # np.mean of the boolean match vector is always a float fraction; the
    # previous sum(...)/len(...) truncated to an integer under Python 2.
    acc = np.mean(preds == y_test)

    # Column 1 holds the probability of the positive class (y is True for Trump).
    probs = textmodel.predict_proba(X_text_test)
    roc = roc_curve(y_test, probs[:,1])
    auc = roc_auc_score(y_test, probs[:,1])
    aucs.append(auc)

In [None]:
# Held-out ROC AUC as a function of the largest n-gram size given to the vectorizer.
fig, ax = plt.subplots()
ax.plot(ngram_caps, aucs)
ax.set_title('AUC vs ngram upper bound')
ax.set_xlabel('ngram upper bound')
ax.set_ylabel('ROC AUC score')