In [8]:
import numpy as np
import twitter, re, datetime
import pandas as pd

In [9]:
twitter_keys = {
    'consumer_key':        '',
    'consumer_secret':     '',
    'access_token_key':    '',
    'access_token_secret': ''
}

api = twitter.Api(
    consumer_key         =   twitter_keys['consumer_key'],
    consumer_secret      =   twitter_keys['consumer_secret'],
    access_token_key     =   twitter_keys['access_token_key'],
    access_token_secret  =   twitter_keys['access_token_secret'],
    tweet_mode = 'extended'
)

In [10]:
type(api)

twitter.api.Api

In [11]:
#TweetMiner function from Mike Roman

class TweetMiner(object):

    
    def __init__(self, api, result_limit = 20):
        
        self.api = api        
        self.result_limit = result_limit
        

    def mine_user_tweets(self, user="HillaryClinton", mine_retweets=False, max_pages=20):

        data           =  []
        last_tweet_id  =  False
        page           =  1
        
        while page <= max_pages:
            
            if last_tweet_id:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, max_id=last_tweet_id - 1, include_rts=mine_retweets)
                statuses = [ _.AsDict() for _ in statuses]
            else:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, include_rts=mine_retweets)
                statuses = [_.AsDict() for _ in statuses]
                
            for item in statuses:
                # Using try except here.
                # When retweets = 0 we get an error (GetUserTimeline fails to create a key, 'retweet_count')
                try:
                    mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   item['retweet_count'],
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
                    }
                
                except:
                        mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   0,
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
                    }
                
                last_tweet_id = item['id']
                data.append(mined)
                
            page += 1
            
        return data

In [12]:
miner = TweetMiner(api,result_limit = 200)

In [13]:
hillary = miner.mine_user_tweets(user="HillaryClinton")
trump = miner.mine_user_tweets(user="realDonaldTrump")

In [14]:
#Converting into Dataframe

In [15]:
hillary_df = pd.DataFrame(hillary)
trump_df = pd.DataFrame(trump)

In [16]:
trump_df.shape

(2312, 6)

In [17]:
hillary_df.shape

(2575, 6)

In [18]:
tweets = pd.concat([trump_df,hillary_df],axis=0)
tweets.shape

(4887, 6)

In [19]:
#Creating Target
y = tweets['handle'].map(lambda  x: 1 if x == 'realDonaldTrump' else 0).values

In [20]:
#Creating Data
tweets_text = tweets['text'].values

In [21]:
#Building Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [22]:
#Tokenization
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [23]:
tfv = TfidfVectorizer(ngram_range=(2,4),max_features=2000)
X=tfv.fit_transform(tweets_text).todense() #todense returns a matrix 

In [24]:
print(X.shape)

(4887, 2000)


In [26]:
clf = LogisticRegression(penalty='l2',C=1.0)
clf.fit(X,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [39]:
source_test = [
    "The presidency doesn’t change who you are—it reveals who you are. And we’ve seen all we need to of Donald Trump.",
    "Crooked Hillary is spending tremendous amounts of Wall Street money on false ads against me. She is a very dishonest person!",
    "Just had a very good call with @SwedishPM Stefan Löfven who assured me that American citizen A$AP Rocky will be treated fairly. Likewise, I assured him that A$AP was not a flight risk and offered to personally vouch for his bail, or an alternative....",
    "Think what it would be without the 3 year Witch Hunt and Fake News Media, in partnership with the Democrats!"
]

In [40]:
Xtest = tfv.transform(source_test)
pd.DataFrame(clf.predict_proba(Xtest), columns=["Proba_Hillary", "Proba_Trump"])

Unnamed: 0,Proba_Hillary,Proba_Trump
0,0.923371,0.076629
1,0.298277,0.701723
2,0.252668,0.747332
3,0.046032,0.953968


In [41]:
#This shows that the probability that the first tweet is by Hillary is 92%
#And that the second tweet is by Trump is 70%
#This suggests that are model is working properly
#The last tweet again has an accuracy of 95%