In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix

Predict if a user will be an influencer on a tweet (> 100 retweets) based on volume in the following categories

USER PREDICTION ELEMENTS

FIELD                             DESCRIPTION
tweet_user_statuses_count:        count - user tweets
tweet_user_favourites_count:      count - user is a favorite of others
tweet_user_followers_count:       count - user is followed by others
tweet_user_friends_count:         count - user friends
tweet_user_listed_count:          count - user added to other's interest list


In [2]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [3]:
# import necessary libraries
import os
from flask import Flask, render_template, jsonify, request, redirect

from pprint import pprint

#################################################
# Flask Setup
#################################################
app = Flask(__name__)

#################################################
# Database Setup
#################################################

from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.sql.expression import func

#Probably don't need these from SQLAlchemy: asc, desc, between, distinct, func, null, nullsfirst, nullslast, or_, and_, not_

db_path_flask_app = "sqlite:///../python/data/twitter_trends.db"
app.config['SQLALCHEMY_DATABASE_URI'] = os.environ.get('DATABASE_URL', '') or db_path_flask_app
# app.config['SQLALCHEMY_DATABASE_URI'] = db_path_flask_app

In [4]:
# Flask-SQLAlchemy database
db = SQLAlchemy(app)

  'SQLALCHEMY_TRACK_MODIFICATIONS adds significant overhead and '


In [5]:
class Tweet(db.Model):    
    __tablename__ = 'tweets'    
    # Defining the columns for the table 'tweets',    
    # which will hold tweets associated the search terms in the 'trends' table,    
    # which are referred to in that table as "twitter_tweet_name" 
    
    id = db.Column(db.Integer, primary_key=True)    
    updated_at = db.Column( db.DateTime )    
    tweet_id = db.Column( db.Integer )    
    tweet_id_str = db.Column( db.String(50), unique=True, nullable=False ) 
    
    # tweet_search_term = db.Column(db.Integer, db.ForeignKey('trends.twitter_tweet_name') )    
    tweet_search_term = db.Column(db.String(250))    
    tweet_created_at = db.Column(db.String(100))    
    tweet_is_a_quote_flag = db.Column( db.Boolean )    
    tweet_is_a_retweet_flag = db.Column( db.Boolean )    
    tweet_entities_hashtags_count = db.Column( db.Integer )    
    tweet_entities_user_mentions_count = db.Column( db.Integer )    
    tweet_favorite_counts = db.Column( db.Integer )    
    tweet_retweet_counts = db.Column( db.Integer )    
    tweet_lang = db.Column( db.String(10) )    
    tweet_source = db.Column(db.String(250))    
    tweet_text = db.Column(db.String(250))    
    tweet_user_id = db.Column( db.Integer )    
    tweet_user_id_str = db.Column( db.String(50) )    
    tweet_user_created_at = db.Column(db.String(100))    
    tweet_user_lang = db.Column( db.String(10) )    
    tweet_user_name = db.Column( db.String(100) )    
    tweet_user_screen_name = db.Column( db.String(100) )    
    tweet_user_description = db.Column( db.String(250) )    
    tweet_user_statuses_count = db.Column( db.Integer )    
    tweet_user_favourites_count = db.Column( db.Integer )    
    tweet_user_followers_count = db.Column( db.Integer )    
    tweet_user_friends_count = db.Column( db.Integer )    
    tweet_user_listed_count = db.Column( db.Integer )    
    
    def __repr__(self):        
        return f"<Tweet {self.tweet_search_term}: {self.tweet_id} [updated_at: {self.updated_at}>"

In [6]:
# Database schema for Twitter 'locations' table
class Location(db.Model):
    __tablename__ = 'locations'
    
    # Defining the columns for the table 'locations',
    # which will hold all of the locations in the U.S. for which
    # top trends data is available, as well as location specific
    # info like latitude/longitude
    id = db.Column(db.Integer, primary_key=True)
    updated_at = db.Column( db.DateTime )
    woeid = db.Column(db.Integer, unique=True, nullable=False)
    twitter_country = db.Column(db.String(100))
    tritter_country_code = db.Column(db.String(10))
    twitter_name = db.Column(db.String(250))
    twitter_parentid = db.Column(db.Integer)
    twitter_type = db.Column(db.String(50))
    country_name = db.Column(db.String(250))
    country_name_only = db.Column(db.String(250))
    country_woeid = db.Column(db.Integer)
    county_name = db.Column(db.String(250))
    county_name_only = db.Column(db.String(250))
    county_woeid = db.Column(db.Integer)
    latitude = db.Column(db.Float)
    longitude = db.Column(db.Float)
    name_full = db.Column(db.String(250))
    name_only = db.Column(db.String(250))
    name_woe = db.Column(db.String(250))
    place_type = db.Column(db.String(250))
    state_name = db.Column(db.String(250))
    state_name_only = db.Column(db.String(250))
    state_woeid = db.Column(db.Integer)
    timezone = db.Column(db.String(250))

    my_trends = db.relationship('Trend', backref=db.backref('my_location', lazy=True))
    
    def __repr__(self):
        return f"<Location {self.name_full} [updated_at: {self.updated_at}>"

# Database schema for Twitter 'trends' table
class Trend(db.Model):
    __tablename__ = 'trends'
    
    # Defining the columns for the table 'trends',
    # which will hold all of the top trends associated with
    # locations in the 'locations' table
    id = db.Column(db.Integer, primary_key=True)
    updated_at = db.Column( db.DateTime )
    woeid = db.Column(db.Integer, db.ForeignKey('locations.woeid') )
    twitter_as_of = db.Column(db.String(100))
    twitter_created_at = db.Column(db.String(100))
    twitter_name = db.Column(db.String(250))
    twitter_tweet_name = db.Column(db.String(250))
    twitter_tweet_promoted_content = db.Column(db.String(250))
    twitter_tweet_query = db.Column(db.String(250))
    twitter_tweet_url = db.Column(db.String(250))
    twitter_tweet_volume = db.Column(db.Float)

    # With more investigation, determined this is an
    # incorrect usage of relationship method below - removing it
    # locations = db.relationship('Location', backref=db.backref('trends', lazy=True))
     
    def __repr__(self):
        return f"<Trend {self.my_location.name_full}: {self.twitter_tweet_name} [updated_at: {self.updated_at}>"


In [7]:
# In the app code for this route, there is no need to do anything special
a_tweet = "#OpeningDay"

# Return a list of all locations that have the specified tweet in its top trends
# and then sort the results by tweet volume in descending order
results = db.session.query(Trend, Location).join(Location) \
                    .filter(Trend.twitter_tweet_name == a_tweet ) \
                    .order_by( Trend.twitter_tweet_volume.desc() ).all()

loc_list = []
for r in results:
    #print(f"Trend Information for {r.Trend.woeid} {r.Location.name_full}: {r.Trend.twitter_tweet_name} {r.Trend.twitter_tweet_volume}")
    loc_info = {
        'woeid': r.Location.woeid,
        'latitude': r.Location.latitude,
        'longitude': r.Location.longitude,
        'name_full': r.Location.name_full,
        'name_only': r.Location.name_only,
        'name_woe': r.Location.name_woe,
        'county_name': r.Location.county_name,
        'county_name_only': r.Location.county_name_only,
        'county_woeid': r.Location.county_woeid,
        'state_name': r.Location.state_name,
        'state_name_only': r.Location.state_name_only,
        'state_woeid': r.Location.state_woeid,
        'country_name': r.Location.country_name,
        'country_name_only': r.Location.country_name_only,
        'country_woeid': r.Location.country_woeid,
        'place_type': r.Location.place_type,
        'timezone': r.Location.timezone,
        'twitter_type': r.Location.twitter_type,
        'twitter_country': r.Location.twitter_country,
        'tritter_country_code': r.Location.tritter_country_code,
        'twitter_parentid': r.Location.twitter_parentid,

        'twitter_as_of': r.Trend.twitter_as_of,
        'twitter_created_at': r.Trend.twitter_created_at,
        'twitter_name': r.Trend.twitter_name,
        'twitter_tweet_name': r.Trend.twitter_tweet_name,
        'twitter_tweet_promoted_content': r.Trend.twitter_tweet_promoted_content,
        'twitter_tweet_query': r.Trend.twitter_tweet_query,
        'twitter_tweet_url': r.Trend.twitter_tweet_url,
        'twitter_tweet_volume': r.Trend.twitter_tweet_volume
    }

    loc_list.append(loc_info)

In [21]:
results = db.session.query(Tweet.tweet_user_id_str, \
                        func.max(Tweet.id).label("id"), \
                        func.max(Tweet.tweet_user_id_str).label("tweet_user_id_str"), \
                        func.max(Tweet.tweet_user_statuses_count).label("tweet_user_statuses_count"), \
                        func.max(Tweet.tweet_user_favourites_count).label("tweet_user_favourites_count"), \
                        func.max(Tweet.tweet_user_followers_count).label("tweet_user_followers_count"), \
                        func.max(Tweet.tweet_user_friends_count).label("tweet_user_friends_count"), \
                        func.max(Tweet.tweet_user_listed_count).label("tweet_user_listed_count"), \
                        func.max(Tweet.tweet_retweet_counts).label("tweet_retweet_counts")) \
                        .group_by(Tweet.tweet_user_id_str) \
                        .all()        

tweet_list = []    

for r in results:        
    tweet_info = { 'id': r.id,            
                  'tweet_user_id_str' : r.tweet_user_id_str ,                                           
                  'tweet_user_statuses_count': r.tweet_user_statuses_count ,            
                  'tweet_user_favourites_count' : r.tweet_user_favourites_count ,            
                  'tweet_user_followers_count' : r.tweet_user_followers_count ,            
                  'tweet_user_friends_count' : r.tweet_user_friends_count ,            
                  'tweet_user_listed_count' : r.tweet_user_listed_count,    
                  'tweet_retweet_counts' : r.tweet_retweet_counts}        
    tweet_list.append(tweet_info)

In [22]:
pprint(len(tweet_list))

32049


In [23]:
tweet = pd.DataFrame.from_dict(tweet_list)
tweet.head()

Unnamed: 0,id,tweet_retweet_counts,tweet_user_favourites_count,tweet_user_followers_count,tweet_user_friends_count,tweet_user_id_str,tweet_user_listed_count,tweet_user_statuses_count
0,26060,91,773,250,196,1000070228263493633,1,870
1,20218,31,5091,294,333,1000075196827566080,0,3260
2,34658,51,2154,65,128,1000088820652675072,0,279
3,33000,0,19169,532,286,1000090083821785088,2,3491
4,24629,1389,1084,41,143,1000128848455847936,0,3937


In [24]:
tweet.describe()

Unnamed: 0,id,tweet_retweet_counts,tweet_user_favourites_count,tweet_user_followers_count,tweet_user_friends_count,tweet_user_listed_count,tweet_user_statuses_count
count,32049.0,32049.0,32049.0,32049.0,32049.0,32049.0,32049.0
mean,19912.455989,1294.090393,16243.79,10806.45,1575.968,95.979188,25626.19
std,11287.823288,11921.900995,37453.11,326952.0,8848.977,1225.066403,69770.79
min,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,10214.0,0.0,648.0,120.0,185.0,0.0,1357.0
50%,20099.0,3.0,3837.0,407.0,472.0,4.0,6246.0
75%,29682.0,73.0,14962.0,1417.0,1215.0,28.0,22990.0
max,39031.0,533302.0,1005418.0,41865320.0,1057259.0,138404.0,3097116.0


In [28]:
tweet["success"] = np.where(tweet["tweet_retweet_counts"] > 99, 1, 0)

In [29]:
tweet.head()

Unnamed: 0,id,tweet_retweet_counts,tweet_user_favourites_count,tweet_user_followers_count,tweet_user_friends_count,tweet_user_id_str,tweet_user_listed_count,tweet_user_statuses_count,success
0,26060,91,773,250,196,1000070228263493633,1,870,0
1,20218,31,5091,294,333,1000075196827566080,0,3260,0
2,34658,51,2154,65,128,1000088820652675072,0,279,0
3,33000,0,19169,532,286,1000090083821785088,2,3491,0
4,24629,1389,1084,41,143,1000128848455847936,0,3937,1


In [30]:
# Assign X (data) and y (target)
X = tweet.drop(["id","tweet_user_id_str","tweet_retweet_counts","success"], axis=1)
y = tweet["success"]
print(X.shape, y.shape)

(32049, 5) (32049,)


Split our data into training and testing

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

Create a Logistic Regression Model

In [32]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Fit (train) or model using the training data

In [33]:
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Validate the model using the test data

In [34]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7661008487269096
Testing Data Score: 0.7675028079371022


Make predictions

In [35]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


In [19]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,0,1
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [36]:
confusion_matrix(y_test, predictions)

array([[6121,   25],
       [1838,   29]], dtype=int64)

In [37]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f"tn: {tn:3d}   fp: {fp:3d}")
print(f"fn: {fn:3d}   tp: {tp:3d}")

tn: 6121   fp:  25
fn: 1838   tp:  29


In [39]:
from joblib import dump, load
dump(classifier, 'influencer.model') 


['influencer.model']