# Fit a Model and Predict Home State Electoral College Results
This notebook attempts to create a model that can predict whether a tweet came from a state that voted Republican or Democratic during the 2016 Presidential election.

In [15]:
import pandas as pd
import mlutils
from sklearn import svm, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import json
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [16]:
# Single seed reused wherever randomness is involved, so reruns give the same results.
RANDOM_SEED: int = 655

## Read Cleaned Tweet Dataset

In [17]:
# Load the cleaned tweets and keep only those whose text mentions "nike".
df = pd.read_json("./intermediate_data/cleaned_tweet_data.json")

# na=False counts rows with missing text as non-matches, so the mask is purely
# boolean and can be used for indexing; regex=False matches "nike" as a literal
# substring. The match remains case-sensitive.
mentions_nike = df['tweet_full_text'].str.contains("nike", na=False, regex=False)

# .copy() makes df_nike an independent frame instead of a slice of df.
df_nike = df[mentions_nike].copy()

In [18]:
# Preview the first five rows of the Nike subset.
df_nike.head(n=5)

Unnamed: 0,user_created_at,tweet_full_text,tweet_favorite_count,tweet_created_at,user_name,user_profile_image_url_https,user_profile_sidebar_border_color,user_profile_sidebar_fill_color,user_profile_text_color,user_profile_use_background_image,user_screen_name,user_profile_background_color,user_friends_count,user_followers_count,user_description,user_location,user_location_State,state_political_values,tweet_w_user_descript,tweet_favorited
1,2008-12-26 09:30:23+00:00,hero fdny likesforlikes promo music instagood ...,0,2018-09-07 16:24:59+00:00,Yung Cut Up (Videos),https://pbs.twimg.com/profile_images/945333114...,FFFFFF,EFEFEF,333333,True,yungcutup,131516,5489,13241,all business inquiries contact cluuxxgmail.com...,"Miami, Florida",FL,Republican,hero fdny likesforlikes promo music instagood ...,False
5,2008-07-23 16:43:42+00:00,real donald trump it's time for me to stock up...,0,2018-09-07 16:24:35+00:00,tazman69,https://pbs.twimg.com/profile_images/743752426...,C0DEED,DDEEF6,333333,True,tazman69,C0DEED,175,64,"enjoys cycling, running & spending a relaxing ...","Austin, TX",TX,Republican,real donald trump it's time for me to stock up...,False
7,2016-06-17 17:22:12+00:00,"nike good job on shaking the whole world, i se...",0,2018-09-07 16:24:13+00:00,Jim Christopher Aure,https://pbs.twimg.com/profile_images/935288930...,C0DEED,DDEEF6,333333,True,jaure302,F5F8FA,129,88,barber pole barber pole barber pole,"Newark, DE",DE,Democrat,"nike good job on shaking the whole world, i se...",False
11,2012-03-27 22:35:39+00:00,colin kaepernick's business partner nike sends...,0,2018-09-07 16:23:40+00:00,2nd Vote,https://pbs.twimg.com/profile_images/282028719...,FFFFFF,DDEEF6,333333,True,2ndVote,FFFFFF,2433,3298,conservative watchdog for corporate activism. ...,"Nashville, Tenn.",TN,Republican,colin kaepernick's business partner nike sends...,False
17,2013-01-16 21:46:58+00:00,hey nike i stood for our country. i served my ...,1,2018-09-07 16:23:09+00:00,James,https://pbs.twimg.com/profile_images/561207015...,000000,EFEFEF,333333,True,NoGunsNoGlory,131516,17225,19427,i am toxic masculinity. i make my own bbq sauc...,Rural Texas,TX,Republican,hey nike i stood for our country. i served my ...,True


## Split Dataset into Training, Dev, and Test Sets

In [19]:
# Shuffle once with a fixed seed, then cut the shuffled frame into 80% train,
# 10% dev and 10% test. Plain positional slicing is used instead of np.split,
# which relies on the deprecated DataFrame.swapaxes when given a DataFrame.
shuffled_df = df_nike.sample(frac=1, random_state=RANDOM_SEED)

n_rows = len(shuffled_df)
train_end = int(.8 * n_rows)
dev_end = int(.9 * n_rows)

train_df = shuffled_df.iloc[:train_end]
dev_df = shuffled_df.iloc[train_end:dev_end]
test_df = shuffled_df.iloc[dev_end:]

# The three pieces must partition the shuffled data exactly.
assert len(train_df) + len(dev_df) + len(test_df) == n_rows

## Convert Text Data to Features Using a TF-IDF Unigram and Bigram Vectorizer

In [20]:
# TF-IDF features over unigrams and bigrams. English stop words are removed and
# min_df=500 drops terms that appear in fewer than 500 training tweets.
bigram_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=500,
    stop_words='english',
)
# Learn the vocabulary from the training tweets only, then vectorize them.
X_train = bigram_vectorizer.fit_transform(train_df["tweet_full_text"])

# Labels: the political leaning recorded for each tweet's home state.
y_train = train_df["state_political_values"].tolist()

## Train a Random Forest Classifier

In [21]:
# Shallow random forest (depth capped at 5), seeded for reproducibility.
forest = RandomForestClassifier(random_state=RANDOM_SEED, max_depth=5)
# fit() returns the fitted estimator itself, so clf is the trained model.
clf = forest.fit(X_train, y_train)

## Generate Dev Data

In [22]:
# Vectorize the dev tweets with the already-fitted vectorizer (transform only,
# no refitting), so dev features share the training vocabulary.
X_dev = bigram_vectorizer.transform(dev_df["tweet_full_text"])

# Matching dev labels.
y_dev = dev_df["state_political_values"].tolist()

## Create Dummy Classifiers
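Dummy classifiers provide baselines: they show how well a classifier would score if it made predictions using simple rules that ignore the input features, so the model's results can be compared against them. For example, the "most frequent" strategy always predicts the most frequent y value in the training set, while the "uniform" strategy picks a class uniformly at random.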

In [23]:
# Two baseline classifiers to compare the model against: one always predicts the
# most frequent training label, the other guesses uniformly at random.
dummy_clf_most_frequent = DummyClassifier(random_state=RANDOM_SEED, strategy="most_frequent")
dummy_clf_uniform = DummyClassifier(random_state=RANDOM_SEED, strategy="uniform")

# Fit both on the training data; the final fit is the cell's displayed value.
dummy_clf_most_frequent.fit(X=X_train, y=y_train)
dummy_clf_uniform.fit(X=X_train, y=y_train)

DummyClassifier(random_state=655, strategy='uniform')

## Create Predictions for Dev Set

In [24]:
# Baseline predictions on the dev set.
mf_dev_preds = dummy_clf_most_frequent.predict(X=X_dev)
rand_dev_preds = dummy_clf_uniform.predict(X=X_dev)

# Model predictions on the dev set.
# NOTE(review): the "lr_" prefix is misleading; these come from `clf`. The name
# is kept because later cells reference it.
lr_dev_preds = clf.predict(X=X_dev)

## Score Predictions for Dev Set

In [25]:
# Macro-averaged F1 on the dev set for the model and both baselines, in the
# order: model, random baseline, most-frequent baseline.
lr_f1, rand_f1, mf_f1 = (
    f1_score(y_dev, preds, average='macro')
    for preds in (lr_dev_preds, rand_dev_preds, mf_dev_preds)
)

In [26]:
# Report the model's dev-set score next to the two baseline scores.
for label, score in (
    ("Model Score:", lr_f1),
    ("Dummy Score (random):", rand_f1),
    ("Dummy Score (most frequent):", mf_f1),
):
    print(label, score)

Model Score: 0.4739130434782609
Dummy Score (random): 0.5330188679245282
Dummy Score (most frequent): 0.3529411764705882


## Results
The random forest model was not able to outperform the F1 score of the random dummy classifier. This suggests that a tweet's text is unlikely to have strong predictive power when trying to determine the political leaning of the state it came from. This implies that if Nike is doing targeted marketing on Twitter with politicized messages, it should not segment its messages by the political leaning of the states where Twitter users reside. A state's political leaning does not appear to be an accurate measure of its users' reaction to the campaign or their perspective on Nike products.

---

**Author:** [Nick Capaldini](mailto:nickcaps@umich.edu), University of Michigan, January 19, 2022

---