In [1]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score

### Warren

In [2]:
warren_tweets = pickle.load(open('./tweetsent/tweets/old/warren_tweets_old.pkl', 'rb'))
warren_senti_scores = pickle.load(open('./tweetsent/senti_scores/warren_senti_scores.pkl', 'rb'))

In [3]:
# Finding Thresholds
num_retweets_warren = np.array([warren_tweets[i]['retweet_count']
                                for i in range(len(warren_tweets))])

In [4]:
# Finding max character
warren_max_char=0 #316
for i in warren_tweets[0:]:
    warren_max_char = max(warren_max_char, warren_senti_scores[i['full_text']]['usage']['text_characters'])

In [5]:
# Create a corpus
corpus = set()
for tweet in warren_senti_scores:
    corpus.update({i['text'] for i in warren_senti_scores[tweet]['keywords']})
warren_sorted_corpus = sorted(corpus)

with open('FeatureData/warren_corpus.pk', 'wb') as file:
    pickle.dump(warren_sorted_corpus, file)

with open('FeatureData/warren_corpus.pk', 'rb') as file:
    warren_sorted_corpus = pickle.load(file)    

In [71]:
len(warren_sorted_corpus)

5712

In [6]:
# Create Feature Matrix
warren_features = []
warren_labels = []
warren_feature_names = ['sadness', 'joy', 'fear', 'disgust', 'anger',
                        'sentiment', 'character'] + [i for i in warren_sorted_corpus]

for i in warren_tweets:
    # Ambigious discarded Binary Labels
    if i['retweet_count'] <= 1083: #1083
        warren_labels.append(-1)
    elif i['retweet_count'] >= 1614: #1614
        warren_labels.append(1)
    else:
        continue
    
    # Feature
    tweet_feature = []
    for j,k in warren_senti_scores[i['full_text']]['emotion']['document']['emotion'].items():
        tweet_feature.append(k)
    tweet_feature.append(warren_senti_scores[i['full_text']]['sentiment']['document']['score'])
    warren_feature_names.append('sentiment')
    
    tweet_feature.append(warren_senti_scores[i['full_text']]['usage']['text_characters']/warren_max_char)
    warren_feature_names.append('character')
    
    # One-hot Encoded Features
    text_relevance = dict({sent['text']:sent['relevance'] for sent in warren_senti_scores[i['full_text']]['keywords']})
    tweet_onehot=[]
    for keys in warren_sorted_corpus:
        
        tweet_onehot.append(0 if keys not in text_relevance.keys() else text_relevance[keys])
    tweet_feature.extend(tweet_onehot)
    
    # Add all to features matrix
    warren_features.append(tweet_feature)

with open('FeatureData/warren_features.pk', 'wb') as file:
    pickle.dump([warren_features, warren_feature_names, warren_labels], file)
with open('FeatureData/warren_features.pk', 'rb') as file:
    warren_features, warren_feature_names, warren_labels = pickle.load(file)

In [72]:
sum(warren_labels)

4

In [7]:
X_train_warren, X_test_warren, y_train_warren, y_test_warren = train_test_split(warren_features, warren_labels, test_size=1/3, random_state=42)

In [63]:
lr_warren = LogisticRegression(C=2.0)
lr_warren.fit(X_train_warren, y_train_warren)
lr_warren.score(X_test_warren, y_test_warren)
print(f1_score(lr_warren.predict(X_test_warren), y_test_warren))
print(f1_score(lr_warren.predict(X_train_warren), y_train_warren))

0.6771929824561403
0.9828767123287672


In [110]:
warren_train_acc = lr_warren.score(X_train_warren, y_train_warren)
warren_test_acc = lr_warren.score(X_test_warren, y_test_warren)
warren_train_f1 = f1_score(lr_warren.predict(X_test_warren), y_test_warren)
warren_test_f1 = f1_score(lr_warren.predict(X_train_warren), y_train_warren)
with open('evaluate/warren_evaluate.pk', 'wb') as file:
    pickle.dump([warren_train_acc, warren_test_acc, warren_train_f1, warren_test_f1], file)

In [124]:
print("\t","Train Acc\t", "Test Acc\t", "Train F1 Score\t", "Test F1 Score")
print("Warren\t", '{:3.4f}'.format(warren_train_acc), "\t", '{:3.4f}'.format(warren_test_acc), "\t",
      '{:3.4f}'.format(warren_train_f1), "\t", '{:3.4f}'.format(warren_test_f1))

	 Train Acc	 Test Acc	 Train F1 Score	 Test F1 Score
Warren	 0.9828 	 0.6833 	 0.6772 	 0.9829


In [77]:
lr_warren.coef_[0][:10]

array([-0.00765621,  0.46294905,  0.79182539,  1.60234741,  1.22169369,
       -0.3738697 ,  0.13445727, -0.08218968,  0.        ,  0.        ])

In [101]:
sorted(list(zip(lr_warren.coef_[0], warren_feature_names)), key=lambda x: x[0], reverse=True)[:10]

[(1.7616293778995165, '@realDonaldTrump'),
 (1.709477065578446, 'Brett Kavanaugh'),
 (1.658493776949671, 'Coretta Scott King'),
 (1.6023474110525664, 'disgust'),
 (1.4814470638650068, 'Jeff Sessions'),
 (1.4751044536089133, 'Tonight'),
 (1.4701056189010773, 'fight'),
 (1.4381140949258584, 'Russia'),
 (1.428683080655806, 'Affordable Care Act'),
 (1.3593645411876183, 'plan')]

In [64]:
svm_warren = SVC(C=4.0, kernel='linear') # rbf -> .50, linear -> 0.652
svm_warren.fit(X_train_warren, y_train_warren)
svm_warren.score(X_test_warren, y_test_warren)
f1_score(svm_warren.predict(X_test_warren), y_test_warren)
print(f1_score(svm_warren.predict(X_train_warren), y_train_warren))

0.9982788296041308


In [11]:
# Dump and load to pickle file.
with open('Predictions/warren_LR.pk', 'wb') as file:
    pickle.dump(lr_warren, file)
with open('Predictions/warren_SVM.pk', 'wb') as file:
    pickle.dump(svm_warren, file)

with open('Predictions/warren_LR.pk', 'rb') as file: 
    lr_warren = pickle.load(file)    
with open('Predictions/warren_SVM.pk', 'rb') as file:    
    svm_warren = pickle.load(file)

### Biden

In [12]:
biden_tweets = pickle.load(open('./tweetsent/tweets/old/biden_tweets_old.pkl', 'rb'))
biden_senti_scores = pickle.load(open('./tweetsent/senti_scores/biden_senti_scores.pkl', 'rb'))

In [13]:
# Finding Thresholds
num_retweets_biden = np.array([biden_tweets[i]['retweet_count']
                               for i in range(len(biden_tweets))])

In [14]:
# Finding max character
biden_max_char=0 #315
for i in biden_tweets[0:]:
    biden_max_char = max(biden_max_char, biden_senti_scores[i['full_text']]['usage']['text_characters'])

In [15]:
# Create a corpus
corpus = set()
for tweet in biden_senti_scores:
    corpus.update({i['text'] for i in biden_senti_scores[tweet]['keywords']})
biden_sorted_corpus = sorted(corpus)

with open('FeatureData/biden_corpus.pk', 'wb') as file:
    pickle.dump(biden_sorted_corpus, file)

with open('FeatureData/biden_corpus.pk', 'rb') as file:
    biden_sorted_corpus = pickle.load(file)    

In [113]:
# Create Feature Matrix
biden_features = []
biden_labels = []
biden_feature_names = ['sadness', 'joy', 'fear', 'disgust', 'anger',
                        'sentiment', 'character'] + [i for i in biden_sorted_corpus]

for i in biden_tweets:
    # Ambigious discarded Binary Labels
    if i['retweet_count'] <= 208: #247: #302:
        biden_labels.append(-1)
    elif i['retweet_count'] >= 302: # 398: #784
        biden_labels.append(1)
    else:
        continue
    
    # Feature
    tweet_feature = []
    for j,k in biden_senti_scores[i['full_text']]['emotion']['document']['emotion'].items():
        tweet_feature.append(k)
    tweet_feature.append(biden_senti_scores[i['full_text']]['sentiment']['document']['score'])
    biden_feature_names.append('sentiment')
    
    tweet_feature.append(biden_senti_scores[i['full_text']]['usage']['text_characters']/biden_max_char)
    biden_feature_names.append('character')
    
    # One-hot Encoded Features
    text_relevance = dict({sent['text']:sent['relevance'] for sent in biden_senti_scores[i['full_text']]['keywords']})
    tweet_onehot=[]
    for keys in biden_sorted_corpus:
        
        tweet_onehot.append(0 if keys not in text_relevance.keys() else text_relevance[keys])
    tweet_feature.extend(tweet_onehot)
    
    # Add all to features matrix
    biden_features.append(tweet_feature)

with open('FeatureData/biden_features.pk', 'wb') as file:
    pickle.dump([biden_features, biden_feature_names, biden_labels], file)

with open('FeatureData/biden_features.pk', 'rb') as file:
    biden_features, biden_feature_names, biden_labels = pickle.load(file)

In [17]:
sum(biden_labels)

-114

In [18]:
X_train_biden, X_test_biden, y_train_biden, y_test_biden = train_test_split(biden_features, biden_labels, test_size=1/3, random_state=42)

In [65]:
lr_biden = LogisticRegression(C=2.0)
lr_biden.fit(X_train_biden, y_train_biden)
lr_biden.score(X_test_biden, y_test_biden)
f1_score(lr_biden.predict(X_test_biden), y_test_biden)
f1_score(lr_biden.predict(X_train_biden), y_train_biden)

0.9408224674022067

In [109]:
biden_train_acc = lr_biden.score(X_train_biden, y_train_biden)
biden_test_acc = lr_biden.score(X_test_biden, y_test_biden)
biden_train_f1 = f1_score(lr_biden.predict(X_test_biden), y_test_biden)
biden_test_f1 = f1_score(lr_biden.predict(X_train_biden), y_train_biden)
with open('evaluate/biden_evaluate.pk', 'wb') as file:
    pickle.dump([biden_train_acc, biden_test_acc, biden_train_f1, biden_test_f1], file)

In [79]:
biden_feature_names[:10]

['sadness',
 'joy',
 'fear',
 'disgust',
 'anger',
 'sentiment',
 'character',
 '#1010means',
 '#1is2many',
 '#AARPIowaForum']

In [78]:
lr_biden.coef_[0][:10]

array([ 0.31148925,  0.88325437,  0.81126871,  0.43724336,  0.29143923,
       -0.32549082,  1.98244647, -0.17973047, -0.06138752,  0.        ])

In [93]:
sorted(list(zip(lr_biden.coef_[0], biden_feature_names)), key=lambda x: x[0], reverse=True)[:10]

[(2.0865519399851644, 'Hillary'),
 (1.982446467185269, 'character'),
 (1.9547568908522612, 'tonight'),
 (1.7145053873530272, 'friend'),
 (1.5651393775243028, 'America'),
 (1.502783331318645, 'Donald Trump'),
 (1.4938376603200787, 'Jill'),
 (1.4203565942119496, 'families'),
 (1.4163040779048284, 'South Carolina'),
 (1.3767573309764107, 'President Trump')]

In [66]:
svm_biden = SVC(C=4.0, kernel='linear') # rbf -> .86, linear -> 0.85
svm_biden.fit(X_train_biden, y_train_biden)
svm_biden.score(X_test_biden, y_test_biden)
f1_score(svm_biden.predict(X_test_biden), y_test_biden)
f1_score(svm_biden.predict(X_train_biden), y_train_biden)

0.9814995131450828

In [21]:
# Dump and load to pickle file.
with open('Predictions/biden_LR.pk', 'wb') as file:
    pickle.dump(lr_biden, file)
with open('Predictions/biden_SVM.pk', 'wb') as file:
    pickle.dump(svm_biden, file)

with open('Predictions/biden_LR.pk', 'rb') as file: 
    lr_biden = pickle.load(file)    
with open('Predictions/biden_SVM.pk', 'rb') as file:    
    svm_biden = pickle.load(file)

### Bernie

In [22]:
bernie_tweets = pickle.load(open('./tweetsent/tweets/old/bernie_tweets_old.pkl', 'rb'))
bernie_senti_scores = pickle.load(open('./tweetsent/senti_scores/bernie_senti_scores.pkl', 'rb'))

In [23]:
# Finding Thresholds
num_retweets_bernie = np.array([bernie_tweets[i]['retweet_count']
                               for i in range(len(bernie_tweets))])

In [24]:
# Finding max character
bernie_max_char=0 # 304
for i in bernie_tweets:
    bernie_max_char = max(bernie_max_char, bernie_senti_scores[i['full_text']]['usage']['text_characters'])

In [25]:
# Create a corpus
corpus = set()
for tweet in bernie_senti_scores:
    corpus.update({i['text'] for i in bernie_senti_scores[tweet]['keywords']})
bernie_sorted_corpus = sorted(corpus)

with open('FeatureData/bernie_corpus.pk', 'wb') as file:
    pickle.dump(bernie_sorted_corpus, file)
    
with open('FeatureData/bernie_corpus.pk', 'rb') as file:
    bernie_sorted_corpus = pickle.load(file)  

In [111]:
# Create Feature Matrix
bernie_features = []
bernie_labels = []
bernie_feature_names = ['sadness', 'joy', 'fear', 'disgust', 'anger',
                        'sentiment', 'character'] + [i for i in bernie_sorted_corpus]

for i in bernie_tweets:
    # Ambigious discarded Binary Labels
    if i['retweet_count'] <= 1080:
        bernie_labels.append(-1)
    elif i['retweet_count'] >= 1612:
        bernie_labels.append(1)
    else:
        continue
    
    # Feature
    tweet_feature = []
    for j,k in bernie_senti_scores[i['full_text']]['emotion']['document']['emotion'].items():
        tweet_feature.append(k)
    tweet_feature.append(bernie_senti_scores[i['full_text']]['sentiment']['document']['score'])
    bernie_feature_names.append('sentiment')
    
    tweet_feature.append(bernie_senti_scores[i['full_text']]['usage']['text_characters']/bernie_max_char)
    bernie_feature_names.append('character')
    
    # One-hot Encoded Features
    text_relevance = dict({sent['text']:sent['relevance'] for sent in bernie_senti_scores[i['full_text']]['keywords']})
    tweet_onehot=[]
    for keys in bernie_sorted_corpus:
        
        tweet_onehot.append(0 if keys not in text_relevance.keys() else text_relevance[keys])
    tweet_feature.extend(tweet_onehot)
    
    # Add all to features matrix
    bernie_features.append(tweet_feature)

with open('FeatureData/bernie_features.pk', 'wb') as file:
    pickle.dump([bernie_features, bernie_feature_names, bernie_labels], file)

with open('FeatureData/bernie_features.pk', 'rb') as file:
    bernie_features, bernie_feature_names, bernie_labels = pickle.load(file)

In [27]:
sum(bernie_labels)

-28

In [28]:
X_train_bernie, X_test_bernie, y_train_bernie, y_test_bernie = train_test_split(bernie_features, bernie_labels, test_size=1/3, random_state=42)

In [67]:
lr_bernie = LogisticRegression(C=2.0)
lr_bernie.fit(X_train_bernie, y_train_bernie)
lr_bernie.score(X_test_bernie, y_test_bernie)
f1_score(lr_bernie.predict(X_test_bernie), y_test_bernie)
f1_score(lr_bernie.predict(X_train_bernie), y_train_bernie)

0.9720998531571219

In [108]:
bernie_train_acc = lr_bernie.score(X_train_bernie, y_train_bernie)
bernie_test_acc = lr_bernie.score(X_test_bernie, y_test_bernie)
bernie_train_f1 = f1_score(lr_bernie.predict(X_test_bernie), y_test_bernie)
bernie_test_f1 = f1_score(lr_bernie.predict(X_train_bernie), y_train_bernie)
with open('evaluate/bernie_evaluate.pk', 'wb') as file:
    pickle.dump([bernie_train_acc, bernie_test_acc, bernie_train_f1, bernie_test_f1], file)

In [80]:
lr_bernie.coef_[0][:10]

array([ 0.38313301, -1.31607588,  0.06624864,  0.56532138,  0.96273504,
       -0.0046789 , -1.3072561 , -0.14151721,  0.        ,  0.04517574])

In [92]:
sorted(list(zip(lr_bernie.coef_[0], bernie_feature_names)), key=lambda x: x[0], reverse=True)[:10]

[(1.5369562219748851, 'child'),
 (1.3101264200356815, 'hour'),
 (1.2471883236022463, 'Cardi B'),
 (1.2454094871199144, 'CEO'),
 (1.2069942736005168, 'dollars'),
 (1.1846947311483609, 'Republicans'),
 (1.1550651698827914, 'today'),
 (1.144075574059014, 'racist'),
 (1.1224045548406671, '21st Century Economic Bill of Rights'),
 (1.1183580225873515, 'fact')]

In [68]:
svm_bernie = SVC(C=4.0, kernel='linear') # rbf -> .86, linear -> 0.85
svm_bernie.fit(X_train_bernie, y_train_bernie)
svm_bernie.score(X_test_bernie, y_test_bernie)
f1_score(svm_bernie.predict(X_test_bernie), y_test_bernie)
f1_score(svm_bernie.predict(X_train_bernie), y_train_bernie)

0.9970414201183432

In [31]:
# Dump and load to pickle file.
with open('Predictions/bernie_LR.pk', 'wb') as file:
    pickle.dump(lr_bernie, file)
with open('Predictions/bernie_SVM.pk', 'wb') as file:
    pickle.dump(svm_bernie, file)

with open('Predictions/bernie_LR.pk', 'rb') as file: 
    lr_bernie = pickle.load(file)    
with open('Predictions/bernie_SVM.pk', 'rb') as file:    
    svm_bernie = pickle.load(file)

### Yang

In [39]:
yang_tweets = pickle.load(open('./tweetsent/tweets/old/yang_tweets_old.pkl', 'rb'))
yang_senti_scores = pickle.load(open('./tweetsent/senti_scores/yang_senti_scores.pkl', 'rb'))

In [40]:
# Finding Thresholds
num_retweets_yang = np.array([yang_tweets[i]['retweet_count']
                             for i in range(len(yang_tweets))])

In [41]:
# Finding max character
yang_max_char= 0 #329
for i in yang_tweets:
    yang_max_char = max(yang_max_char, yang_senti_scores[i['full_text']]['usage']['text_characters'])

In [42]:
# Create a corpus
corpus = set()
for tweet in yang_senti_scores:
    corpus.update({i['text'] for i in yang_senti_scores[tweet]['keywords']})
yang_sorted_corpus = sorted(corpus)

with open('FeatureData/yang_corpus.pk', 'wb') as file:
    pickle.dump(yang_sorted_corpus, file)

with open('FeatureData/yang_corpus.pk', 'rb') as file:
    yang_sorted_corpus = pickle.load(file)  

In [112]:
# Create Feature Matrix
yang_features = []
yang_labels = []
yang_feature_names = ['sadness', 'joy', 'fear', 'disgust', 'anger',
                        'sentiment', 'character'] + [i for i in yang_sorted_corpus]

for i in yang_tweets:
    # Ambigious discarded Binary Labels
    if i['retweet_count'] <= 335: #880:
        yang_labels.append(-1)
    elif i['retweet_count'] >= 524: #1612:
        yang_labels.append(1)
    else:
        continue
    
    # Feature
    tweet_feature = []
    for j,k in yang_senti_scores[i['full_text']]['emotion']['document']['emotion'].items():
        tweet_feature.append(k)
    tweet_feature.append(yang_senti_scores[i['full_text']]['sentiment']['document']['score'])
    yang_feature_names.append('sentiment')
    
    tweet_feature.append(yang_senti_scores[i['full_text']]['usage']['text_characters']/yang_max_char)
    yang_feature_names.append('character')
    
    # One-hot Encoded Features
    text_relevance = dict({sent['text']:sent['relevance'] for sent in yang_senti_scores[i['full_text']]['keywords']})
    tweet_onehot=[]
    for keys in yang_sorted_corpus:
        
        tweet_onehot.append(0 if keys not in text_relevance.keys() else text_relevance[keys])
    tweet_feature.extend(tweet_onehot)
    
    # Add all to features matrix
    yang_features.append(tweet_feature)

with open('FeatureData/yang_features.pk', 'wb') as file:
    pickle.dump([yang_features, yang_feature_names, yang_labels], file)

with open('FeatureData/yang_features.pk', 'rb') as file:
    yang_features, yang_feature_names, yang_labels = pickle.load(file)

In [56]:
sum(yang_labels)

-16

In [57]:
X_train_yang, X_test_yang, y_train_yang, y_test_yang = train_test_split(yang_features, yang_labels, test_size=1/3, random_state=42)

In [69]:
lr_yang = LogisticRegression(C=2.0)
lr_yang.fit(X_train_yang, y_train_yang)
lr_yang.score(X_test_yang, y_test_yang)
f1_score(lr_yang.predict(X_test_yang), y_test_yang)
f1_score(lr_yang.predict(X_train_yang), y_train_yang)

0.9282442748091604

In [106]:
yang_train_acc = lr_yang.score(X_train_yang, y_train_yang)
yang_test_acc = lr_yang.score(X_test_yang, y_test_yang)
yang_train_f1 = f1_score(lr_yang.predict(X_test_yang), y_test_yang)
yang_test_f1 = f1_score(lr_yang.predict(X_train_yang), y_train_yang)
with open('evaluate/yang_evaluate.pk', 'wb') as file:
    pickle.dump([yang_train_acc, yang_test_acc, yang_train_f1, yang_test_f1], file)

In [82]:
lr_yang.coef_[0][:10]

array([ 0.60769436, -0.89138319,  0.01539488,  2.22248272, -0.47985534,
       -1.04411181,  4.4340393 ,  0.33329203,  0.22658596,  0.        ])

In [104]:
sorted(list(zip(lr_yang.coef_[0], yang_feature_names)), key=lambda x: x[0], reverse=True)[:10]

[(4.434039303811506, 'character'),
 (2.2224827249064756, 'disgust'),
 (1.5713187667199533, 'Americans'),
 (1.3312580030438101, 'thanks'),
 (1.224293057473477, 'time'),
 (1.2216042992555196, 'People'),
 (1.2177627751945206, 'better win'),
 (1.1430301520268586, 'weed'),
 (1.130134091345457, 'Joe Biden'),
 (1.1005211217166224, 'lot')]

In [70]:
svm_yang = SVC(C=4.0, kernel='linear') # rbf -> .86, linear -> 0.85
svm_yang.fit(X_train_yang, y_train_yang)
svm_yang.score(X_test_yang, y_test_yang)
f1_score(svm_yang.predict(X_test_yang), y_test_yang)
f1_score(svm_yang.predict(X_train_yang), y_train_yang)

0.9742813918305598

In [62]:
# Dump and load to pickle file.
with open('Predictions/yang_LR.pk', 'wb') as file:
    pickle.dump(lr_yang, file)
with open('Predictions/yang_SVM.pk', 'wb') as file:
    pickle.dump(svm_yang, file)

with open('Predictions/yang_LR.pk', 'rb') as file: 
    lr_yang = pickle.load(file)    
with open('Predictions/yang_SVM.pk', 'rb') as file:    
    svm_yang = pickle.load(file)