-
Notifications
You must be signed in to change notification settings - Fork 20
/
retweet_probability.py
81 lines (68 loc) · 4.29 KB
/
retweet_probability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import json
import os
import math
import random
from relationship_trainer import RelationshipTrainer
# The probabilities are loaded into memory once, when this module is imported
# Location of the trained per-user statistics: the file named by
# RelationshipTrainer.TRAINING_RESULTS_FILENAME, resolved next to this module.
filepath = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    RelationshipTrainer.TRAINING_RESULTS_FILENAME,
)

# Parsed training results, looked up by username in retweet_probability().
# The `with` block closes the handle on exit, so no explicit close() is needed.
with open(filepath, 'r') as training_results_file:
    all_probabilities = json.load(training_results_file)
# This function returns the probability a given user will retweet a given tweet using
# the formula for Naive Bayes Classifiers
def retweet_probability(username, tweet):
    """Estimate how likely ``username`` is to retweet ``tweet``.

    Uses the Naive Bayes formula over the per-user statistics stored in the
    module-level ``all_probabilities`` mapping. A user with no entry gets a
    neutral model: every total is 1 and no words are known.

    Args:
        username: Key looked up in ``all_probabilities``.
        tweet: Text handed to ``RelationshipTrainer.tknzr.tokenize``.

    Returns:
        A dict with the keys ``positive``, ``negative``,
        ``retweet_probability_average``, ``retweet_probability``,
        ``multiplier``, ``factor_scale`` and ``token_specific_calculationss``
        (each one is described next to the return statement).
        ``factor_scale`` is ``-inf`` when ``retweet_probability`` is 0.
    """
    probabilities = all_probabilities.get(username, {
        # Totals default to 1 so that we never end up dividing by zero.
        'total_positive': 1,
        'total_negative': 1,
        'total': 1,
        'words': {},
    })
    tokens = RelationshipTrainer.tknzr.tokenize(tweet)

    # Stored totals that are 0 (or null) are likewise replaced by 1 so that we
    # never divide by zero.
    total_positive_token_occurences = float(probabilities['total_positive'] or 1)
    total_negative_token_occurences = float(probabilities['total_negative'] or 1)
    total_token_occurences = float(probabilities['total'] or 1)

    probability_class_is_positive = total_positive_token_occurences / total_token_occurences
    probability_class_is_negative = total_negative_token_occurences / total_token_occurences

    positive_prob = probability_class_is_positive
    negative_prob = probability_class_is_negative
    token_specific_calculations = {}
    # One (positive, negative) likelihood pair per token occurrence; only read
    # again if the running products underflow.
    token_likelihoods = []

    for token in tokens:
        token_counts = probabilities['words'].get(token, {'positive': 0, 'negative': 0, 'total': 0})
        # A missing or null total counts as 0 instead of raising TypeError.
        this_token_total_occurences = float(token_counts.get('total') or 0)

        # If we have no positive (negative) occurrences of the token, we use
        # the class prior in place of the count so that a single unseen token
        # does not send the whole product to 0.
        this_token_positive_occurences = float(token_counts.get('positive') or probability_class_is_positive)
        this_token_negative_occurences = float(token_counts.get('negative') or probability_class_is_negative)

        probability_this_token_positive = this_token_positive_occurences / total_positive_token_occurences
        probability_this_token_negative = this_token_negative_occurences / total_negative_token_occurences

        positive_prob = positive_prob * probability_this_token_positive
        negative_prob = negative_prob * probability_this_token_negative
        token_likelihoods.append((probability_this_token_positive, probability_this_token_negative))

        token_specific_calculations[token] = {
            'probability_is_positive': probability_this_token_positive,
            'probability_is_negative': probability_this_token_negative,
            'total_occurences': this_token_total_occurences,
        }

    retweet_probability_average = probability_class_is_positive / (
        probability_class_is_positive + probability_class_is_negative
    )

    if positive_prob == 0.0:
        # Every factor is expected to be > 0, so a zero product means the
        # multiplication underflowed. Recompute the posterior in log space
        # rather than reporting a spurious 0.
        tweet_probability = _posterior_from_log_odds(
            probability_class_is_positive,
            probability_class_is_negative,
            token_likelihoods,
        )
    else:
        tweet_probability = positive_prob / (negative_prob + positive_prob)

    multiplier = tweet_probability / retweet_probability_average
    if tweet_probability:
        factor_scale = multiplier - (retweet_probability_average / tweet_probability)
    else:
        # multiplier - 1 / multiplier tends to -inf as the multiplier tends to
        # 0; return that limit instead of raising ZeroDivisionError.
        factor_scale = float('-inf')

    return {
        # this term indicates the output for the naive bayesian classification as a positive class
        'positive': positive_prob,
        # this term indicates the output for the naive bayesian classification as a negative class
        'negative': negative_prob,
        # this term indicates the average probability of a tweet being retweeted, regardless of content
        'retweet_probability_average': retweet_probability_average,
        # this term indicates the probability of this specific tweet being retweeted
        'retweet_probability': tweet_probability,
        # this term indicates how much more likely a tweet is to be retweeted than the average tweet
        'multiplier': multiplier,
        # this term centers the multiplier around 0 so that a multiplier that does nothing (ie, is 1) is 0,
        # those that increase the probability are positive, and those that decrease the probability are negative
        'factor_scale': factor_scale,
        # this term returns the probabilities for the specific tokens present in the tweet
        # NOTE: the key is misspelled (double "s"); kept as-is because callers may read it.
        'token_specific_calculationss': token_specific_calculations,
    }


def _posterior_from_log_odds(prior_positive, prior_negative, token_likelihoods):
    """Return P(positive | tokens), computed in log space to avoid underflow."""
    log_odds = math.log(prior_positive) - math.log(prior_negative)
    for likelihood_positive, likelihood_negative in token_likelihoods:
        log_odds += math.log(likelihood_positive) - math.log(likelihood_negative)

    # Logistic function written so that exp() only ever sees a non-positive
    # argument and therefore cannot overflow.
    if log_odds >= 0:
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)