In [1]:
# List the current working directory to confirm the Google Drive mount
# (the `drive/` entry) is visible before changing into the project folder.
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [2]:
%cd "drive/My Drive/outrageclf/"

/content/drive/My Drive/outrageclf


In [3]:
# Build and install the outrageclf package from the current directory.
# NOTE(review): `setup.py install` is deprecated by setuptools, and `!python3`
# is not guaranteed to be the kernel's interpreter; consider `%pip install .`
# so the package lands in the environment this notebook actually runs in.
!python3 setup.py install

running install
running bdist_egg
running egg_info
writing outrageclf.egg-info/PKG-INFO
writing dependency_links to outrageclf.egg-info/dependency_links.txt
writing requirements to outrageclf.egg-info/requires.txt
writing top-level names to outrageclf.egg-info/top_level.txt
reading manifest file 'outrageclf.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'outrageclf.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
copying outrageclf/helpers.py -> build/lib/outrageclf
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/outrageclf
copying build/lib/outrageclf/__init__.py -> build/bdist.linux-x86_64/egg/outrageclf
copying build/lib/outrageclf/helpers.py -> build/bdist.linux-x86_64/egg/outrageclf
copying build/lib/outrageclf/model_architect.py -> build/bdist.linux-x86_64/egg/outrageclf
copying build/lib/outrageclf/classifier.py -> build/bdist.linux-x86_64/egg/outrag

**Running the wrapper function**

In [10]:
# Two artefacts are required to run the classifier: a Keras model file (.h5)
# and a joblib embedding (tokenizer) file.
# Contact the Crockett lab to obtain these model files, then point the
# paths below at your local copies.
model_url = "/31k.h5"
embedding_url = "/31k.joblib"

In [11]:
# These tweets were written purely for demonstration; they are not taken
# from, and do not represent, any tweets in the actual training data.
tweets = [
    "This topic infuriates me because it violates my moral stance",
    "This is just a super-normal topic #normal",
    "The type of football they play today is atrocious",
]

In [12]:
# Run the whole pipeline in a single call. Judging from the messages printed
# below, the wrapper loads the tokenizer and the model from the two paths; it
# returns one score per input tweet, displayed as this cell's output.
from outrageclf.classifier import pretrained_model_predict
pretrained_model_predict(tweets, embedding_url, model_url)

Loaded pre-trained tokenizer at: 31k.joblib
Loaded pretrained model at: 31k.h5


array([[9.9660861e-01],
       [4.0077552e-04],
       [6.3920277e-01]], dtype=float32)

**A peek into the model**

This section gives you a closer look at each step performed inside `pretrained_model_predict`

In [13]:
from outrageclf.preprocessing import WordEmbed, get_lemmatize_hashtag
from outrageclf.classifier import _load_crockett_model

In [14]:
# Load the pre-trained tokenizer and the pre-trained classifier from the
# paths configured earlier in the notebook.
# NOTE(review): `_get_pretrained_tokenizer` and `_load_crockett_model` are
# underscore-prefixed, so they are presumably private helpers of outrageclf
# and may change between releases; confirm before relying on them.
word_embed = WordEmbed()
word_embed._get_pretrained_tokenizer(embedding_url)
model = _load_crockett_model(model_url)

Loaded pre-trained tokenizer at: 31k.joblib


In [15]:
# The tweets are lemmatized, then each lemmatized text is converted into a
# fixed-length numeric vector.
# NOTE(review): the vectors printed further down are integer ids followed by
# zeros, 50 entries long, so "50-d" appears to mean a zero-padded sequence of
# token ids rather than a dense 50-dimensional embedding; confirm.
lemmatized_text = get_lemmatize_hashtag(tweets)
embedded_vector = word_embed._get_embedded_vector(lemmatized_text)

In [16]:
# Show each tweet next to its lemmatized form and the vector fed to the model.
for position, original in enumerate(tweets):
    lemma = lemmatized_text[position]
    vector = embedded_vector[position]
    print("Original tweet:", original)
    print("Lemmatize text:", lemma)
    print("50-d embedded vector:", vector)

Original tweet: This topic infuriates me because it violates my moral stance
Lemmatize text: topic infuriate violate moral stance 
50-d embedded vector: [1760 2401 1705  611 3121    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Original tweet: This is just a super-normal topic #normal
Lemmatize text: super normal topic #normal
50-d embedded vector: [1427 2033 1760 2033    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Original tweet: The type of football they play today is atrocious
Lemmatize text: type football play today atrocious 
50-d embedded vector: [ 958 2308  250   93 3486    0    0    0    0    0    0    0    0    0
   

In [17]:
# The model then makes its predictions using embedded_vector as input.
# `predict` holds one entry per tweet; the next cell prints each entry as the
# predicted probability of outrage.
predict = model.predict(embedded_vector)

In [18]:
# Report the model's output for every tweet, separated by blank lines.
for position, original in enumerate(tweets):
    outrage_probability = predict[position]
    print("Original tweet:", original)
    print("Predicted probability of outrage:", outrage_probability)
    print("\n")

Original tweet: This topic infuriates me because it violates my moral stance
Predicted probability of outrage: [0.9966086]


Original tweet: This is just a super-normal topic #normal
Predicted probability of outrage: [0.00040078]


Original tweet: The type of football they play today is atrocious
Predicted probability of outrage: [0.6392028]


