In [84]:
import spacy
import os
import json
import pandas as pd 
import re
from datetime import datetime
import numpy as np

# ---------------------------------------------------------------------------
# Notebook configuration: input file names, directories, label names, model.
# ---------------------------------------------------------------------------

# Input file names (both are resolved against the data directory below).
raw_n = 'tpb_tweets_filtered_20220328.json'
in_n = 'tpb_contextcat_20220523.jsonl'

# Directories and full paths, all relative to the notebook's working directory.
data_p = os.path.join('..', 'data')
data_raw_p, data_in_p = (os.path.join(data_p, file_name) for file_name in (raw_n, in_n))
model_path = os.path.join('..', 'models', 'tpb_labeller', 'model-best')

# Label column names: the old names and, position by position, their replacements.
# NOTE(review): 'Mobility in relation to Covid ' ends with a trailing space -
# confirm that this matches the intended column name exactly.
old_label_cols = [
    'Physical stuckness and Covid',
    'Pandemic precarity',
    'Blocked and derailed mobilities because of Covid',
    'New mobilities in relation to Covid',
    'Other',
]
new_label_cols = [
    'Physical stuckness and Covid',
    'Pandemic precarity',
    'Blocked and derailed mobilities in relation to Covid',
    'Mobility in relation to Covid ',
    'Other',
]

# old name -> new name
label_rename_map = {old: new for old, new in zip(old_label_cols, new_label_cols)}

# Load the trained spaCy pipeline once; predict_cat() reads it as a global.
nlp = spacy.load(model_path)

In [85]:
# loading data
# Both files are opened explicitly as UTF-8: JSON is UTF-8 by specification and
# the tweet texts contain non-ASCII characters, so relying on the platform's
# default encoding can fail or silently corrupt the text.
with open(data_raw_p, 'r', encoding='utf-8') as f:
    data = json.load(f)

# raw tweets, one row per tweet id
df_raw = pd.DataFrame.from_records(data).drop_duplicates('id')

# The second file holds one JSON object per line; blank lines are skipped so a
# stray empty line does not abort the load.
with open(data_in_p, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df_in = pd.DataFrame.from_records(data)

# recreate id from link
# expand=False returns a Series for the single capture group. The cast is pinned
# to int64 because tweet ids do not fit in 32 bits and a plain 'int' is only
# 32 bits wide on some platforms.
id_regex = re.compile(r'(?<=/)(\d{15,20})$')
df_in['id'] = df_in['tweet_link'].str.extract(id_regex, expand=False).astype('int64')

# tidy - one row per label
df_in = df_in.explode('accept').reset_index(drop=True)

In [86]:
# Preview the first five rows of the exploded frame (one row per accepted label).
df_in.head(n=5)

Unnamed: 0,id,username,text,tweet_link,_input_hash,_task_hash,options,_view_id,config,accept,answer,_timestamp,_annotator_id,_session_id
0,1468279014120845313,AJEnglish,Part of one of the world’s oldest surviving wo...,https://twitter.com/AJEnglish/status/146827901...,-1538745346,368596328,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},,ignore,1650632594,tpb_contextcat-eva,tpb_contextcat-eva
1,1468225220125331467,AJEnglish,"""Funds from the EU and member states, sometime...",https://twitter.com/AJEnglish/status/146822522...,1669483179,2076458894,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},The backdrop of physical stuckness,accept,1650632638,tpb_contextcat-eva,tpb_contextcat-eva
2,1468225220125331467,AJEnglish,"""Funds from the EU and member states, sometime...",https://twitter.com/AJEnglish/status/146822522...,1669483179,2076458894,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},Context of blocked and derailed mobilities,accept,1650632638,tpb_contextcat-eva,tpb_contextcat-eva
3,1468194076738600971,InfoMigrants,The Council of Europe has decided to shelve a ...,https://twitter.com/InfoMigrants/status/146819...,54907186,-26033530,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},Context of blocked and derailed mobilities,accept,1650632654,tpb_contextcat-eva,tpb_contextcat-eva
4,1468139466946535425,AJEnglish,Libyan authorities get support from the EU to ...,https://twitter.com/AJEnglish/status/146813946...,-1780393086,1232472989,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},Context of blocked and derailed mobilities,accept,1650632663,tpb_contextcat-eva,tpb_contextcat-eva


In [87]:
# Flag rows by whether they were recorded before the model was trained
# (trained on 29 April 2022).
#   training = 1 -> `_timestamp` is before the cutoff
#                   (NOTE: unknown whether such a row was in the train or test set)
#   training = 0 -> otherwise
# NOTE(review): the ISO string carries no UTC offset, so .timestamp() interprets
# it in the local timezone of the machine running the notebook - confirm that
# this matches the clock used for `_timestamp`.
filter_timestamp = datetime.fromisoformat('2022-04-29T23:59:59').timestamp()

df_in['training'] = (df_in['_timestamp'] < filter_timestamp).astype('int64')

In [88]:
def predict_cat(text):
    """Return the highest-scoring category the model assigns to ``text``.

    Parameters
    ----------
    text : str
        Text to classify; it is passed to the module-level ``nlp`` pipeline.

    Returns
    -------
    dict
        ``{'predict_cat': <label>, 'predict_score': <score>}`` for the label
        with the largest score in ``doc.cats``. When ``doc.cats`` is empty the
        label is ``None`` and the score is NaN.
    """
    predicted_cats = nlp(text).cats

    # No scores at all: return an explicit "no prediction" instead of letting
    # max() raise ValueError on an empty sequence.
    if not predicted_cats:
        return {'predict_cat': None, 'predict_score': float('nan')}

    # Single pass over (label, score) pairs; ties resolve to the first label in
    # iteration order, exactly as max(dict, key=dict.get) would.
    best_cat, best_score = max(predicted_cats.items(), key=lambda item: item[1])
    return {'predict_cat': best_cat, 'predict_score': best_score}

In [54]:
# Spot check: classify the text of the record at position 9 of `data`.
# NOTE(review): `data` is rebound twice in the loading cell, so the record used
# here depends on which cells have been run - confirm after a fresh run.
sample_record = data[9]
predict_cat(sample_record.get('text'))

{'predict_cat': 'physical stuckness', 'predict_score': 0.5388820171356201}

In [89]:
# Classify every row's text and attach the top category and its score.
# The columns are attached positionally with assign(), which overwrites columns
# of the same name. Re-running this cell therefore refreshes the predictions
# instead of producing suffixed duplicates (predict_cat_x / predict_cat_y), and
# the result does not depend on df_in having a 0..n-1 index.
predictions = pd.DataFrame(
    df_in['text'].apply(predict_cat).tolist(),
    columns=['predict_cat', 'predict_score'],
)
df_in = df_in.assign(
    predict_cat=predictions['predict_cat'].to_numpy(),
    predict_score=predictions['predict_score'].to_numpy(),
)

# Rows without an accepted label get no prediction.
df_in.loc[df_in['accept'].isna(), ['predict_cat', 'predict_score']] = np.nan

In [90]:
# Preview the first five rows, now including the training flag and predictions.
df_in.head(n=5)

Unnamed: 0,id,username,text,tweet_link,_input_hash,_task_hash,options,_view_id,config,accept,answer,_timestamp,_annotator_id,_session_id,training,predict_cat,predict_score
0,1468279014120845313,AJEnglish,Part of one of the world’s oldest surviving wo...,https://twitter.com/AJEnglish/status/146827901...,-1538745346,368596328,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},,ignore,1650632594,tpb_contextcat-eva,tpb_contextcat-eva,1,,
1,1468225220125331467,AJEnglish,"""Funds from the EU and member states, sometime...",https://twitter.com/AJEnglish/status/146822522...,1669483179,2076458894,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},The backdrop of physical stuckness,accept,1650632638,tpb_contextcat-eva,tpb_contextcat-eva,1,The backdrop of physical stuckness,0.975078
2,1468225220125331467,AJEnglish,"""Funds from the EU and member states, sometime...",https://twitter.com/AJEnglish/status/146822522...,1669483179,2076458894,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},Context of blocked and derailed mobilities,accept,1650632638,tpb_contextcat-eva,tpb_contextcat-eva,1,The backdrop of physical stuckness,0.975078
3,1468194076738600971,InfoMigrants,The Council of Europe has decided to shelve a ...,https://twitter.com/InfoMigrants/status/146819...,54907186,-26033530,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},Context of blocked and derailed mobilities,accept,1650632654,tpb_contextcat-eva,tpb_contextcat-eva,1,Context of blocked and derailed mobilities,0.977045
4,1468139466946535425,AJEnglish,Libyan authorities get support from the EU to ...,https://twitter.com/AJEnglish/status/146813946...,-1780393086,1232472989,"[{'id': 'The backdrop of physical stuckness', ...",choice,{'choice_style': 'multiple'},Context of blocked and derailed mobilities,accept,1650632663,tpb_contextcat-eva,tpb_contextcat-eva,1,Context of blocked and derailed mobilities,0.978058


In [104]:
# Evaluation subset: rows not flagged as training data (training == 0) whose top
# prediction scored at least 0.5. Rows with a missing score fail the comparison
# and are therefore left out.
is_unseen = df_in['training'].eq(0)
is_confident = df_in['predict_score'].ge(0.5)
df_select = df_in[is_unseen & is_confident]

In [105]:
# Number of selected rows per predicted category.
df_select.loc[:, 'predict_cat'].value_counts()

Existing precarity and vulnerabilities        287
New mobilities and migratory routes           223
Context of blocked and derailed mobilities    164
Other                                          37
The backdrop of physical stuckness             22
Name: predict_cat, dtype: int64

In [106]:
# Mean prediction score within each predicted category.
df_select['predict_score'].groupby(df_select['predict_cat']).mean()

predict_cat
Context of blocked and derailed mobilities    0.715056
Existing precarity and vulnerabilities        0.722019
New mobilities and migratory routes           0.756711
Other                                         0.655290
The backdrop of physical stuckness            0.680670
Name: predict_score, dtype: float64

In [107]:
# Number of selected rows per accepted label, for comparison with the
# predicted-category counts.
df_select.loc[:, 'accept'].value_counts()

Existing precarity and vulnerabilities        228
Context of blocked and derailed mobilities    167
New mobilities and migratory routes           159
The backdrop of physical stuckness             92
Other                                          87
Name: accept, dtype: int64