In [None]:
import sys
!{sys.executable} -m pip install tweepy==3.9.0 pymongo

In [None]:

import time
import torch
import tweepy
import swifter
import pandas as pd
from Scripts import Data2Cox
from Scripts import DefThreshold
from Scripts import MongodbInit
from Scripts import DataCollection
from Scripts import ApplyClassifier
from Scripts import ChooseTransformer
from Scripts import TimelineCollection
from Scripts import TimelineSimilarities
from Scripts import TimelinePreprocessing
from Scripts import TrainPersonalClassifier
import torch.nn.functional as F
from lifelines import CoxPHFitter
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from nltk.tokenize import TweetTokenizer
from transformers import TrainingArguments, AutoModelForSequenceClassification

## Connection to MongoDB

In [None]:
# Build the project's MongoDB helper and open the connection.
# `db` is the database handle that the later cells use to read and write
# collections (db.tweets_collection, db.test_timeline, db.test_prep, ...).
# NOTE(review): connection settings are presumably read inside MongodbInit — confirm.
initMongo = MongodbInit.MongoInitialisation()
db = initMongo.mongodb_connection()

## Initialisation and data collection based on keywords

In [None]:
# Initialise the connection to Twitter using the tokens generated on the Twitter API
# and filled in the config file
initCollection = DataCollection.InitialisationTwitter()
auth = initCollection.connection_to_twitter()

# Load the keywords filled in the config file
keywords = initCollection.keywords

# Build the REST client WITH the credentials obtained above. The client was previously
# created without an auth handler, so any authenticated call made through it by the
# listener would have been rejected. wait_on_rate_limit makes tweepy sleep instead of
# raising when a rate limit is hit.
api = tweepy.API(auth, wait_on_rate_limit=True)

# The listener receives the database handle, the credentials and the REST client;
# the stream itself is opened with the same credentials.
streamlistener = DataCollection.StreamListener(db, auth, api)
streamer = tweepy.Stream(auth=auth, listener=streamlistener)
time.sleep(2)
print("Tracking: " + str(keywords))

# filter() blocks while the stream is open; the loop re-opens the stream each time it
# returns. This keeps collecting tweets for as long as the cell is running: interrupt
# the cell (or add a stopping condition) to collect a limited number of tweets.
while True:
    streamer.filter(track=keywords, stall_warnings=True)

## Train personal / own diabetes classifier using labelled data

In [None]:
# Read the manually labelled tweets (used to identify users tweeting about their own
# experience). The label column has to be called "label" for the next step, so the
# "labels" column is renamed right after loading.
dataset = pd.read_csv("Own_diabetes_manually_labeled_tweets.csv").rename(columns={"labels": "label"})
initTrain = TrainPersonalClassifier.DataToTrain(TweetTokenizer)

# Prepare the encodings/labels of the three splits and wrap each split in a dataset object
(
    train_encodings, train_labels,
    val_encodings, val_labels,
    test_encodings, test_labels,
) = initTrain.PrepareData(dataset)
train_dataset = TrainPersonalClassifier.TweetDataSet(train_encodings, train_labels)
val_dataset = TrainPersonalClassifier.TweetDataSet(val_encodings, val_labels)
test_dataset = TrainPersonalClassifier.TweetDataSet(test_encodings, test_labels)

# Size of each split, one per line (train, validation, test)
print(len(train_dataset), len(val_dataset), len(test_dataset), sep="\n")

# NOTE(review): this binds a plain Python variable; it does not set the
# CUDA_LAUNCH_BLOCKING environment variable. Kept as-is to preserve behaviour.
CUDA_LAUNCH_BLOCKING = 1

# Training hyper-parameters; they can be modified
training_args = TrainingArguments(
    output_dir='./results',            # where outputs are written
    num_train_epochs=10,               # number of passes over the training set
    per_device_train_batch_size=64,    # training batch size per device
    per_device_eval_batch_size=32,     # evaluation batch size per device
    warmup_steps=500,                  # warmup steps of the learning-rate scheduler
    weight_decay=0.01,                 # weight-decay strength
    logging_dir='./logs',              # where logs are written
    logging_steps=10,
)

# Pretrained Hugging Face checkpoint that gets fine-tuned
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base")

# Fine-tune on the train split (validation split passed to the trainer), then
# evaluate on the held-out test split
modeltrainer = TrainPersonalClassifier.ModelTrainer()
trainer = modeltrainer.trainer(training_args, model, train_dataset, val_dataset)
trainer.train()
eval_output = trainer.evaluate(test_dataset)

# Show the model performances
print(eval_output)

# Persist the fine-tuned model; the classification cell reloads it from this folder
trainer.save_model("Personal_experience_model")

## Apply classifier to collected tweets

In [None]:
# Load collected tweets into a pandas dataframe
tweets = pd.DataFrame(db.tweets_collection.find())

# Initialize data object to preprocess tweets
init_data = ApplyClassifier.Data(TweetTokenizer)

# Determine device for model (GPU if available, else CPU)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("device: {}".format(device))

# Load the base tokenizer and the fine-tuned sequence-classification model
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
model = AutoModelForSequenceClassification.from_pretrained("Personal_experience_model").to(device)

# The model was fine-tuned on English tweets only, so keep English tweets.
# .copy() makes the filtered frame independent, so adding the "personal" column below
# does not write into a slice of the original frame (SettingWithCopyWarning).
tweets = tweets[tweets["lang"] == "en"].copy()
print(tweets.shape)

# Encode the normalised tweet text, truncating and padding so that batches of
# equal-length sequences can be fed to the model
tweets_encodings = tokenizer(
    tweets.text.map(init_data.normalizeTweet).values.tolist(),
    truncation=True,
    padding=True,
    return_tensors="pt",
).to(device)

# Create a dataset object from the encoded tweets
tweetDataSet = ApplyClassifier.TweetDataSet(tweets_encodings)

# Set model to evaluation mode and create a dataloader for the tweet dataset
model.eval()
tweetsLoader = DataLoader(tweetDataSet, batch_size=32)
print("len tweetsLoader: {}".format(len(tweetsLoader)))

# Predict the label of every tweet. Per-batch results are collected in a list and
# concatenated once: Series.append in a loop is quadratic and no longer exists in
# pandas >= 2.0.
predicted_batches = []
with torch.no_grad():  # inference only: do not build the autograd graph
    for (i, batch) in enumerate(tweetsLoader):
        if i % 2000 == 0:
            print(i)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # outputs[0] holds the logits (batch, n_classes); normalise over the class axis
        proba = F.softmax(outputs[0], dim=1).cpu().numpy()
        # get predicted class from the probabilities of each row
        predicted_labels = pd.DataFrame(proba).apply(tweetDataSet.proba_to_category, axis=1)
        predicted_batches.append(predicted_labels)

if predicted_batches:
    predicted = pd.concat(predicted_batches, ignore_index=True)
else:
    # pd.concat refuses an empty list; fall back to an empty result
    predicted = pd.Series(dtype="int64")

# Print the counts of each label, then keep only the tweets predicted as 1
# (users talking about their own diabetes)
print("predicted: {}".format(predicted.shape))
print(predicted.value_counts())
tweets["personal"] = predicted.values
tweets_personal = tweets[tweets["personal"] == 1]
print("personal tweets: {}".format(tweets_personal.shape))

# Keep only the columns needed downstream (tweet id and user object); selecting them
# also discards MongoDB's "_id", so no separate drop is needed
tweets_personal = tweets_personal[["id", "user"]]

# Write personal tweets to a CSV file
#tweets_personal.to_csv("personal_tweets.csv")


## Timelines collection of identified users

In [None]:
#Load class to collect timelines
# REST client used for timeline collection, built from the `auth` handler created in
# the data collection cell.
# NOTE(review): unlike the streaming cell's client, this one is created without
# wait_on_rate_limit — confirm that TimelineCollection handles rate limits itself.
api_timeline = tweepy.API(auth)
# Collector object; it receives the MongoDB handle and the REST client.
timelinecol = TimelineCollection.TimelineCollection(db, api_timeline)

In [None]:
# Author id of every personal tweet. A user who posted several personal tweets appears
# once per tweet, so ids are de-duplicated (first-seen order kept) to avoid checking
# and collecting the same account several times.
list_ids = list(dict.fromkeys(user["id"] for user in tweets_personal["user"]))

# Keep only the ids for which the collector reports that the account still exists
list_existingids = [userid for userid in list_ids if timelinecol.exists(userid)]
print(len(list_existingids))

# Collect the timelines of the remaining users; the preprocessing cell reads them back
# from the test_timeline collection in MongoDB
timelinecol.collect_timelines(list_existingids)

## Preprocessing of users timelines

In [None]:
# Define keywords that need to be matched in tweets for preprocessing
keywords_to_match = ["t1d", "t2d", "insulin", "gbdoc", "dsma", "type 1", "type 2", "diabetic", "diabete", "blood glucose", "fingerprick"]

# Key/main concept
dict_key_concept = {"Diabetes" : "diabetes"}

# Initialize a TimelinePreprocessing object with the keyword-to-concept mapping and the list of keywords to match
init_preprocess = TimelinePreprocessing.PreprocessTimeline(dict_key_concept, keywords_to_match)

# List of ids of timelines that were collected
list_ids = list(db.test_timeline.distinct("user.id"))

# Loop through the list of user ids and preprocess each user's timeline.
# (The loop used to be wrapped in tqdm, which is never imported in this notebook and
# therefore raised a NameError on a fresh kernel.)
for userid in list_ids:

    # Load the user's timeline from the "test_timeline" collection
    timeline = pd.DataFrame(db.test_timeline.find({"user.id": userid}))

    # Get the full text of each tweet in the timeline and add it to a new "text" column
    timeline["text"] = timeline.apply(init_preprocess.get_full_text, axis=1)

    # Remove URLs, RTs, and mentions from the tweet text
    timeline = init_preprocess.remove_url_rts_mentions(timeline)

    # Remove tweets that are too short (threshold 7; presumably a count of non-stopwords)
    timeline = init_preprocess.remove_empty_tweets(timeline, 7)

    # Nothing left for this user: skip it. insert_many() rejects an empty list of
    # documents, so continuing would abort the whole loop.
    if timeline.empty:
        continue

    # Format the tweet dates
    timeline = init_preprocess.format_date(timeline)

    # Remove contractions; the result is stored in a new "prep" column
    timeline["prep"] = timeline.apply(init_preprocess.remove_contractions, axis=1)

    # Save timeline in new collection
    db.test_prep.insert_many(timeline.to_dict('records'))

## Try different transformer models and keep the best one

In [None]:
# Choose the best transformer: try each candidate model on 1000 preprocessed tweets
# and compare the results to decide which one to keep
df = pd.DataFrame(db.test_prep.find()[:1000])
available_models = ["all-mpnet-base-v2"]
# Other candidates that can be added to the list above (more can be added):
# 'stsb-mpnet-base-v2', 'stsb-roberta-base-v2', 'stsb-distilroberta-base-v2', 'nli-mpnet-base-v2',
# 'stsb-roberta-large', 'nli-roberta-base-v2', 'stsb-roberta-base', 'stsb-bert-large', 'stsb-distilbert-base',
# 'stsb-bert-base', 'nli-distilroberta-base-v2', 'paraphrase-xlm-r-multilingual-v1', 'paraphrase-distilroberta-base-v1', 'nli-bert-large',
# 'nli-distilbert-base', 'nli-roberta-large', 'nli-bert-large-max-pooling', 'nli-bert-large-cls-pooling', 'nli-distilbert-base-max-pooling', 'nli-roberta-base',
# 'nli-bert-base-max-pooling', 'nli-bert-base', 'nli-bert-base-cls-pooling', 'average_word_embeddings_glove.6B.300d', 'average_word_embeddings_komninos',
# 'average_word_embeddings_levy_dependency', 'average_word_embeddings_glove.840B.300d', "sentence-t5-base"

# The tester is built from the concepts/keywords file
modeltest = ChooseTransformer.TestModel("dict_concepts_keywords.txt")

modeltest.try_models(df, available_models) # check the values that are reported
# Values shouldn't always be very high or very low.
# Check whether related tweets have higher similarities for some concepts than for others.

## Define the different thresholds from preprocessed tweets using the chosen model

In [None]:
# Take 20000 tweets from the preprocessed timelines and compute the similarities
# between these tweets and the concepts with the chosen model.
# Per the original note, this reports the highest similarities, which have to be
# screened manually to decide from which score a match is good enough.
threshold_df = pd.DataFrame(db.test_prep.find()[:20000])
init_threshold = DefThreshold.DefineThreshold(threshold_df, "all-mpnet-base-v2")
init_threshold.apply_similarities()

## End preprocessing by defining t0 for each user and deleting everything that was published before

In [None]:
# t0 is the first time the user is tweeting about the disease/topic of interest.
# Here, it is the first time the user tweets about diabetes.
# For each timeline, we check when the threshold regarding diabetes is exceeded for the
# first time, or when a diabetes-related keyword is used for the first time.
list_ids = list(db.test_prep.distinct("user.id"))
for userid in list_ids:
    timeline = pd.DataFrame(db.test_prep.find({"user.id": userid}))
    # 0.33 is the similarity threshold passed to define_t0 for the chosen model
    timeline = init_preprocess.define_t0(timeline, 0.33, "all-mpnet-base-v2")
    try:
        records = timeline.to_dict('records')
        # insert_many() rejects an empty list, so users with nothing left are skipped
        # explicitly instead of relying on a swallowed exception
        if records:
            db.test_prep_2.insert_many(records)
    except Exception as err:
        # Still best-effort (one failing user does not stop the loop), but the failure
        # is reported instead of silently ignored; KeyboardInterrupt is no longer
        # swallowed, so the cell can be interrupted.
        print("Skipping user {}: {!r}".format(userid, err))

## Apply similarities between all timelines and an exposure/outcome pair of concepts and prepare a cohort-like table

In [None]:
# Select an exposure and an outcome in the list of concepts
exposure = "Comorbidities"
outcome = "Mental health"
# Load the user ids of the timelines that remain after the t0 step (test_prep_2 collection)
list_ids = db.test_prep_2.distinct("user.id")
# Similarity helper; it receives the database handle, the chosen model name and the user ids
SimInit = TimelineSimilarities.Similarities(db, "all-mpnet-base-v2", list_ids)

# Apply similarities between the tweets and the selected exposure and outcome.
# The returned table is kept as df_cohort for the next cells.
# NOTE(review): the original note says the similarities are also saved in a new
# MongoDB collection — confirm in TimelineSimilarities.
df_cohort = SimInit.apply_similarities(exposure, outcome)

In [None]:
# Unique user ids present in the cohort table (set() removes duplicates; order is not preserved)
list_ids = list(set(df_cohort.user.tolist()))
# Cox helper; it receives the database handle, the user ids and the lifelines CoxPHFitter class
init_cox = Data2Cox.PrepareCox(db, list_ids, CoxPHFitter)
# Per the original note: for each timeline, check whether the outcome and exposure
# selected previously appear, and build the table used by the Cox analysis
df_cohort_prep = init_cox.prepare_data(df_cohort)
print(df_cohort_prep.head())

In [None]:
# Run the Cox analysis on the prepared cohort table
# (per the original note, the results are printed by the helper)
init_cox.cox(df_cohort_prep)