In [1]:
# This notebook shows how the geo-tagged tweets were extracted and how the test and training
# files for the models were created

# First we will extract users with geo-tagged tweets and analyse them
import json
import csv
import reverse_geocoder as rg


# Location of the hydrated TweetsCOV19 dump (one JSON-encoded tweet per line)
pathHydrated = "C:/Users/dennis/Desktop/hydrated.jsonl"

data = {}               # user id_str -> number of geo-tagged tweets of that user
dUserCoordinates = {}   # user id_str -> list of coordinate tuples, one per geo-tagged tweet
countGeoTag = 0         # total number of geo-tagged tweets found
with open(pathHydrated, encoding="utf8", errors='ignore') as hydratedJSONL:
    count = 0
    for raw in hydratedJSONL:
        tweet = json.loads(raw)
        # Users are identified by "id_str"; the "screen_name" feature could be used instead
        user = tweet["user"]["id_str"]

        geo = tweet["geo"]
        if geo is not None:
            # The "geo" feature has priority whenever it is present
            coordinates = tuple(geo["coordinates"])
        else:
            # Otherwise fall back to the "place" feature, if it carries a bounding box
            place = tweet["place"]
            if place is None or place["bounding_box"] is None:
                continue
            # First corner of the bounding box; stored as long,lat -> swapped to lat,long
            corner = list(place["bounding_box"]["coordinates"][0][0])
            coordinates = (corner[1], corner[0])

        countGeoTag += 1
        # Remember every coordinate of the user ...
        dUserCoordinates.setdefault(user, []).append(coordinates)
        # ... and count the user's geo-tagged tweets
        data[user] = data.get(user, 0) + 1

In [None]:
# Analyse how many users we have with multiple tweets and the breakdown regarding their number of tweets

import math

# Reads `data` (user id -> number of geo-tagged tweets) and `countGeoTag`
# (total number of geo-tagged tweets), both produced by the extraction cell.

count = 0      # users with more than one geo-tagged tweet
maxValue = 0   # largest number of geo-tagged tweets of a single user

# Histogram buckets. Every user falls into exactly one bucket: the highest
# threshold the user's tweet count exceeds, or eq1 for exactly one tweet.
gt300 = 0
gt100 = 0
gt50 = 0
gt10 = 0
gt5 = 0
gt1 = 0
eq1 = 0        # must be initialised here, otherwise `eq1 += 1` raises NameError

for tweetCount in data.values():
    if tweetCount > maxValue:
        maxValue = tweetCount
    if tweetCount > 1:
        count += 1

    if tweetCount > 300:
        gt300 += 1
    elif tweetCount > 100:
        gt100 += 1
    elif tweetCount > 50:
        gt50 += 1
    elif tweetCount > 10:
        gt10 += 1
    elif tweetCount > 5:
        gt5 += 1
    elif tweetCount > 1:
        gt1 += 1
    elif tweetCount == 1:
        eq1 += 1


print("GeoTagged Tweets")
print("Number of Tweets : "+str(countGeoTag))
print("Number of Users : "+str(len(data)))
print("Number of Users with multiple  Tweets: "+str(count))
print("Max count of Tweets for a single User: "+str(maxValue)+"\n")

print("Distribution of User")
print("User     >    Tweets")
print(str(gt300) +" > 300")
print(str(gt100) +" > 100")
print(str(gt50) +" > 50")
print(str(gt10) +" > 10")
print(str(gt5) +" > 5")
print(str(gt1) +" > 1")
# Users with exactly one tweet were counted but never reported
print(str(eq1) +" = 1")

In [12]:
# Method to calculate most frequent city for ground truth data ( mobile users)
def most_frequent(listCity):
    """Return the most frequent element of ``listCity`` and where it first occurs.

    Parameters
    ----------
    listCity : list
        Non-empty list of values (e.g. city names or distances).

    Returns
    -------
    tuple
        ``(element, index)`` where ``element`` is the value that occurs most
        often and ``index`` is the position of its first occurrence in
        ``listCity``. NOTE: the second item is an index, not a count. If
        several values share the highest frequency, the one that appears
        first in the list wins.

    Raises
    ------
    IndexError
        If ``listCity`` is empty.
    """
    from collections import Counter

    if not listCity:
        raise IndexError("most_frequent() requires a non-empty list")

    try:
        # One pass instead of calling list.count() for every element (O(n) vs O(n^2))
        counts = Counter(listCity)
    except TypeError:
        counts = None  # unhashable elements, handled by the fallback below

    if counts is not None:
        # Counter keeps insertion order and max() returns the first maximum,
        # so ties are resolved in favour of the earliest element.
        freqCity = max(counts, key=counts.get)
        return freqCity, listCity.index(freqCity)

    # Fallback for unhashable elements: quadratic scan with identical semantics
    bestCount = 0
    freqCity = listCity[0]
    index = 0
    for position, element in enumerate(listCity):
        frequency = listCity.count(element)
        if frequency > bestCount:
            bestCount = frequency
            freqCity = element
            index = position
    return freqCity, index

In [None]:
# This Cell shows how ground truth was selected for users.

from geopy.distance import distance
import reverse_geocoder as rg

# Reads `dUserCoordinates` (user id -> list of coordinate tuples) from the
# extraction cell and `most_frequent` from the cell above.

userLabel = {}          # user id -> coordinate tuple chosen as ground truth
countUserDifCoord = 0   # kept from the original cell; not used below
dMultiCountry = {}      # kept from the original cell; not used below
dMultiTweet = []        # one average distance (km) per user whose tweets span several cities
length = len(dUserCoordinates.keys())

# Ground truth for users with a single tweet or with identical coordinates in all tweets
for user, value in dUserCoordinates.items():
    if len(value) == 1 or all(elem == value[0] for elem in value):
        userLabel[user] = value[0]

# Ground truth for users whose tweets carry different coordinates
processed = 0  # own progress counter; do not rely on a `count` left over from another cell
for user, value in dUserCoordinates.items():
    if len(value) > 1 and not all(elem == value[0] for elem in value):

        if processed % 100 == 0:
            print(processed)
        processed += 1

        # Reverse geocoding: results[i] is expected to describe value[i]
        results = rg.search(value)

        # One city name PER TWEET, in the same order as `value`. Keeping this
        # list aligned with `value` matters twice: the majority is counted over
        # tweets (not over distinct coordinates), and the index returned by
        # most_frequent() is a valid index into `value`.
        cityNames = [location["name"] for location in results]

        # Primary location: coordinate of the first tweet from the most frequent city
        fixCity, index = most_frequent(cityNames)
        userLabel[user] = value[index]

        # Nothing more to do if every tweet resolves to the same city
        # (index is 0 in that case, so the label is value[0])
        if all(name == fixCity for name in cityNames):
            continue

        # Distance from every distinct coordinate outside the primary city
        # to the primary location
        cityByCoord = dict(zip(value, cityNames))
        distances = [
            round(distance(coord, value[index]).km)
            for coord, name in cityByCoord.items()
            if name != fixCity
        ]

        # Average distance of this user's "away" coordinates
        if distances:
            avgDistance = int(sum(distances) / len(distances))
            dMultiTweet.append(avgDistance)

In [None]:
# This cell analyses mobile users

# Reads `dMultiTweet` (list of per-user average distances in km) from the
# ground-truth cell and `most_frequent` from the helper cell.

lDistances = sorted(dMultiTweet)
length = len(lDistances)

print("Number of Distances  is "+str(length))
if length == 0:
    # Avoids ZeroDivisionError / IndexError when no user tweeted from several cities
    print("No distances available - nothing to analyse")
else:
    avg = int(sum(lDistances) / length)
    print("Average is "+str(avg)+"km")

    # Middle element of the sorted list (the upper one for an even length)
    index = int(length / 2)
    median = lDistances[index]
    print("Median is "+str(median)+"km")

    # most_frequent() returns (value, index of first occurrence) - the second
    # item is NOT a count, so the number of occurrences is computed separately.
    mostFreqDistance, _firstIndex = most_frequent(lDistances)
    counter = lDistances.count(mostFreqDistance)
    print("Most Frequent distance is "+str(mostFreqDistance)+"km, "+str(counter)+" times ")

In [19]:
#Write GroundTruth Data for users to file

# Reads `userLabel` (user id -> coordinate tuple) from the ground-truth cell.

# Turn every coordinate tuple into a plain "first second" string, i.e. the
# tuple without brackets and commas, e.g. (41.78, -95.28) -> "41.78 -95.28".
# Later cells split this string on the blank again.
newUserLabel = {}
for x, y in userLabel.items():
    newUserLabel[x] = " ".join(str(component) for component in y)

# Path to save groundtruth data to CSV.
# The path gets its own name: the original cell rebound `userLabel` to this
# string, which destroyed the dict and made the cell fail when run a second time.
pathUserLabel = "C:/Users/dennis/Desktop/dataTestTrain/userLabel.csv"
with open(pathUserLabel, 'w', newline="", encoding="utf-8") as file:
    # One row per user: <user id>;<coordinates separated by a blank>
    writer = csv.writer(file, delimiter=";")
    for x, y in newUserLabel.items():
        writer.writerow([x, y])

In [None]:
# Split user ids which have geotagged tweets in random training / test sets. 80/20 %
import numpy as np
from sklearn.model_selection import train_test_split

# train_test_split selects samples by integer position, which a dict keyed by
# user id does not support, so the split is done on an explicit list of ids.
# The ids are taken from `dUserCoordinates`; it has the same keys as the
# per-user tweet counter and is not rebound by any other cell.
userIds = list(dUserCoordinates.keys())
train, test = train_test_split(userIds, test_size=0.2, random_state=42)


# Write test / train user ids in csv
# Path to save the user ids into CSV for training or test dataset
trainFile = "C:/Users/dennis/Desktop/dataTestTrain/trainUser.csv"
testFile = "C:/Users/dennis/Desktop/dataTestTrain/testUser.csv"

# The files are opened with 'w' (not 'a') so that re-running the cell replaces
# the split instead of appending every id a second time. The handles get their
# own names so the path variables are not overwritten by file objects.

# Train users
with open(trainFile, 'w', newline="", encoding="utf-8") as trainHandle:
    writer = csv.writer(trainHandle, delimiter=";")
    for userId in train:
        writer.writerow([userId])
# Test users
with open(testFile, 'w', newline="", encoding="utf-8") as testHandle:
    writer = csv.writer(testHandle, delimiter=";")
    for userId in test:
        writer.writerow([userId])

In [9]:
# IMPORTANT FOR GeoLocation 2400WORLD DATA FILE
# GeoLocation needs a file containing coordinates which matches the users ground truth
# to create the spatial grid for GeoLoc text.
# Instead of the default file, we need to use the coordinates from our ground truth

import csv

# Path to ground truth data (rows of the form <user id>;<coordinates separated by a blank>)
pathLabels = "C:/Users/dennis/Desktop/dataTestTrain/userLabel.csv"

# We will extract only the coordinates
trainingLabel = []
with open(pathLabels, encoding="utf8", errors='ignore') as file:
    for x in file:
        # The ground-truth file is written with ";" as delimiter, so it has to
        # be split on ";" as well (splitting on a tab never yields a second field).
        fields = x.split(";")
        if len(fields) < 2:
            continue  # blank or malformed line
        # "first second\n" -> "first,second\n"
        coord = fields[1].replace(" ", ",")
        trainingLabel.append(coord)

# Path to save the coordinates
pathCoordinates = "C:/Users/dennis/Desktop/dataTestTrain/2400_median_clustered_world.train"
with open(pathCoordinates, 'w', newline="", encoding="utf-8") as trainingFile:
    writer = csv.writer(trainingFile, delimiter="\t", escapechar=' ', quoting=csv.QUOTE_NONE)
    for x in trainingLabel:
        # The trailing newline of each entry is turned into a blank, exactly as before
        writer.writerow([x.replace('\n', " ")])

In [None]:
############## Creating Train / Test input datasets of users in format for GeoLocation ##############################

import json
import csv
import reverse_geocoder as rg

# Builds one line per selected user: <user id> TAB <coord 0> TAB <coord 1> TAB
# followed by all geo-tagged tweet texts of that user, each prefixed by " ||| ".
# Reads `newUserLabel` (user id -> "coord0 coord1" string) from the cell that
# writes the ground truth file.

#"train" or "test"
usage = "train"

# Path for hydrated TweetsCOV19
pathHydrated = "C:/Users/dennis/Desktop/hydrated.jsonl"

# Path for CSV with training user ids or testing user ids
pathUsers = "C:/Users/dennis/Desktop/dataTestTrain/"+usage+"User.csv"
users = []
with open(pathUsers, encoding="utf8", errors='ignore') as file:
    for x in file:
        users.append(x.replace("\n", ""))

# Membership is tested once per tweet; a set makes that a constant-time lookup
userSet = set(users)

# user id -> [line header, text of tweet 1, text of tweet 2, ...]
labelParts = {}
with open(pathHydrated, encoding="utf8", errors='ignore') as hydratedJSONL:
    for raw in hydratedJSONL:
        tweet = json.loads(raw)
        user = tweet["user"]["id_str"]
        text = tweet["full_text"]
        geo = tweet["geo"]
        place = tweet["place"]

        # Skip tweets without any location information
        if place is None and geo is None:
            continue
        # Skip users that are not part of the selected split
        if user not in userSet:
            continue
        # Without "geo", the "place" feature only counts if it has a bounding box
        if geo is None and place["bounding_box"] is None:
            continue

        if user not in labelParts:
            # Ground truth coordinates of the user, stored as "coord0 coord1"
            coord = newUserLabel[user].split(" ")
            labelParts[user] = [user+"\t"+coord[0]+"\t"+coord[1]+"\t"]
        labelParts[user].append(text)

# Join once per user instead of growing the string tweet by tweet
labels = {user: " ||| ".join(parts) for user, parts in labelParts.items()}


# Path to save the dataset
pathDataset = "C:/Users/dennis/Desktop/dataTestTrain/geolocationUser."+usage
with open(pathDataset, 'w', newline="", encoding="utf-8") as new:
    writer = csv.writer(new, delimiter="\t", escapechar=' ', quoting=csv.QUOTE_NONE)
    for line in labels.values():
        # Line breaks inside tweets are removed so every user stays on one line
        writer.writerow([line.replace("\n", "")])

In [None]:
# First we will select ground truth data for DeepGeo and create the corresponding file with coordinates

import json,csv

# Path for hydrated TweetsCOV19
pathHydrated = "C:/Users/dennis/Desktop/hydrated.jsonl"


users = []
groundTruthTweet = {}   # tweet id_str -> coordinates (first item latitude, second longitude)

# Path for csv with userid's which are in training split
pathTrainUser = "C:/Users/dennis/Desktop/dataTestTrain/trainUser.csv"
# The original cell opened the undefined names `fileTrainUser` and `hydrated`
# and filled the undefined dict `groundTruthTweet`; all three raised NameError.
with open(pathTrainUser, encoding="utf8", errors='ignore') as file:
    for x in file:
        users.append(x.replace("\n", ""))

# Membership is tested once per tweet; a set makes that a constant-time lookup
userSet = set(users)

with open(pathHydrated, encoding="utf8", errors='ignore') as hydratedJSONL:

    print("Starting Reading JSON File")
    for raw in hydratedJSONL:
        tweet = json.loads(raw)
        tweetId = tweet["id_str"]
        user = tweet["user"]["id_str"]
        geo = tweet["geo"]
        place = tweet["place"]

        # Skip tweets without any location information
        if place is None and geo is None:
            continue
        # Skip users outside the training split
        if user not in userSet:
            continue

        if geo is not None:
            # Parameter Geo has priority
            groundTruthTweet[tweetId] = geo["coordinates"]
        elif place["bounding_box"] is not None:
            # Parameter Place: first corner of the bounding box, swapped from long,lat to lat,long
            corner = list(place["bounding_box"]["coordinates"][0][0])
            groundTruthTweet[tweetId] = (corner[1], corner[0])


# DeepGeo models accepts ground truth data for training in a separate file with following form
#{"tweet_id": "547240490022617088", "tweet_city": "omaha-ne055-us",
#"tweet_latitude": "41.782947", "tweet_longitude": "-95.287009"}
# Since we are using geo-tagged tweets we will also put the ground truth coordinates in the field "tweet_city"

# Path for ground truth input data to train deepGeo models
pathGroundTruth = "C:/Users/dennis/Desktop/dataTestTrain/deepgeo/validTweet/label.tweet.json"
# NOTE: the file is opened in append mode, as in the original cell, so running
# this cell twice writes every record twice - delete the file before a re-run.
with open(pathGroundTruth, 'a') as file:
    for tweetId, coord in groundTruthTweet.items():
        lat = coord[0]
        long = coord[1]

        # Own name for the record so the global `data` is not overwritten
        record = {}
        record["tweet_id"] = tweetId
        record["tweet_city"] = str(lat)+","+str(long)
        record["tweet_latitude"] = str(lat)
        record["tweet_longitude"] = str(long)

        json.dump(record, file)
        file.write('\n')

In [None]:
# Extracting input data with needed features from metadata for training and testing datasets

import json,csv

# Select dataset
usage = "test"

#Path hydrated TweetsCOV19
pathHydrated = "C:/Users/dennis/Desktop/hydrated.jsonl"

# Read user id's if tweets are for training of testing
users = []
pathUser = "C:/Users/dennis/Desktop/dataTestTrain/"+usage+"User.csv"
with open(pathUser, encoding="utf8", errors='ignore') as file:
    for x in file:
        users.append(x.replace("\n", ""))

# Membership is tested once per tweet; a set makes that a constant-time lookup
userSet = set(users)


def writeToJson(data):
    """Append every item of ``data`` as one JSON line to the DeepGeo file of the current ``usage``."""
    DeepGeoHydrated = "C:/Users/dennis/Desktop/dataTestTrain/deepGeoUser."+usage
    with open(DeepGeoHydrated, 'a') as file:
        for line in data:
            json.dump(line, file)
            file.write('\n')


# Features which we will use for each tweet
col_1 = ['contributors', 'lang']
col_2 = ['created_at', 'favorited']
col_3 = ['in_reply_to_status_id_str', 'user', 'retweet_count', 'favorite_count']
col_4 = ['retweeted', 'in_reply_to_user_id', 'in_reply_to_screen_name', 'source', 'entities']


def buildDeepGeoLine(tweet):
    """Return the dict that is written for one tweet.

    The keys are inserted in a fixed order (json.dump keeps insertion order):
    col_1, "text", col_2, the constant/renamed fields, col_3, "hashed_user_id"
    and finally col_4. Raises KeyError if the tweet lacks one of the copied fields.
    """
    line = {}
    for column in col_1:
        line[column] = tweet[column]

    line["text"] = tweet["full_text"]

    for column in col_2:
        line[column] = tweet[column]

    line["filter_level"] = "medium"
    line["hashed_tweet_id"] = tweet["id_str"]
    line['in_reply_to_user_id_str'] = tweet['in_reply_to_user_id_str']
    line['truncated'] = tweet["truncated"]
    line['in_reply_to_status_id'] = None

    for column in col_3:
        line[column] = tweet[column]

    line['hashed_user_id'] = tweet["user"]["id_str"]

    for column in col_4:
        line[column] = tweet[column]

    return line


# [tweet id, ground truth of the tweet's user] for every tweet that was written.
# Reads `newUserLabel` (user id -> "coord0 coord1" string) from the cell that
# writes the ground truth file.
validData = []
with open(pathHydrated, encoding="utf8", errors='ignore') as allTweets:
    for raw in allTweets:
        tweet = json.loads(raw)
        user = tweet["user"]["id_str"]

        if user not in userSet:
            continue

        # A tweet is geo-tagged if it has "geo", or else a "place" with a bounding box
        geo = tweet["geo"]
        if geo is None:
            place = tweet["place"]
            if place is None or place["bounding_box"] is None:
                continue

        line = buildDeepGeoLine(tweet)
        validData.append([tweet["id_str"], newUserLabel[user]])
        writeToJson([line])

In [None]:
# Put the 3 input files (training, testing and ground truth) in the corresponding paths as declared in "config.py" and
# set the mode in "utils.py" to "train". To train a specific model, set the corresponding parameters in "config.py" and then
# use the command "python geo_train.py". The trained model should be in the folder "output" and has to be selected
# when testing datasets.