In [None]:
# Predict geolocation with Carmen
import json
import carmen
import csv
import datetime


# Shared Carmen resolver: created on first use and then reused for every
# tweet, so the location data is loaded once instead of once per call.
_RESOLVER = None


def _get_resolver():
    """Return the shared Carmen resolver, creating and loading it on first use."""
    global _RESOLVER
    if _RESOLVER is None:
        resolver = carmen.get_resolver()
        resolver.load_locations()
        _RESOLVER = resolver
    return _RESOLVER


# Method returns prediction
def getLocation(data, resolver=None):
    """Predict the location of a single tweet with Carmen.

    Args:
        data: One tweet as a JSON string. The decoded object must contain
            an "id" key.
        resolver: Optional object exposing ``resolve_tweet(tweet)``. When
            omitted, a shared Carmen resolver is created on the first call
            and reused afterwards.

    Returns:
        A dict with the keys "id", "latitude", "longitude", "country",
        "state", "county", "city", "known" and "location_id" (in that
        order). If the resolver yields no location, every field except
        "id" holds the string "None".

    Raises:
        json.JSONDecodeError: If ``data`` is not valid JSON.
        KeyError: If the decoded tweet has no "id" key.
    """
    tweet = json.loads(data)

    if resolver is None:
        resolver = _get_resolver()
    location = resolver.resolve_tweet(tweet)

    # If Carmen can't predict, we set all attributes to "None"
    if not location:
        return {
            "id": tweet["id"],
            "latitude": "None",
            "longitude": "None",
            "country": "None",
            "state": "None",
            "county": "None",
            "city": "None",
            "known": "None",
            "location_id": "None",
        }

    # The resolved location object is the second element of the result
    place = location[1]
    return {
        "id": tweet["id"],
        "latitude": place.latitude,
        "longitude": place.longitude,
        "country": place.country,
        "state": place.state,
        "county": place.county,
        "city": place.city,
        "known": place.known,  # True if location appears in Database, false otherwise
        "location_id": place.id,
    }
        
# Number of tweets processed so far; drives the progress output
# (first tweet, then every 10.000 tweets, each with a timestamp)
counter = 0

# Path Hydrated Tweets (input) and path to save predictions (output)
fileHydrated = 'C:/Users/dennis/Desktop/TweetsCOV19ALLIDs.jsonl'
filePredictions = 'C:/Users/dennis/Desktop/Carmen.csv'

with open(fileHydrated, encoding="utf8", errors='ignore') as hydratedJSON, \
        open(filePredictions, 'w', newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile, delimiter=';')

    # Tweets are predicted one line at a time to keep memory usage low
    for line in hydratedJSON:
        locatedTweet = getLocation(line)

        # One CSV row per tweet, values in the order getLocation returns them
        writer.writerow(locatedTweet.values())
        counter += 1

        if counter == 1 or counter % 10000 == 0:
            x = datetime.datetime.now()
            print(x.strftime("%X"), counter)

In [16]:
# Print coverage for Carmen predictions
import pandas as pd

# Path Carmen predictions
pathCarmen = 'C:/Users/dennis/Desktop/BachelorArbeit/resultsAll/Carmen.csv'

# Load the predictions. Column names are passed explicitly, so the first row
# of the file is read as data. Afterwards every "None" value and every empty
# cell is represented by the string "null".
df_carmen = (
    pd.read_csv(
        pathCarmen,
        delimiter=';',
        names=['id', 'latitude', "longitude", "country", "state",
               "county", "city", "known", "location_id"],
    )
    .replace("None", "null")
    .fillna("null")
)


# Coverage for the whole dataset
# Total number of rows; used as the denominator for the coverage figures
countAll = df_carmen.shape[0]
def countTweets(locationType, df=None):
    """Count the rows whose ``locationType`` column holds a prediction.

    Args:
        locationType: Name of the column to inspect, e.g. "country",
            "state", "county" or "city".
        df: Optional DataFrame to count in. Defaults to the module-level
            ``df_carmen``.

    Returns:
        Number of rows whose value in ``locationType`` is not the string
        "null", as a plain int.
    """
    frame = df_carmen if df is None else df
    # Summing the boolean mask counts the matches without copying the frame
    return int((frame[locationType] != "null").sum())

# Number of tweets with a prediction on each level
countCountries = countTweets("country")
countState = countTweets("state")
countCounty = countTweets("county")
countCity = countTweets("city")

# Coverage in percent, rounded to two decimals. The counts from above are
# reused instead of scanning the data a second time; an empty dataset
# yields 0.0 instead of raising ZeroDivisionError.
covCountries, covState, covCounty, covCity = (
    round(((count / countAll) * 100), 2) if countAll else 0.0
    for count in (countCountries, countState, countCounty, countCity)
)

print("Country: "+ str(covCountries)+"%, State: "+str(covState)+"%, County: "+str(covCounty)+"%, City: "+str(covCity)+"%")
print("*********************************")


# Coverage for specific countries
list_countries = ["United States","United Kingdom","Germany","France","Spain","India","Italy"]


# NOTE: this definition reuses the name of the single-argument countTweets
# defined earlier in this file, so that earlier helper is no longer
# callable after this point.
def countTweets(list_countries, locationType, df=None):
    """Count predicted tweets per country on the given location level.

    Args:
        list_countries: Country names to report on, matched exactly against
            the "country" column.
        locationType: "country" to count all tweets of each country, or
            another column name (e.g. "state", "county", "city") to count
            only the tweets of that country whose value in that column is
            not the string "null".
        df: Optional DataFrame to count in. Defaults to the module-level
            ``df_carmen``.

    Returns:
        A dict mapping each country in ``list_countries`` to its count as
        a plain int.
    """
    frame = df_carmen if df is None else df
    country_column = frame["country"]

    # The "has a prediction" filter does not depend on the country, so it
    # is computed once; it is not needed on country level.
    has_value = None
    if locationType != "country":
        has_value = frame[locationType] != "null"

    count = {}
    for country in list_countries:
        mask = country_column == country
        if has_value is not None:
            mask = mask & has_value
        count[country] = int(mask.sum())
    return count

# Calculate coverage for each country in list_countries
countCountries = countTweets(list_countries,"country")
countState = countTweets(list_countries,"state")
countCounty = countTweets(list_countries,"county")
countCity = countTweets(list_countries,"city")

# Print results: tweets per country and the share of them (in percent,
# two decimals) that also have a state, county and city prediction
for x in list_countries:
    total = countCountries[x]
    if total:
        percState = round(((countState[x]/total)*100),2)
        percCounty = round(((countCounty[x]/total)*100),2)
        percCity = round(((countCity[x]/total)*100),2)
    else:
        # A listed country without any tweet would otherwise raise
        # ZeroDivisionError; report 0.0% for every level instead
        percState = percCounty = percCity = 0.0

    print(x+": "+str(countCountries[x])+", State "+str(percState)+"%, County "+str(percCounty)+"%, City "+str(percCity)+"%")

    


Country: 47.73%, State: 36.4%, County: 24.49%, City: 27.26%
United States: 1745400, State 90.18%, County 62.04%, City 61.76%
United Kingdom: 388679, State 78.75%, County 64.06%, City 58.83%
Germany: 16817, State 42.02%, County 39.07%, City 33.86%
France: 17346, State 38.25%, County 37.53%, City 37.52%
Spain: 9876, State 58.75%, County 44.7%, City 41.91%
India: 268334, State 52.85%, County 17.01%, City 44.59%
Italy: 6840, State 39.12%, County 36.84%, City 33.85%
