In [1]:
import copy
import random
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from datetime import datetime as dt
import backend_codes.get_tweets as gt
import backend_codes.tweet_processing as tp

# Preprocessing Twitter Data

# Assigning Neighborhoods via Places
As mentioned in the previous notebook, Twitter no longer uses direct coordinates for attaching tweet locations. It is now more common to get implicit location data, e.g. your gym, coffee shop, bus stop, neighborhood, city, state, country, etc.
We want to perform an analysis on the neighborhood level, so the only interesting place types for us are neighborhoods and POIs. These have to be assigned to the district they are located in.

Furthermore, each tweet should have an entry for each of its coordinates as well as these coordinates in WKT format.

The geodata for Rio's neighborhoods, the barrios, is taken from the [Rio open data portal](https://www.data.rio/). These are the geometries that the tweets are going to be matched to.

### Filtering Places
First, we load the places list. The `tweet_processing` module can filter tweets or place lists. We initialize an analyzer object, apply all common filters, and finally separate POIs from neighborhoods.

In [2]:
# Load the stored Twitter API response; `fr['includes']['places']` and
# `fr['data']` are read from it further down.
# A forward-slash path is used (like every other path in this notebook) so the
# cell also runs on Linux/macOS, where a backslash is not a path separator.
fr = gt.load_tweets('data/tweets/retrieved_tweets.txt')

In [3]:
# Place objects that were delivered alongside the tweets in the API response.
places = fr['includes']['places']

# PlaceAnalyser is a project helper from backend_codes.tweet_processing.
# NOTE(review): the "Removed ... places" report shown below is presumably printed
# by use_filters -- confirm in the helper module.
place_analyser = tp.PlaceAnalyser(places)
places = place_analyser.use_filters(neighborhoods=True, pois=True)
# Keep the two place kinds the analyser exposes as separate collections.
neigh = place_analyser.neighborhoods
pois = place_analyser.pois

Removed 1 Places
0.0025760581158635887 % of the places were located in a city or bigger region


Removed 801 places
97.93652429285383 % of the places were POIs 


Removed 38017 places
2.0634757071461696 % of the places were Neighborhoods 




### Omitting Neighborhood Tweets
Since there are multiple issues with matching tweets that only have a neighborhood tag to the actual neighborhood, we decided to omit these tweets entirely. Some of the 163 neighborhoods, for example, do not exist on Twitter. Others exist but have different geometries than on the official [open data portal](https://www.data.rio/). Since we could not develop a coherent method to overcome these problems, we decided to use only the POI tweets.

In [4]:
# Neighborhood (barrio) polygons; the CODBAIRRO column is used below as the barrio ID.
barrios = gpd.read_file('data/shps/neighborhoods.shp')

### Matching the Pois to the open data neighborhoods
For the POIs, Twitter only pretends to provide a bounding box; in reality it is a single point whose coordinates are repeated:
- 'bbox': [-43.39237500933108,
  -22.97877033005603,
  -43.39237500933108,
  -22.97877033005603]
  
We create a Point from these coordinates and then do a point-in-polygon test against each barrio. If a barrio contains the point, we store its code (`CODBAIRRO`) in our list; otherwise we store `None`.

In [5]:
# Assign every POI to the barrio polygon that contains it and attach its geometry.
# Geometries and barrio codes are paired positionally, so the lookup no longer
# relies on `barrios` having a default 0..n-1 index (the original fed the
# positional enumerate index into the label-based `barrios.loc[j]`).
barrio_shapes = list(zip(barrios.geometry.values, barrios.CODBAIRRO.tolist()))

for poi in pois:
    # Twitter reports a POI "bbox" as one point repeated twice: [lon, lat, lon, lat].
    lon, lat = poi['geo']['bbox'][:2]
    p = Point(lon, lat)

    # The first containing barrio wins; None marks POIs that lie in no barrio.
    poi['cod'] = next((cod for poly, cod in barrio_shapes if p.within(poly)), None)

    # 'wkt' stores the shapely Point itself (it is used as geometry column later).
    poi['wkt'] = p
    poi['lat'] = p.y
    poi['lon'] = p.x

The places have to be combined and are then associated with their place_id for matching later. So for each place id as key, the dictionary holds the respective cod (the neighborhood's ID) and the place data as value. Many places do not have a cod, since they are outside of Rio's boundaries but inside its bounding box.

In [6]:
# Only the POIs are carried forward as the place list.
places = pois

In [7]:
def _place_geodata(place):
    """Return ``[wkt, lat, lon]`` for a place, or ``None`` if no geodata applies."""
    kind = place['place_type']

    # POIs carry their own point geometry.
    if kind == 'poi':
        return [place['wkt'], place['lat'], place['lon']]

    # Neighborhoods with a known code take the geodata of the matching barrio row.
    if kind == 'neighborhood' and place['cod'] is not None:
        matching_rows = barrios.loc[barrios.CODBAIRRO == place['cod']]
        return matching_rows[['wkt', 'lat', 'lon']].values.tolist()[0]

    return None


# place id -> (barrio code, geodata)
d = {place['id']: (place['cod'], _place_geodata(place)) for place in places}

### Adding Barrios to the Tweets
Now that we have cods for all place ids we are interested in, we add them to the tweets.

All Tweets should have such a place-id attached. We check here for potential mistakes.

In [8]:
# Work on a deep copy so the loaded response in `fr` is left untouched.
tweets = copy.deepcopy(fr['data'])
print('Tweets raw: ', len(tweets))
# TweetAnalyzer is a project helper from backend_codes.tweet_processing.
# NOTE(review): the per-filter "Removed ... Tweets" report shown below is
# presumably printed by use_filters -- confirm in the helper module.
tweets = tp.TweetAnalyzer(tweets).use_filters()
print('Tweets after first filters: ', len(tweets))

Tweets raw:  699524
Deleted 0 Tweets
100.0 % were not retweets


Removed 3 Tweets
99.99957113694455 % of the Tweets have a location attached


Removed 1824 Tweets
0.26074985597287537 % of the Tweets were coordinates


Removed 7 Tweets
0.0010033008598213655 % of the Tweets were located in a city or bigger region


Removed 1455 Tweets
99.79145465751265 % of the Tweets have a place type attached


Tweets after first filters:  696235


There are no retweets in the dataset

3 tweets have no location attached (likely bug)

1824 are coordinates, these are not useful anymore

7 tweets were falsely returned with a place_type city or even bigger (likely bug)

1455 tweets did not even have a place type (likely bug)

In [9]:
def add_cods(tweets, place_lookup=None):
    """Attach the barrio code and point geodata of each tweet's place.

    Parameters
    ----------
    tweets : list of dict
        Tweet objects; each must carry a ``'geo'`` dict. The dicts are
        modified in place.
    place_lookup : dict, optional
        Maps a place id to ``(cod, data)``, where ``data`` is ``[wkt, lat, lon]``
        or ``None``. Defaults to the notebook-level dictionary ``d``.

    Returns
    -------
    list of dict
        The same ``tweets`` list. Every tweet gains the keys ``'cod'``,
        ``'wkt'``, ``'lat'`` and ``'lon'``. Tweets whose place id is not in
        the lookup get ``cod == '0'`` and ``None`` for the geodata keys.
    """
    if place_lookup is None:
        place_lookup = d

    for tweet in tweets:
        geo = tweet['geo']

        # A single lookup with an explicit default replaces the former bare
        # `except:`, which hid every kind of error, not only unknown place ids.
        cod, data = place_lookup.get(geo.get('id'), ('0', None))

        if 'place_type' not in geo:
            print('NO PLACE TYPE')
            geo['place_type'] = 'coordinate'

        tweet['cod'] = cod

        if data is not None:
            tweet['wkt'] = data[0]
            tweet['lat'] = data[1]
            tweet['lon'] = data[2]
        else:
            tweet['wkt'] = None
            tweet['lat'] = None
            tweet['lon'] = None

    return tweets

# add_cods extends the tweet dicts in place and returns the same list.
tweets = add_cods(tweets)

Filter for tweets with valid CODs. Here we remove tweets that originate from outside the city's boundaries. Only the tweets with a valid COD are retained.

In [10]:
# Keep only tweets with a usable barrio code: '0' marks an unknown place id and
# None a place that lies in no barrio. The list is walked back to front, so the
# kept tweets end up in reverse input order.
tweets_filtered = [
    tweet
    for tweet in reversed(tweets)
    if tweet['cod'] != '0' and tweet['cod'] is not None
]

print(f'{len(tweets_filtered)} of originally {len(tweets)} could be assigned to a Barrio ({round(len(tweets_filtered) / len(tweets)*100, 1)} %)\n')
print('Tweets after valid cod filter: ', len(tweets_filtered))

# only_pois is a project helper; it reports how many tweets it removed.
poi_analyzer = tp.TweetAnalyzer(tweets_filtered)
only_pois = poi_analyzer.only_pois()
print('Tweets after poi filter: ', len(only_pois))

420518 of originally 696235 could be assigned to a Barrio (60.4 %)

Tweets after valid cod filter:  420518
Removed 0 Tweets
100.0 % of the Tweets were POIs 
Tweets after poi filter:  420518


275717 tweets (696235 − 420518) could not be assigned to a barrio.

In [11]:
# Tweet fields that are dropped when the POI table is built in the next cell.
drops = ['lang', 'public_metrics', 'conversation_id', 'text', 'referenced_tweets', 'geo', 'withheld']

### Only use POIs
Our current work focuses on POIs only. A dataframe is created in which unnecessary information is removed and the columns are renamed.

In [12]:
# Build the POI tweet table in one chain: drop unused fields, parse the
# timestamps, reduce each author entry to its 'id' and apply the final names.
# NOTE(review): 'geo' is part of `drops`, so the 'geo' -> 'Place_ID' rename
# finds no column to rename -- confirm whether a Place_ID column is wanted.
pois_df = (
    pd.DataFrame(only_pois)
    .drop(columns=drops)
    .assign(
        created_at=lambda frame: frame['created_at'].apply(
            lambda x: dt.strptime(x, "%Y-%m-%dT%H:%M:%S.000Z")
        ),
        author_id=lambda frame: frame['author_id'].apply(lambda x: x['id']),
    )
    .rename(columns={'author_id': 'User_ID', 'id': 'Tweet_ID', 'geo': 'Place_ID', 'created_at': 'Timestamp'})
)
# Barrio codes are stored as integers.
pois_df['cod'] = pois_df['cod'].astype(int)

# Bot Filtering
For bot filtering we use thresholds to remove all users that either send more than 50 tweets a day (averaged over the whole observation period) or are responsible for more than 1 % of the total tweet output.

In [13]:
# Flag likely bots with two volume thresholds and drop all of their tweets.
grouped = pois_df.copy()

grouped['day'] = grouped.Timestamp.apply(dt.date)

# Tweets per user, computed once and shared by both thresholds.
tweets_per_user = grouped.groupby('User_ID').size()

# Mean Tweets per day per User larger than 50.
# The observation span is clamped to at least one day: with all tweets on a
# single calendar day the span is 0, every rate would become inf and every
# user would be flagged as a bot.
n_days = max((grouped.day.max() - grouped.day.min()).days, 1)
tpdpu = tweets_per_user / n_days
bot_users1 = tpdpu[tpdpu > 50]

# More than 1 % of Tweets
ppu = tweets_per_user / len(grouped)
bot_users2 = ppu[ppu > 0.01]

# Explicit union of both user sets. The former `bot_users1 + bot_users2` only
# yielded the union as a side effect of index alignment (with NaN values).
bots = bot_users1.index.union(bot_users2.index)

# Drop the bots' tweets and the helper 'day' column again.
filtered = grouped.loc[~grouped.User_ID.isin(bots)]
filtered = filtered[pois_df.columns]

In [14]:
# Persist the bot-filtered POI tweets as CSV (without the DataFrame index).
filtered.to_csv('data/tweets/preprocessed_tweets_with_poi_location.csv', index=False)

### Show short summary of our Twitter Data

In [15]:
# NOTE(review): the summary describes `pois_df`, i.e. the tweets before the bot
# filter was applied -- confirm that this is the intended table.
summary_lines = (
    ("Our Twitter POIs are {} Tweets", len(pois_df)),
    ("Number of unique users: {}", pois_df.User_ID.nunique()),
    ("First POI-Tweet: {}", pois_df.Timestamp.min()),
    ("Last POI-Tweet: {}", pois_df.Timestamp.max()),
)
for template, value in summary_lines:
    print(template.format(value))

Our Twitter POIs are 420518 Tweets
Number of unique users: 107500
First POI-Tweet: 2020-04-06 00:03:33
Last POI-Tweet: 2022-08-31 23:58:25


### Add tweet counts to neighborhoods

In [16]:
# The 'wkt' column holds the shapely Points created for the POIs, so it is used
# directly as geometry. From here on `tweets` is a GeoDataFrame, not a list.
# NOTE(review): no CRS is set here -- confirm it matches `barrios` before
# relying on spatial operations between the two.
tweets = gpd.GeoDataFrame(filtered, geometry=filtered.wkt)

In [17]:
def _count(x):
    """Return the number of rows left after clipping `tweets` to geometry `x`."""
    return len(gpd.clip(tweets, x))


# Tweet count per barrio, absolute and relative to the 'popsize' column.
barrios['counts'] = barrios['geometry'].apply(_count)
barrios['counts_per_pop'] = barrios['counts'].div(barrios['popsize'])

In [20]:
%%capture
# NOTE(review): this overwrites the same shapefile that `barrios` was read from,
# so a re-run of the notebook starts from the already extended file. Shapefile
# field names are also commonly limited to 10 characters -- verify how
# 'counts_per_pop' is stored.
barrios.to_file('data/shps/neighborhoods.shp')