# Cleaning Tweets and Determining Road Closures

In [20]:
#data stuff
import pandas as pd
import numpy as np
import re

#other
from tqdm import tqdm_notebook

## Cleaning tweets from the last year

In [21]:
# load the tweet dataset and drop the user, geo and hashtag columns
last_year = pd.read_csv('../datasets/yrloop_all.csv').drop(
    columns=['User', 'User_ID', 'Geo', 'HashTag']
)

In [22]:
# clean tweet function
def clean_tweets(df):
    """Normalise the text in ``df['Tweet']`` and return ``df``.

    The 'Tweet' column is overwritten on the frame that was passed in, so the
    caller sees the cleaned text whether or not the return value is used.

    Steps, in order:
      1. remove ``pic.twitter.com...`` links (through the end of the line,
         plus any line breaks that follow),
      2. replace ``http://`` / ``https://`` URLs with a single space,
      3. lower-case the text,
      4. delete the characters ``. / , ' ; ( )``.

    Missing tweets (NaN/None) are left as missing instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Tweet' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'Tweet' cleaned.
    """
    tweets = df['Tweet']

    # Dots are escaped so only a literal "pic.twitter.com" is matched; with
    # bare dots, text such as "epic twitter comeback" was stripped as well.
    tweets = tweets.str.replace(r'pic\.twitter\.com.*[\r\n]*', '', regex=True)

    # Raw string avoids invalid-escape warnings; the vectorised .str.replace
    # skips missing values, unlike re.sub inside .map which raised on NaN.
    tweets = tweets.str.replace(r'https?://\S*', ' ', regex=True)

    tweets = tweets.str.lower()

    # One explicit character class instead of seven separate replace calls
    # that relied on the (version-dependent) default of the `regex` argument.
    tweets = tweets.str.replace(r"[./,';()]", '', regex=True)

    df['Tweet'] = tweets
    return df


In [23]:
# cleaning the tweets: clean_tweets overwrites last_year['Tweet'] on the frame
# itself and returns that same frame, which is displayed as the cell output
clean_tweets(last_year)

Unnamed: 0,Tweet,Date
0,all clear …,Tue Oct 01 21:05:54 +0000 2019
1,join calmentor north region for a networking s...,Tue Oct 01 20:06:52 +0000 2019
2,expect delays on northbound i-5 near j street ...,Tue Oct 01 18:36:08 +0000 2019
3,on #cleanairdayca give public transportation a...,Tue Oct 01 17:59:04 +0000 2019
4,#trafficalert: permit loads will be restricted...,Mon Sep 30 23:29:23 +0000 2019
...,...,...
15083,last year nine california first responders tra...,Sat Oct 06 16:00:00 +0000 2018
15084,don’t fall for it! despite recent rains fire d...,Fri Oct 05 18:34:37 +0000 2018
15085,#halloween2018 is less than a month away! but ...,Wed Oct 03 20:23:51 +0000 2018
15086,cal fire is assisting @sierra_nf with a fire o...,Wed Oct 03 01:25:38 +0000 2018


In [32]:
# Keyword lists handed to tweet_filter to decide whether a tweet signals a road closure.
# tweet_filter checks each entry with `in` against the lower-cased tweet, i.e. plain
# substring matching, so a short entry such as 'st' also matches inside longer words.

# the tweet has to mention some kind of road
road_keywords = [
    'road', 'st', 'rd', 'hwy', 'highway', 'ave', 'avenue',
    'intersection', 'bridge', 'sr-', 'cr-', 'us-', 'i-', 'blvd',
]

# ...and something that suggests the road is closed or disrupted
closed_keywords = [
    'closed', 'remains closed', 'shut down', 'backed up',
    'no travel', 'delay', 'blocked', 'delays',
    'disabled', 'traffic', 'fire', 'flood', 'closures', 'closure',
]

# ...and none of these, which suggest the road is open again
to_drop = [
    "open", "opened", "lifted",
    "reopened", "clear", "cleared",
]

In [33]:
def tweet_filter(df, col, keywords, roads, dropwords):
    """Flag tweets that look like road-closure reports.

    A tweet is flagged 1 when its lower-cased text contains at least one entry
    of ``roads`` AND at least one entry of ``keywords`` AND no entry of
    ``dropwords``; otherwise it is flagged 0. Entries are tested with ``in``
    (substring matching) and are not lower-cased here, so they should be
    given in lower case.

    Missing or non-string values in ``df[col]`` are flagged 0 instead of
    raising a TypeError.

    Side effect: the flags are also written to ``df['road_closure']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tweets; gains/overwrites a 'road_closure' column.
    col : str
        Name of the text column to scan.
    keywords : iterable of str
        Substrings that indicate a closure or disruption.
    roads : iterable of str
        Substrings that indicate a road is being mentioned.
    dropwords : iterable of str
        Substrings that veto the flag (e.g. the road has reopened).

    Returns
    -------
    pandas.Series
        The 'road_closure' column (1 = closure, 0 = not).
    """
    def _is_closure(text):
        # NaN/None (or any other non-text value) cannot be searched with `in`
        if not isinstance(text, str):
            return 0
        text = text.lower()
        # `and` short-circuits, so later lists are only scanned when needed
        return int(any(word in text for word in roads)
                   and any(word in text for word in keywords)
                   and not any(word in text for word in dropwords))

    df['road_closure'] = df[col].map(_is_closure)
    return df['road_closure']

In [35]:
# label every tweet in the full dataframe as closure (1) or not (0)
last_year['road_closure'] = tweet_filter(
    df=last_year,
    col='Tweet',
    keywords=closed_keywords,
    roads=road_keywords,
    dropwords=to_drop,
)

In [38]:
# making sure this worked: the first rows should now include a road_closure column
last_year.head()

Unnamed: 0,Tweet,Date,road_closure
0,all clear …,Tue Oct 01 21:05:54 +0000 2019,0
1,join calmentor north region for a networking s...,Tue Oct 01 20:06:52 +0000 2019,0
2,expect delays on northbound i-5 near j street ...,Tue Oct 01 18:36:08 +0000 2019,1
3,on #cleanairdayca give public transportation a...,Tue Oct 01 17:59:04 +0000 2019,0
4,#trafficalert: permit loads will be restricted...,Mon Sep 30 23:29:23 +0000 2019,1


In [42]:
# save the labelled tweets; index=False leaves the row index out of the file
last_year.to_csv('../datasets/last_year_closed.csv', index=False)