# Modeling Twitter Data

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import re
import pickle

from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.utils.multiclass import unique_labels
from sklearn.externals import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_auc_score

## Modeling the Status of the Tweets For 10/02/2018-10/02/2019

In [9]:
# reading in data
last_year = pd.read_csv('../datasets/last_year_closed.csv')
last_year.head()

Unnamed: 0,Tweet,Date,road_closure
0,all clear …,Tue Oct 01 21:05:54 +0000 2019,0
1,join calmentor north region for a networking s...,Tue Oct 01 20:06:52 +0000 2019,0
2,expect delays on northbound i-5 near j street ...,Tue Oct 01 18:36:08 +0000 2019,1
3,on #cleanairdayca give public transportation a...,Tue Oct 01 17:59:04 +0000 2019,0
4,#trafficalert: permit loads will be restricted...,Mon Sep 30 23:29:23 +0000 2019,1


In [10]:
# there were 8 null values for some reason so I dropped them
last_year.dropna(inplace=True)

In [55]:
# train test split on tweets and closed status
X = last_year['Tweet']
y = last_year['road_closure']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [16]:
# using a grid search with our parameters to find the best logistic regression model
# Instantiate SVM.
# grid search function parameters
pipe_params = {
    'cvec__max_features': [100, 250, 500, 1000],
    'cvec__min_df': [2, 3, 4],
    'cvec__max_df': [0.85, 0.90, 0.95], 
    'cvec__ngram_range': [(1, 1), (1, 2), (1,3)]
              }
pipe = Pipeline([
        ('cvec', CountVectorizer()),
        ('lr', LogisticRegression())
    ])

# Instantiate GridSearchCV.
gs = GridSearchCV(pipe,
                  pipe_params, 
                  cv=3) 

In [17]:
# Fit GridSearch to training data.
gs.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [18]:
# What's the best score?
print(gs.best_score_)

0.9322620738636364


In [19]:
# Save best model as gs_model.
gs_model = gs.best_estimator_

In [20]:
# Score model on training set.
gs_model.score(X_train, y_train)

0.9601384943181818

In [21]:
gs_model.score(X_test, y_test)

0.9379494007989347

In [57]:
pred_probs = gs_model.predict_proba(X_test)
predictions = gs_model.predict(X_test)

In [58]:
X_test = pd.DataFrame(X_test)

In [59]:
# adding probabilities to kincade_fire_closed dataframe
probs = []
for prob in pred_probs:
    if prob[0] > prob[1]:
        probs.append(prob[0])
    else:
        probs.append(prob[1])
X_test['closed_probs'] = probs

In [60]:
X_test['closed_open'] = predictions

In [61]:
X_test.head(20)

Unnamed: 0,Tweet,closed_probs,closed_open
849,caltrans is hosting a public open house at our...,0.999603,0
7449,amazon delivers more than bargained for in gle...,0.97983,0
11735,historic photos show what sonoma county looked...,0.980866,0
11649,sonoma valley water users can get smart about ...,0.992612,0
11079,sonoma cops: parallel worlds in police work,0.990167,0
14717,#pilotfire [update] off pilot ridge rd & usfs ...,0.771665,1
10787,wayward duck gets sucked down lake berryessa g...,0.995431,0
12938,hundley harvey and agrimonti come out on top i...,0.995474,0
11299,city eyes $15 minimum wage,0.984381,0
3571,anything that is tossed on the street after cr...,0.920236,0


In [62]:
X_test.to_csv('../datasets/yrloop_closed_probs.csv', index=False)

In [63]:
# checking our baseline score
y.value_counts(normalize=True)

0    0.777482
1    0.222518
Name: road_closure, dtype: float64

## Using Model to Detect Road Closures During the Kincade Fire

In [64]:
# reading in data
kincade_fire = pd.read_csv('../datasets/kincade_fire_all.csv')
kincade_fire.head()

Unnamed: 0,Tweet,User,User_ID,Geo,HashTag,Date
0,Lakeville Hwy remains closed during investigat...,CHPSantaRosa,1902334747,,,Fri Nov 01 23:10:10 +0000 2019
1,Lakeville Hwy is closed between SR37 and SR116...,CHPSantaRosa,1902334747,,,Fri Nov 01 22:21:02 +0000 2019
2,We’re on scene of a fatal traffic collision on...,CHPSantaRosa,1902334747,,,Fri Nov 01 22:01:41 +0000 2019
3,Halloween definitely has a different feel arou...,CHPSantaRosa,1902334747,,,Thu Oct 31 23:47:18 +0000 2019
4,More progress! #kincadefirehttps://twitter.com...,CHPSantaRosa,1902334747,,#kincadefirehttps,Thu Oct 31 21:00:33 +0000 2019


In [65]:
# train test split on tweets and closed status
X = kincade_fire['Tweet']
X_train, X_test = train_test_split(X, random_state=42)

In [66]:
# accuracy score
predictions = gs_model.predict(X_test)
pred_probs = gs_model.predict_proba(X_test)

In [67]:
X_test = pd.DataFrame(X_test)

In [68]:
# adding probabilities to kincade_fire_closed dataframe
probs = []
for prob in pred_probs:
    if prob[0] > prob[1]:
        probs.append(prob[0])
    else:
        probs.append(prob[1])
X_test['closed_probs'] = probs

In [69]:
X_test['closed_open'] = predictions

In [71]:
X_test.head()

Unnamed: 0,Tweet,closed_probs,closed_open
940,No problem!,0.981459,0
297,"Hi there, Séamus. The videos are captioned - c...",0.844956,0
271,Evacuation Orders - https://socoemergency.org/...,0.665353,1
948,Of course!,0.952165,0
1065,"(Update 54, 11:16 am) #KincadeFire Air attack ...",0.803989,0


In [70]:
X_test.to_csv('../datasets/kincade_fire_closed_probs.csv', index=False)

In [72]:
tweets_closed = X_test[X_test['closed_open'] == 1]

In [75]:
tweets_closed.to_csv('../datasets/kincade_fire_closed_only.csv', index=False)