In [None]:
import pandas as pd
import numpy as np
import re
from pprint import pprint
from datetime import datetime

#scraping
import GetOldTweets3 as got
import time

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Get Twitter data through web-scraping

In [None]:
def get_tweets(text_query, start_date, end_date, lang, location, within):
    """Scrape tweets matching the given search criteria into a DataFrame.

    Parameters
    ----------
    text_query : value passed to ``setQuerySearch``.
    start_date : value passed to ``setSince``.
    end_date : value passed to ``setUntil``.
    lang : value passed to ``setLang``.
    location : value passed to ``setNear``.
    within : value passed to ``setWithin``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped tweet, with the columns ``User``, ``Text``,
        ``Date``, ``Retweets``, ``Favorites`` and ``HashTags``.
    """
    # tweet attribute -> output column name, listed in output column order
    attribute_to_column = {
        'username': 'User',
        'text': 'Text',
        'date': 'Date',
        'retweets': 'Retweets',
        'favorites': 'Favorites',
        'hashtags': 'HashTags',
    }

    # build the search criteria step by step
    criteria = (
        got.manager.TweetCriteria()
        .setQuerySearch(text_query)
        .setSince(start_date)
        .setUntil(end_date)
        .setLang(lang)
        .setNear(location)
        .setWithin(within)
    )

    # run the search
    scraped = got.manager.TweetManager.getTweets(criteria)

    # one row per tweet, holding only the attributes listed above
    rows = []
    for tw in scraped:
        rows.append([getattr(tw, attribute) for attribute in attribute_to_column])

    return pd.DataFrame(rows, columns=list(attribute_to_column.values()))

We selected 10 cities that have large populations and whose counties have a large number of confirmed cases.  
This selection is based on JHU data from 2020-04-13.
  
Selected cities:  
New York City, New York  (New York county 103208 1st)  
Boston, Massachusetts  (Suffolk county 20934 20th)    
Chicago, Illinois  (Cook county 14585 5th)  
Detroit, Michigan  (Wayne county 11164 6th)  
Los Angeles, California  (Los Angeles county 8894 8th)  
Houston, Texas  (Harris county 3747 26th)  
Newark, New Jersey  (Essex county 7410 11th)  
Miami, Florida  (Miami-Dade county 7058 12th)  
Philadelphia, Pennsylvania  (Philadelphia county 6386 13th)  
New Orleans, Louisiana  (Orleans parish 5600 17th)  
  
Reference for population: http://www.citymayors.com/gratis/uscities_100.html    

In [None]:
# search parameters handed to get_tweets
text_query = '#StayHome'
start_date = '2020-03-05'
end_date = '2020-04-12'
lang = 'en'
within = '50mi'

# the ten selected cities, written as "City, State"
citys = [
    'New York City, New York',
    'Boston, Massachusetts',
    'Chicago, Illinois',
    'Detroit, Michigan',
    'Los Angeles, California',
    'Houston, Texas',
    'Newark, New Jersey',
    'Miami, Florida',
    'Philadelphia, Pennsylvania',
    'New Orleans, Louisiana',
]

In [None]:
# scrape the tweets for every city and export each result to "<city>.csv"
PAUSE_SECONDS = 120  # wait between two consecutive searches (presumably to avoid being throttled)
for position, city in enumerate(citys):
    # pause only BETWEEN requests: nothing to wait for before the first
    # search or after the last one
    if position > 0:
        time.sleep(PAUSE_SECONDS)
    location = city
    df = get_tweets(text_query, start_date, end_date, lang, location, within)
    df.to_csv(city + '.csv', index = False)

In [None]:
# add 'City' column and combine all datasets
# city_column[k] is the short label for citys[k] (the two lists are paired by position)
city_column = ['NY', 'BOSTON', 'CHI', 'DETROIT', 'LA', 'HOUSTON', 'NEWARK', 'MIA', 'PHIL', 'NEW ORLEANS']

# read every per-city file first and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all
# previously read rows on every iteration
city_frames = []
for csv_stem, city_label in zip(citys, city_column):
    df = pd.read_csv(csv_stem + '.csv')
    df['City'] = city_label
    city_frames.append(df)

stayhome = pd.concat(city_frames, axis = 0)

In [None]:
# write the combined raw dataset to disk (without the index column)
stayhome.to_csv(path_or_buf='StayHome.csv', index=False)

# Sampling raw data

In [None]:
# load the exported raw data and display its (rows, columns)
df = pd.read_csv(filepath_or_buffer="StayHome.csv")
df.shape

In [None]:
# turn the literal string '[]' into NaN, then remove duplicates:
# first fully identical rows, then rows repeating an already-seen 'Text'
df = (
    df.replace('[]', np.nan)
      .drop_duplicates()
      .drop_duplicates(subset=['Text'])
)

# divide the timestamp into a 'date' (YYYY-MM-DD) and a 'time' (HH-MM-SS) string column;
# the vectorised .dt accessor replaces the row-by-row .apply(lambda ...)
timestamps = pd.to_datetime(df['Date'])
df = (
    df.assign(date=timestamps.dt.strftime("%Y-%m-%d"),
              time=timestamps.dt.strftime("%H-%M-%S"))
      .drop(columns=['Date'])
)

# replace city initials with city names
replace_values = {'NY' : 'New York', 'BOSTON' : 'Boston', 'CHI' : 'Chicago', 'DETROIT' : 'Detroit', 'HOUSTON' : 'Houston',
                  'LA' : 'Los Angeles', 'MIA' : 'Miami', 'NEWARK' : 'Newark', 'NEW ORLEANS' : 'New Orleans', 'PHIL' : 'Philadelphia'}
df = df.replace({"City": replace_values})

In [None]:
# calculate users per day
# 'User' / 'Text': number of distinct users / distinct tweet texts on each date
user = pd.concat(
    [df.groupby('date')['User'].nunique(), df.groupby('date')['Text'].nunique()],
    axis = 1,
)
user['num_of_text_per_capita'] = user['Text'] / user['User']

# A user counts as "new" on the first date they appear in df.
# The counts are derived right here from the current df, so this cell does not
# depend on variables that are only created further down the notebook.
first_seen = df.groupby('User')['date'].min()
new_per_day = first_seen.value_counts().reindex(user.index, fill_value = 0)

# running total of distinct users seen up to and including each date
user['total_unique_user'] = new_per_day.cumsum()
user['new_user'] = new_per_day
user.tail(10)

The raw data is imbalanced in the number of users.

In [None]:
# count the cumulative number of unique users for each date
dates = sorted(df['date'].unique())

# first date on which each user appears, and how many users start on each date
first_seen = df.groupby('User')['date'].min()
new_per_day = first_seen.value_counts().reindex(dates, fill_value = 0)

# unique_user[k]: distinct users seen up to AND INCLUDING dates[k]
# (a strict '<' comparison would leave out the users of the day itself, so
# the first date would report 0 users)
unique_user = new_per_day.cumsum().tolist()

# lag_user[k]: the same total one date earlier (0 before the first date)
lag_user = ([0] + unique_user)[:-1]

In [None]:
# display the lagged cumulative user counts (one entry per date) for inspection
lag_user

In [None]:
# select '2020-04-06' as division
# existing users: users with at least one tweet dated before 04-06
# new users: users whose tweets are all dated 04-06 or later
existing_user = df[df['date'] < '2020-04-06']['User'].unique()
is_existing = df['User'].isin(existing_user)

# retain every tweet written by an existing user
df1 = df[is_existing]
# `~` is the boolean NOT for a mask (unary minus is not a logical negation)
df2 = df[~is_existing]

# down-sample the tweets written by new users (rows are sampled, not users);
# the seed makes the sample reproducible and the cap avoids a ValueError when
# fewer than 1300 such rows exist
df2 = df2.sample(n = min(1300, len(df2)), random_state = 42)

# get new sample
df = pd.concat([df1, df2])

# export new sample
df.to_csv("StayHome_final.csv", index = False)
df.shape

In [None]:
# plot the distribution of our new sample: number of tweets per date
fig, chart = plt.subplots(figsize = (12,5))
# pass the data by keyword: newer seaborn releases no longer accept it positionally as x
sns.countplot(x = df['date'], order = sorted(df['date'].unique()), ax = chart)
chart.set(xlabel = 'date', ylabel = 'number of tweets', title = 'Tweets per day in the new sample')
# rotate the date labels so they do not overlap
chart.tick_params(axis = 'x', labelrotation = 45)

In [None]:
# calculate users per day
# 'User' / 'Text': number of distinct users / distinct tweet texts on each date
user = pd.concat(
    [df.groupby('date')['User'].nunique(), df.groupby('date')['Text'].nunique()],
    axis = 1,
)
user['num_of_text_per_capita'] = user['Text'] / user['User']

# Recompute the cumulative counts from the CURRENT df: lists built from an
# earlier version of df would describe that earlier data, not this one.
# A user counts as "new" on the first date they appear in df.
first_seen = df.groupby('User')['date'].min()
new_per_day = first_seen.value_counts().reindex(user.index, fill_value = 0)

# running total of distinct users seen up to and including each date
user['total_unique_user'] = new_per_day.cumsum()
user['new_user'] = new_per_day
user.tail(10)

The new sample is balanced in the number of users.