# Cleaning the Florida tweets CSVs

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import GetOldTweets3 as got
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

## **Cleaning up CSVs**

In our initial scraping, certain rural areas returned **the entire planet's tweets**, so we had to remove those cities from our final dataset. We also apply the following cleaning steps:
* Remove redundant headers
* Remove rows with no text or date

In [2]:
def combine_and_clean_shutdown_df(state_abbrev:str, cities_to_remove:list, data_dir:str='./data/shutdown_data/'):
    '''
    Combine and clean a state's shutdown csvs after they were scraped
    from Twitter, writing the result to `<STATE>_full_ignore.csv` inside
    `data_dir`.

    Each input file is cleaned on its own, in this order:
    * drop duplicate (username, date) rows, keeping the first one
    * drop leftover header rows (username == 'username')
    * drop rows whose city is in `cities_to_remove`
    * drop rows with no text or date

    Duplicates are only removed within a file, not across files.

    The output file is rebuilt from scratch on every call and is never
    used as an input, so running the function again does not duplicate rows.

    -----------------------
    Parameters:

    'state_abbrev' : str, two-letter state abbreviation at the beginning
    of your auto-generated csv title (upper-cased before matching)

    'cities_to_remove' : list, A list of cities to remove from your dataset

    'data_dir' : str, folder holding the scraped csvs; the combined csv is
    written to the same folder

    -----------------------
    Raises:

    FileNotFoundError if no input csv in `data_dir` starts with the state
    abbreviation
    '''

    prefix = state_abbrev.upper()
    output_name = f'{prefix}_full_ignore.csv'
    output_path = os.path.join(data_dir, output_name)

    # The output shares the state prefix, so a previous run's output would
    # otherwise be picked up as an input and appended to itself.
    # Sorting makes the file order (and the header source) deterministic.
    filenames = sorted(
        filename for filename in os.listdir(data_dir)
        if filename.startswith(prefix) and filename != output_name
    )
    if not filenames:
        raise FileNotFoundError(
            f"No input csv files starting with '{prefix}' found in {data_dir}"
        )

    # Only the header of the first file is needed to define the output columns
    header_columns = pd.read_csv(os.path.join(data_dir, filenames[0]), nrows=0).columns

    # Start from a header-only file so the output never keeps rows from an earlier run
    pd.DataFrame(columns=header_columns).to_csv(output_path, index=False)

    # Main cleaning loop
    for filename in filenames:
        raw = pd.read_csv(os.path.join(data_dir, filename))

        # need to remove any potential duplicates from overlapping city areas
        deduped = raw.drop_duplicates(subset=['username', 'date'], keep='first')

        keep_rows = (
            (deduped['username'] != 'username')           # Removes headers leftover from scraping
            & (~deduped['city'].isin(cities_to_remove))   # Removes the cities that we need to remove
        )
        # Nulls in text/date are likely the result of deleted/private tweets
        cleaned = deduped[keep_rows].dropna(subset=['text', 'date'])

        # Rows are appended without a header, i.e. positionally, so force
        # every file into the column order of the header written above.
        cleaned = cleaned.reindex(columns=header_columns)
        cleaned.to_csv(output_path, index=False, mode='a', header=False)

In [3]:
def combine_and_clean_reopening_df(state_abbrev:str, cities_to_remove:list, data_dir:str='./data/reopening_data/'):
    '''
    Combine and clean a state's reopening csvs after they were scraped
    from Twitter, writing the result to `<STATE>_full.csv` inside
    `data_dir`.

    Each input file is cleaned on its own, in this order:
    * drop duplicate (username, date) rows, keeping the first one
    * drop leftover header rows (username == 'username')
    * drop rows whose city is in `cities_to_remove`
    * drop rows with no text or date

    Duplicates are only removed within a file, not across files.

    The output file is rebuilt from scratch on every call and is never
    used as an input: an existing `<STATE>_full.csv` in `data_dir` is
    overwritten, not cleaned. Put the raw csvs under any other name that
    starts with the state abbreviation.

    -----------------------
    Parameters:

    'state_abbrev' : str, two-letter state abbreviation at the beginning
    of your auto-generated csv title (upper-cased before matching)

    'cities_to_remove' : list, A list of cities to remove from your dataset

    'data_dir' : str, folder holding the scraped csvs; the combined csv is
    written to the same folder

    -----------------------
    Raises:

    FileNotFoundError if no input csv in `data_dir` (other than the output
    file itself) starts with the state abbreviation
    '''

    prefix = state_abbrev.upper()
    output_name = f'{prefix}_full.csv'
    output_path = os.path.join(data_dir, output_name)

    # The output shares the state prefix. Without this exclusion it would be
    # truncated to a header and then read back as an (empty or half-written)
    # input, depending on directory listing order.
    # Sorting makes the file order (and the header source) deterministic.
    filenames = sorted(
        filename for filename in os.listdir(data_dir)
        if filename.startswith(prefix) and filename != output_name
    )
    if not filenames:
        raise FileNotFoundError(
            f"No input csv files starting with '{prefix}' (other than {output_name}) found in {data_dir}"
        )

    # Only the header of the first file is needed to define the output columns
    header_columns = pd.read_csv(os.path.join(data_dir, filenames[0]), nrows=0).columns

    # Start from a header-only file so the output never keeps rows from an earlier run
    pd.DataFrame(columns=header_columns).to_csv(output_path, index=False)

    # Main cleaning loop
    for filename in filenames:
        raw = pd.read_csv(os.path.join(data_dir, filename))

        # need to remove any potential duplicates from overlapping city areas
        deduped = raw.drop_duplicates(subset=['username', 'date'], keep='first')

        keep_rows = (
            (deduped['username'] != 'username')           # Removes headers leftover from scraping
            & (~deduped['city'].isin(cities_to_remove))   # Removes the cities that we need to remove
        )
        # Nulls in text/date are likely the result of deleted/private tweets
        cleaned = deduped[keep_rows].dropna(subset=['text', 'date'])

        # Rows are appended without a header, i.e. positionally, so force
        # every file into the column order of the header written above.
        cleaned = cleaned.reindex(columns=header_columns)
        cleaned.to_csv(output_path, index=False, mode='a', header=False)

# Cleaning Shutdown Florida Dataset

In [4]:
# Load the raw Florida shutdown csv and preview the first row.
fixing_df = pd.read_csv('./data/FL_full_shutdown.csv')
fixing_df.head(1)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,hashtags,mentions,geo,urls,city,query,date_range,state,month,day
0,Venom_PT,,BLACK PEOPLE WAKE UP!!! in my Mookie voice fro...,0,0,0,1247675460638605314,https://twitter.com/Venom_PT/status/1247675460...,36584184,2020-04-07 23:59:45+00:00,...,,,,https://www.instagram.com/p/B-su-E6pEI7AWoUKVP...,miami,get OR one OR time OR people OR day OR know OR...,"('2020-03-25', '2020-04-08')",FL,4,7


In [5]:
# Save a copy into shutdown_data/ under the 'FL' prefix, which is where
# combine_and_clean_shutdown_df looks for its input files.
fixing_df.to_csv('./data/shutdown_data/FL_full.csv', index=False)

In [6]:
# Florida has no cities to remove, but the cleaning function still takes
# a list of cities, so pass an empty one to run the remaining steps.
fl_cities_to_remove = []

In [7]:
# Clean the FL* csvs in shutdown_data/; the combined result is written to
# ./data/shutdown_data/FL_full_ignore.csv.
combine_and_clean_shutdown_df('FL', fl_cities_to_remove)

In [8]:
# Reload the shutdown data from disk for the remaining clean-up steps.
# NOTE(review): this reads FL_full.csv (the copy saved earlier in this section),
# not the FL_full_ignore.csv written by combine_and_clean_shutdown_df, so the
# function's cleaning is not applied to this frame — confirm that is intended.
df = pd.read_csv('./data/shutdown_data/FL_full.csv')
df.head(1) # check whether the column names shifted

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,hashtags,mentions,geo,urls,city,query,date_range,state,month,day
0,Venom_PT,,BLACK PEOPLE WAKE UP!!! in my Mookie voice fro...,0,0,0,1247675460638605314,https://twitter.com/Venom_PT/status/1247675460...,36584184,2020-04-07 23:59:45+00:00,...,,,,https://www.instagram.com/p/B-su-E6pEI7AWoUKVP...,miami,get OR one OR time OR people OR day OR know OR...,"('2020-03-25', '2020-04-08')",FL,4,7


In [9]:
# Drop rows that have no tweet text (mutates df in place).
df.dropna(subset=['text'], inplace=True)

In [10]:
# Parse the raw date strings into timezone-aware UTC timestamps.
# errors='coerce' turns anything unparseable into NaT instead of raising.
df['datetime'] = pd.to_datetime(df['date'], errors='coerce', utc=True)

In [11]:
# Inspect the parsed timestamps.
df['datetime']

0       2020-04-07 23:59:45+00:00
1       2020-04-07 23:58:54+00:00
2       2020-04-07 23:58:52+00:00
3       2020-04-07 23:58:50+00:00
4       2020-04-07 23:58:01+00:00
                   ...           
81103   2020-04-04 19:31:16+00:00
81104   2020-04-04 19:31:16+00:00
81105   2020-04-04 19:31:16+00:00
81106   2020-04-04 19:31:15+00:00
81107   2020-04-04 19:31:06+00:00
Name: datetime, Length: 81093, dtype: datetime64[ns, UTC]

In [12]:
# Build a month+day key formatted with '%m%d' (two-digit month, two-digit day).
# NOTE(review): rows whose datetime was coerced to NaT presumably end up with a
# missing month_day — confirm before grouping on this column.
df['month_day'] = df['datetime'].dt.strftime('%m%d')

In [13]:
# Check the column dtypes after adding datetime and month_day.
df.dtypes

username                       object
to                             object
text                           object
retweets                       object
favorites                      object
replies                        object
id                             object
permalink                      object
author_id                      object
date                           object
formatted_date                 object
hashtags                       object
mentions                       object
geo                            object
urls                           object
city                           object
query                          object
date_range                     object
state                          object
month                          object
day                            object
datetime          datetime64[ns, UTC]
month_day                      object
dtype: object

In [14]:
# Preview the first row, now including the datetime and month_day columns.
df.head(1)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,geo,urls,city,query,date_range,state,month,day,datetime,month_day
0,Venom_PT,,BLACK PEOPLE WAKE UP!!! in my Mookie voice fro...,0,0,0,1247675460638605314,https://twitter.com/Venom_PT/status/1247675460...,36584184,2020-04-07 23:59:45+00:00,...,,https://www.instagram.com/p/B-su-E6pEI7AWoUKVP...,miami,get OR one OR time OR people OR day OR know OR...,"('2020-03-25', '2020-04-08')",FL,4,7,2020-04-07 23:59:45+00:00,407


In [15]:
# Save the frame with the added datetime and month_day columns,
# overwriting ./data/shutdown_data/FL_full.csv.
df.to_csv('./data/shutdown_data/FL_full.csv', index=False)

# Cleaning Reopening Florida Dataset

In [16]:
# Load the raw Florida reopening csv and preview the first row.
fixing_df = pd.read_csv('./data/FL_full_reopening copy.csv')
fixing_df.head(1)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,hashtags,mentions,geo,urls,city,query,date_range,state,month,day
0,NeverSweatHoes,,No person is your friend who demands your sile...,0,1,0,1257822343180795904,https://twitter.com/NeverSweatHoes/status/1257...,1028834407954767873,2020-05-05 23:59:50+00:00,...,,,,,miami,get OR one OR time OR people OR day OR know OR...,"('2020-04-22', '2020-05-06')",FL,5,5


In [17]:
# Save a copy into reopening_data/ under the 'FL' prefix.
# NOTE(review): FL_full.csv is also the path combine_and_clean_reopening_df
# writes its output to, so that function overwrites this copy — confirm this
# file is meant to be one of its inputs.
fixing_df.to_csv('./data/reopening_data/FL_full.csv', index=False)

In [18]:
# I don't have any cities to remove but still need to complete these steps 
fl_cities_to_remove = []

In [19]:
# Clean the FL* csvs in reopening_data/; the combined result is written to
# ./data/reopening_data/FL_full.csv, replacing whatever was at that path.
combine_and_clean_reopening_df('FL', fl_cities_to_remove)

In [20]:
# Reload the combined reopening file that combine_and_clean_reopening_df wrote.
df = pd.read_csv('./data/reopening_data/FL_full.csv')
df.head(3) # check whether the column names shifted

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,hashtags,mentions,geo,urls,city,query,date_range,state,month,day
0,NeverSweatHoes,,No person is your friend who demands your sile...,0,1,0,1257822343180795904,https://twitter.com/NeverSweatHoes/status/1257...,1028834407954767873,2020-05-05 23:59:50+00:00,...,,,,,miami,get OR one OR time OR people OR day OR know OR...,"('2020-04-22', '2020-05-06')",FL,5,5
1,thepinkkush,,Damn they really been on a hate white ppl trai...,0,0,0,1257822176708898819,https://twitter.com/thepinkkush/status/1257822...,855427481200996352,2020-05-05 23:59:10+00:00,...,,,,,miami,get OR one OR time OR people OR day OR know OR...,"('2020-04-22', '2020-05-06')",FL,5,5
2,_____TeReSa___,hartluck,Take me!!! I need that ride right now,0,0,0,1257822159898054656,https://twitter.com/_____TeReSa___/status/1257...,369885976,2020-05-05 23:59:06+00:00,...,,,,,miami,get OR one OR time OR people OR day OR know OR...,"('2020-04-22', '2020-05-06')",FL,5,5


In [21]:
# Double-check the number of nulls in each column of the reopening data.
df.isna().sum()

username              0
to                37713
text                  0
retweets              0
favorites             0
replies               0
id                    0
permalink             0
author_id             0
date                  0
formatted_date        0
hashtags          58443
mentions          60643
geo               69410
urls              50440
city                  0
query                 0
date_range            0
state                 0
month                 0
day                   0
dtype: int64

In [22]:
# Parse the raw date strings into timezone-aware UTC timestamps.
# NOTE(review): unlike the shutdown section, errors='coerce' is not passed here,
# so an unparseable date would raise instead of becoming NaT — confirm intended.
df['datetime'] = pd.to_datetime(df['date'], utc=True)

In [23]:
# Inspect the parsed timestamps.
df['datetime']

0       2020-05-05 23:59:50+00:00
1       2020-05-05 23:59:10+00:00
2       2020-05-05 23:59:06+00:00
3       2020-05-05 23:59:02+00:00
4       2020-05-05 23:58:57+00:00
                   ...           
69405   2020-05-01 17:21:35+00:00
69406   2020-05-01 17:21:19+00:00
69407   2020-05-01 17:21:14+00:00
69408   2020-05-01 17:21:13+00:00
69409   2020-05-01 17:21:10+00:00
Name: datetime, Length: 69410, dtype: datetime64[ns, UTC]

In [24]:
# Build a month+day key formatted with '%m%d' (two-digit month, two-digit day).
df['month_day'] = df['datetime'].dt.strftime('%m%d')

In [25]:
# Preview the first rows, now including the datetime and month_day columns.
df.head(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,geo,urls,city,query,date_range,state,month,day,datetime,month_day
0,NeverSweatHoes,,No person is your friend who demands your sile...,0,1,0,1257822343180795904,https://twitter.com/NeverSweatHoes/status/1257...,1028834407954767873,2020-05-05 23:59:50+00:00,...,,,miami,get OR one OR time OR people OR day OR know OR...,"('2020-04-22', '2020-05-06')",FL,5,5,2020-05-05 23:59:50+00:00,505
1,thepinkkush,,Damn they really been on a hate white ppl trai...,0,0,0,1257822176708898819,https://twitter.com/thepinkkush/status/1257822...,855427481200996352,2020-05-05 23:59:10+00:00,...,,,miami,get OR one OR time OR people OR day OR know OR...,"('2020-04-22', '2020-05-06')",FL,5,5,2020-05-05 23:59:10+00:00,505
2,_____TeReSa___,hartluck,Take me!!! I need that ride right now,0,0,0,1257822159898054656,https://twitter.com/_____TeReSa___/status/1257...,369885976,2020-05-05 23:59:06+00:00,...,,,miami,get OR one OR time OR people OR day OR know OR...,"('2020-04-22', '2020-05-06')",FL,5,5,2020-05-05 23:59:06+00:00,505


In [26]:
# Save the frame with the added datetime and month_day columns,
# overwriting ./data/reopening_data/FL_full.csv.
df.to_csv('./data/reopening_data/FL_full.csv', index=False)