In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import GetOldTweets3 as got
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

## **Cleaning up csvs**

In our initial scraping, certain rural areas returned **the entire planet's tweets**, so we had to remove those cities from our final dataset. We also performed the following cleaning steps:
* Remove redundant headers
* Remove rows with no text or date

In [4]:
def combine_and_clean_df(state_abbrev:str, cities_to_remove:list, data_dir:str='./data'):
    '''
    Combine every scraped csv for one state into a single cleaned csv.

    Each source file is cleaned independently and appended to
    '<data_dir>/<STATE>_full.csv', which is recreated from scratch on
    every call. Cleaning steps, in order, for each file:

    1. drop redundant header rows left over from scraping
       (rows whose 'username' value is the literal string 'username')
    2. drop rows whose 'city' is listed in `cities_to_remove`
    3. drop rows with no 'text' or no 'date'
    4. drop duplicate ('username', 'date') rows, keeping the first

    Filtering happens before de-duplication so that a tweet returned by
    both a broken city and a good city is kept for the good city rather
    than being discarded along with the broken one.

    -----------------------
    Parameters:

    'state_abbrev' : str, two-letter state abbreviation at the beginning
    of your auto-generated csv title (case-insensitive, it is upper-cased)

    'cities_to_remove' : list, A list of cities to remove from your dataset

    'data_dir' : str, directory holding the scraped csvs and receiving the
    combined csv (default './data')

    -----------------------
    Returns:

    None. The result is written to '<data_dir>/<STATE>_full.csv'.

    -----------------------
    Raises:

    FileNotFoundError if `data_dir` does not exist or holds no source csv
    for the state.
    '''

    state_prefix = state_abbrev.upper()
    output_name = f'{state_prefix}_full.csv'
    output_path = os.path.join(data_dir, output_name)

    # Getting all of a state's filenames. The combined output file also starts
    # with the state prefix, so it must be skipped or a re-run would read its
    # own previous output back in. Sorted because os.listdir order is arbitrary.
    filenames = sorted(
        filename for filename in os.listdir(data_dir)
        if filename.startswith(state_prefix) and filename != output_name
    )
    if not filenames:
        raise FileNotFoundError(
            f"No source csv files starting with '{state_prefix}' found in '{data_dir}'"
        )

    # Only the header of the first file is needed to learn the column names
    columns = pd.read_csv(os.path.join(data_dir, filenames[0]), nrows=0).columns

    # Write a clean (header-only) output so the final csv doesn't keep
    # growing if you need to run again
    pd.DataFrame(columns=columns).to_csv(output_path, index=False)

    # Main cleaning loop
    for filename in filenames:
        df = pd.read_csv(os.path.join(data_dir, filename))

        keep_rows = (
            (df['username'] != 'username')          # Removes headers leftover from scraping
            & (~df['city'].isin(cities_to_remove))  # Removes the cities that we need to remove
        )
        df = (
            df[keep_rows]
            # There were some nulls in the text and date column that are likely
            # the result of deleted/private tweets
            .dropna(subset=['text', 'date'])
            # Remove any potential duplicates from overlapping city areas
            .drop_duplicates(subset=['username', 'date'], keep='first')
        )

        # Append without a header: the header row was written once above
        df.to_csv(output_path, index=False, mode='a', header=False)

## **Cleaning Georgia Datasets**

In [5]:
# Georgia cities whose scrape has to be dropped from the combined dataset
ga_cities_to_remove = [
    'herndon',
    'sunnyside',
    'windsor',
    'rowena',
    'dublin',
]

In [6]:
# Writes ./data/GA_full.csv from every ./data file whose name starts with 'GA'
combine_and_clean_df(state_abbrev='GA', cities_to_remove=ga_cities_to_remove)

## **Cleaning Illinois Datasets**

In [7]:
# Illinois cities whose scrape has to be dropped from the combined dataset
# NOTE(review): the Illinois city is usually spelled 'Pinckneyville'; confirm
# 'pickneyville' matches the value actually stored in the 'city' column.
il_cities_to_remove = [
    'ripley',
    'island grove',
    'pleasant grove',
    'pickneyville',
]

In [8]:
# Writes ./data/IL_full.csv from every ./data file whose name starts with 'IL'
combine_and_clean_df(state_abbrev='IL', cities_to_remove=il_cities_to_remove)