## Doing some extra cleaning to the tweets

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Stop-word vocabulary: NLTK's English list plus 'also'.
stop_words = set(stopwords.words('english'))
stop_words.add('also')

# One DataFrame per exported CSV file.
parking = pd.read_csv('parking_final.csv')
parkinglot = pd.read_csv('parkinglot_final.csv')
parkinggarage = pd.read_csv('parkinggarage_final.csv')

# Lower-case the free-text columns of every dataset. Cells that are not
# strings are left untouched.
for frame in (parking, parkinglot, parkinggarage):
    for col in ['text', 'tweet_hashtags', 'location']:
        frame[col] = frame[col].apply(
            lambda x: x.lower() if isinstance(x, str) else x
        )
    

In [4]:
def remove_stop_words(text):
    """Remove stop words from every string in an iterable of texts.

    Each string is tokenized with ``word_tokenize``, tokens found in the
    module-level ``stop_words`` set are dropped, and the remaining tokens
    are re-joined with single spaces. Because the result is rebuilt from
    tokens, the original spacing is not preserved.

    Args:
        text: Iterable of values (for example a pandas Series). Items that
            are not strings are passed through unchanged.

    Returns:
        list: One entry per input item, in the same order.
    """
    filtered_text = []
    for t in text:
        if not isinstance(t, str):
            # Keep non-text cells as they are so positions stay aligned
            # with the input.
            filtered_text.append(t)
            continue
        kept = [word for word in word_tokenize(t) if word not in stop_words]
        filtered_text.append(" ".join(kept))
    return filtered_text

In [5]:
# Strip stop words from the tweet text of each dataset.
for frame in (parking, parkinglot, parkinggarage):
    frame['text'] = remove_stop_words(frame['text'])

In [6]:
def remove_stopwords_from_hashtags(hashtags):
    """Remove stop-word hashtags from every hashtag string in an iterable.

    Each string is split on single spaces, leading ``#`` characters are
    stripped from every token, tokens found in the module-level
    ``stop_words`` set are dropped, and the survivors are re-prefixed with
    one ``#`` and joined with single spaces.

    Tokens that are empty once the ``#`` is stripped (from an empty string,
    consecutive spaces, or a lone ``#``) are dropped as well, so the result
    never contains a bare ``#``.

    Args:
        hashtags: Iterable of values (for example a pandas Series). Items
            that are not strings are passed through unchanged.

    Returns:
        list: One entry per input item, in the same order.
    """
    filtered_hashtags = []
    for ht in hashtags:
        if not isinstance(ht, str):
            # Keep non-text cells as they are so positions stay aligned
            # with the input.
            filtered_hashtags.append(ht)
            continue
        tags = (token.lstrip('#') for token in ht.split(' '))
        # ``tag and ...`` discards empty tokens before the stop-word lookup.
        kept = ['#' + tag for tag in tags if tag and tag not in stop_words]
        filtered_hashtags.append(' '.join(kept))
    return filtered_hashtags

In [7]:
# Drop stop-word hashtags from the hashtag column of each dataset.
for frame in (parking, parkinglot, parkinggarage):
    frame['tweet_hashtags'] = remove_stopwords_from_hashtags(frame['tweet_hashtags'])

In [8]:
def remove_punctuation(text):
    """Strip ASCII punctuation from every string in an iterable of texts.

    Every character listed in ``string.punctuation`` is deleted; all other
    characters, including whitespace, are kept.

    Args:
        text: Iterable of values (for example a pandas Series). Items that
            are not strings are passed through unchanged.

    Returns:
        list: One entry per input item, in the same order.
    """
    # Deletion table built once per call; translate() then removes every
    # punctuation character in a single pass over each string.
    strip_table = str.maketrans('', '', string.punctuation)
    return [
        t.translate(strip_table) if isinstance(t, str) else t
        for t in text
    ]

In [9]:
# Strip punctuation from the tweet text of each dataset.
for frame in (parking, parkinglot, parkinggarage):
    frame['text'] = remove_punctuation(frame['text'])

In [11]:
# Some extra removals: the control characters \x91-\x99 and the '·'
# character, none of which are part of string.punctuation.
remove_these = ['\x99', '\x98', '\x97', '\x96', '\x95', '\x94', '\x93', '\x92', '\x91', '·']
remove_dict = {ord(rm): '' for rm in remove_these}

# Delete those characters from the tweet text of each dataset; cells that
# are not strings are left untouched.
for frame in (parking, parkinglot, parkinggarage):
    frame['text'] = frame['text'].apply(
        lambda x: x.translate(remove_dict) if isinstance(x, str) else x
    )

In [13]:
#parking.to_csv('parking_final_cleaned.csv', index=False)
#parkinglot.to_csv('parkinglot_final_cleaned.csv', index=False)
#parkinggarage.to_csv('parkinggarage_final_cleaned.csv', index=False)

## Combining the datasets and limiting the amount of data

In [None]:
# Stack the three datasets and replace the repeated per-file index with a
# fresh one.
combined = (
    pd.concat([parking, parkinglot, parkinggarage])
    .reset_index()
    .drop(columns=['index'])
)
combined['time'] = pd.to_datetime(combined['time'])
# Removing duplicate tweets
combined = combined.drop_duplicates(subset=['time', 'user_id', 'text'])
# Taking only the data from 2020-01-01 onwards
combined = combined[combined['time'] >= pd.to_datetime('2020-01-01')]

In [None]:
# Write the combined, de-duplicated data to disk without the index column.
combined.to_csv('all_hashtags_combined.csv',index=False)

In [None]:
# Also limiting the data for each hashtag data file
for frame in (parking, parkinglot, parkinggarage):
    frame['time'] = pd.to_datetime(frame['time'])

# Keep only the rows dated 2020-01-01 or later.
parking = parking.loc[parking['time'] >= pd.to_datetime('2020-01-01')]
parkinglot = parkinglot.loc[parkinglot['time'] >= pd.to_datetime('2020-01-01')]
parkinggarage = parkinggarage.loc[parkinggarage['time'] >= pd.to_datetime('2020-01-01')]

# Saving the hashtag files
for frame, path in (
    (parking, 'parking_from_january.csv'),
    (parkinglot, 'parkinglot_from_january.csv'),
    (parkinggarage, 'parkinggarage_from_january.csv'),
):
    frame.to_csv(path, index=False)