In [1]:
import pandas as pd
import csv
from IPython.display import clear_output

import unicodedata
import re
import nltk
import typing_extensions
import emoji

from nltk.corpus import stopwords, wordnet
# Fetch the NLTK resources this notebook relies on. Both calls are no-ops when
# the package is already present locally.
#   - 'wordnet': corpus queried through `wordnet.synsets` in synonym_replacement.
#   - 'punkt':   tokenizer models, presumably what `nltk.word_tokenize` needs
#                (TODO confirm the resource name for the installed NLTK version).
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the raw tweet table. The path is relative, so the CSV has to sit in the
# notebook's working directory.
df_1 = pd.read_csv('df_full_sorted.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df_1.head()

Unnamed: 0,UserID,Timestamp,Latitude,Longitude,TweetText,Closest_State,Closest_City
0,USER_28e6d0a1,2010-03-04T02:50:29,33.580862,-86.956455,@USER_148a266e I gotta get you re-added to BBM!,Alabama,Adamsville
1,USER_28e6d0a1,2010-03-04T04:33:59,33.580862,-86.956455,@USER_5c07acb0 @USER_9334f9b7 ahhh yes!,Alabama,Adamsville
2,USER_28e6d0a1,2010-03-04T04:45:31,33.580862,-86.956455,@USER_2594d45f An old locksmith...,Alabama,Adamsville
3,USER_28e6d0a1,2010-03-05T00:44:08,33.580862,-86.956455,RT @USER_f1966b04: They are shooting at pentag...,Alabama,Adamsville
4,USER_28e6d0a1,2010-03-05T00:45:50,33.580862,-86.956455,@USER_80024f73 as a matter of fact... I wanna ...,Alabama,Adamsville


In [4]:
# Keep only the first occurrence of each distinct tweet text.
df2 = df_1.drop_duplicates(subset='TweetText')

In [5]:
# User identity and raw coordinates are not used downstream; remove them.
df2 = df2.drop(columns=['UserID', 'Latitude', 'Longitude'])

In [6]:
# Sanity check: UserID, Latitude and Longitude should no longer be listed.
df2.head()

Unnamed: 0,Timestamp,TweetText,Closest_State,Closest_City
0,2010-03-04T02:50:29,@USER_148a266e I gotta get you re-added to BBM!,Alabama,Adamsville
1,2010-03-04T04:33:59,@USER_5c07acb0 @USER_9334f9b7 ahhh yes!,Alabama,Adamsville
2,2010-03-04T04:45:31,@USER_2594d45f An old locksmith...,Alabama,Adamsville
3,2010-03-05T00:44:08,RT @USER_f1966b04: They are shooting at pentag...,Alabama,Adamsville
4,2010-03-05T00:45:50,@USER_80024f73 as a matter of fact... I wanna ...,Alabama,Adamsville


In [7]:
# Synonym-based text augmentation built on NLTK's WordNet interface
def synonym_replacement(text):
    """Swap every token of ``text`` for a WordNet lemma when one is available.

    The text is tokenized with ``nltk.word_tokenize``. For each token the first
    lemma of the first synset returned by ``wordnet.synsets`` is used (no
    part-of-speech or sense selection is done); underscores in multi-word
    lemmas become spaces. Tokens without any synset are kept unchanged.

    Returns:
        str: The (possibly substituted) tokens joined by single spaces.
    """
    def _first_lemma_or_self(token):
        synsets = wordnet.synsets(token)
        if not synsets:
            # Nothing known about this token: leave it as it is.
            return token
        lemma_name = synsets[0].lemmas()[0].name()
        return lemma_name.replace('_', ' ')

    tokens = nltk.word_tokenize(text)
    return ' '.join(_first_lemma_or_self(token) for token in tokens)

In [8]:
def remove_accents(text):
    """Fold accented characters to their ASCII base letters.

    The text is NFKD-decomposed so that an accented letter becomes a base
    letter followed by combining marks; everything that cannot be encoded as
    ASCII (the marks, and any other non-ASCII character) is then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is >= 128 removed."""
    ascii_only = filter(lambda ch: ord(ch) < 128, text)
    return ''.join(ascii_only)

def convert_emojis_to_text(text):
    """Return ``text`` with emojis replaced by the textual form that
    ``emoji.demojize`` produces (library defaults are used)."""
    demojized = emoji.demojize(text)
    return demojized

def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    disallowed = re.compile(r'[^A-Za-z0-9\s]')
    return disallowed.sub('', text)

def clean_tweet_text(text):
    """Normalize one raw tweet to lowercase ASCII letters, digits and whitespace.

    Pipeline, in order: stringify, strip URLs, strip @mentions, spell out
    emojis, fold accents, drop remaining non-ASCII characters, drop
    punctuation/symbols, lowercase. Hashtag words are deliberately kept; only
    the '#' itself disappears together with the other symbols.

    Args:
        text: The tweet. Non-string values are passed through ``str()``, so a
            missing value (float NaN) becomes the literal text "nan" -- drop
            missing tweets before calling this function.

    Returns:
        str: The cleaned, lowercased tweet.
    """
    text = str(text)

    # The dot after "www" is escaped on purpose: with a bare `.` the pattern
    # matched "www" + any character, so ordinary text was deleted as if it
    # were a URL (e.g. "awwww yes" was reduced to "aw").
    text = re.sub(r'http\S+|www\.\S+', '', text)   # remove URLs
    text = re.sub(r'@\w+', '', text)               # remove mentions

    # Emojis have to be spelled out BEFORE the non-ASCII / punctuation filters
    # run; afterwards there is nothing left to convert. demojize is called
    # directly (rather than through convert_emojis_to_text) because space
    # delimiters are needed: the default ':' delimiters would be stripped by
    # the punctuation filter and glue the emoji name onto neighbouring words.
    # The '_' inside a name is still stripped below, so each emoji ends up as
    # one space-separated token.
    text = emoji.demojize(text, delimiters=(' ', ' '))

    text = remove_accents(text)                    # fold accented characters
    text = remove_non_ascii(text)                  # drop what is still non-ASCII
    text = remove_special_characters(text)         # drop punctuation / symbols
    return text.lower()


In [9]:
# Drop missing tweets BEFORE cleaning. clean_tweet_text() stringifies its
# input, so a NaN would come back as the text "nan" and a dropna() placed
# after the cleaning step could never remove it.
# .assign() builds a new frame instead of writing into a possibly-sliced one.
df2 = (
    df2.dropna(subset=['TweetText'])
       .assign(TweetText=lambda frame: frame['TweetText'].apply(clean_tweet_text))
)

# Optional text augmentation (disabled):
#df2['TweetText'] = df2['TweetText'].apply(synonym_replacement)

# Renumber the rows 0..n-1. The previous row labels are kept in an 'index'
# column, which a later cell removes from df5.
df4 = df2.reset_index()

In [10]:
# Inspect the cleaned text; the extra 'index' column comes from reset_index().
df4.head()

Unnamed: 0,index,Timestamp,TweetText,Closest_State,Closest_City
0,0,2010-03-04T02:50:29,i gotta get you readded to bbm,Alabama,Adamsville
1,1,2010-03-04T04:33:59,ahhh yes,Alabama,Adamsville
2,2,2010-03-04T04:45:31,an old locksmith,Alabama,Adamsville
3,3,2010-03-05T00:44:08,rt they are shooting at pentagon metro please...,Alabama,Adamsville
4,4,2010-03-05T00:45:50,as a matter of fact i wanna ask about that,Alabama,Adamsville


In [11]:
# Region -> member states. Values must match the spelling used in the
# 'Closest_State' column, because membership is tested with `in` / `.isin()`.
# Together the tuples cover the 50 states, the District of Columbia and
# Puerto Rico.
West_Coast = ('California', 'Oregon', 'Washington')
Southwest = ('Arizona', 'New Mexico', 'Oklahoma', 'Texas')
Rockies = ('Nevada', 'Utah', 'Colorado', 'Wyoming', 'Idaho', 'Montana')
Midwest = ('North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Missouri', 'Iowa', 'Minnesota', 'Wisconsin',
           'Illinois', 'Indiana', 'Michigan', 'Ohio')
South = ('Arkansas', 'Louisiana', 'Mississippi', 'Tennessee', 'Kentucky', 'Alabama', 'Georgia', 'Florida',
         'South Carolina', 'North Carolina', 'Virginia', 'West Virginia', 'Maryland', 'Delaware',
         'District of Columbia')
# 'Connecticut' is the correct spelling. The misspelled 'Conneticut' is kept as
# well so that data already labelled with it still maps to this region.
Northeast = ('Pennsylvania', 'New Jersey', 'New York', 'Massachusetts', 'Rhode Island', 'Connecticut',
             'Conneticut', 'Vermont', 'New Hampshire', 'Maine')
NonCont = ('Hawaii', 'Alaska', 'Puerto Rico')

In [14]:
# Add the Region column to df4 in place, filled with the placeholder 'a'
# ("not assigned yet"); the labelling step tests for this exact value.
df4['Region'] = 'a'
# Work on an independent copy so df4 keeps the original Timestamp strings.
df5 = df4.copy()

In [25]:
# reset_index() kept the old row labels in an 'index' column; remove it.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df5 = df5.drop(columns=['index'], errors='ignore')

In [15]:
def time_to_seconds(time_str):
    """Convert the clock part of a ``<date>T<HH:MM:SS>`` string to seconds since midnight.

    Only the text after the first 'T' is used; the date part is ignored.

    Raises:
        IndexError: if ``time_str`` contains no 'T'.
        ValueError: if the clock part is not exactly three ':'-separated integers.
    """
    clock = time_str.split('T')[1]
    hours, minutes, seconds = (int(field) for field in clock.split(':'))
    return (hours * 60 + minutes) * 60 + seconds

In [16]:
# Convert every timestamp to seconds since midnight and label every row with
# its region.
#
# Vectorised replacement for a row-by-row `while` loop that wrote through
# chained indexing (`df5['Region'][i] = ...`). That pattern triggers
# SettingWithCopyWarning and, with pandas Copy-on-Write enabled, never updates
# df5 at all. The loop also stopped at the first unknown state, leaving every
# later row unprocessed.
imax = df4.shape[0]

region_members = {
    'West Coast': West_Coast,
    'Southwest': Southwest,
    'Rockies': Rockies,
    'Midwest': Midwest,
    'South': South,
    'Northeast': Northeast,
    'NonCont': NonCont,
}
# Flatten to a state -> region lookup so a single .map() can label all rows.
state_to_region = {
    state: region
    for region, states in region_members.items()
    for state in states
}

# Read from df4 (still holding the original strings) so that re-running this
# cell does not try to parse already-converted integers.
df5['Timestamp'] = df4['Timestamp'].map(time_to_seconds)

region_lookup = df4['Closest_State'].map(state_to_region)  # NaN for unknown states
pending = df5['Region'] == 'a'                             # rows not labelled yet
unmatched = pending & region_lookup.isna()

# Only rows still carrying the placeholder are filled in; rows whose state is
# unknown keep the placeholder 'a'.
df5['Region'] = df5['Region'].mask(pending & ~unmatched, region_lookup)

if unmatched.any():
    print('States without a region (rows keep the placeholder "a"):')
    print(df4.loc[unmatched, 'Closest_State'].value_counts(dropna=False))

print('Rows processed: ' + str(imax) + '/' + str(imax))

Rows processed: 374000/374519


In [27]:
def _rows_for_states(frame, states):
    """Return the rows of ``frame`` whose 'Closest_State' is one of ``states``."""
    return frame[frame['Closest_State'].isin(states)]


# One subset per region. The empty placeholder frames that used to be created
# first were overwritten on the very next line, so they have been removed.
df_westcoast = _rows_for_states(df5, West_Coast)
df_southwest = _rows_for_states(df5, Southwest)
df_rockies = _rows_for_states(df5, Rockies)
df_midwest = _rows_for_states(df5, Midwest)
df_south = _rows_for_states(df5, South)
df_northeast = _rows_for_states(df5, Northeast)
df_noncont = _rows_for_states(df5, NonCont)

In [29]:
# Write each regional subset to its own CSV file (row labels are not written).
regional_exports = (
    ('west_coast_tweets.csv', df_westcoast),
    ('southwest_tweets.csv', df_southwest),
    ('rockies_tweets.csv', df_rockies),
    ('midwest_tweets.csv', df_midwest),
    ('south_tweets.csv', df_south),
    ('northeast_tweets.csv', df_northeast),
    ('noncont_tweets.csv', df_noncont),
)
for csv_name, regional_frame in regional_exports:
    regional_frame.to_csv(csv_name, index=False)

In [30]:
# Write the complete labelled table (all regions) without the row labels.
df5.to_csv('All_US_tweets.csv', index = False)