In [1]:
!pip install emoji




In [2]:
import re
import unicodedata
import nltk
import emoji
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('wordnet')
nltk.download('punkt')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load dataframe
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/pfas_project/data/df_full_sorted.csv')
df.head()

Unnamed: 0,UserID,Timestamp,Latitude,Longitude,TweetText,Closest_State,Closest_City
0,USER_28e6d0a1,2010-03-04T02:50:29,33.580862,-86.956455,@USER_148a266e I gotta get you re-added to BBM!,Alabama,Adamsville
1,USER_28e6d0a1,2010-03-04T04:33:59,33.580862,-86.956455,@USER_5c07acb0 @USER_9334f9b7 ahhh yes!,Alabama,Adamsville
2,USER_28e6d0a1,2010-03-04T04:45:31,33.580862,-86.956455,@USER_2594d45f An old locksmith...,Alabama,Adamsville
3,USER_28e6d0a1,2010-03-05T00:44:08,33.580862,-86.956455,RT @USER_f1966b04: They are shooting at pentag...,Alabama,Adamsville
4,USER_28e6d0a1,2010-03-05T00:45:50,33.580862,-86.956455,@USER_80024f73 as a matter of fact... I wanna ...,Alabama,Adamsville


In [4]:
# Function for synonym replacement using NLTK
def synonym_replacement(text):
    words = nltk.word_tokenize(text)  # Tokenize the text
    new_words = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:  # If synonyms exist
            # Get the first synonym's lemma
            synonym = synonyms[0].lemmas()[0].name()
            new_words.append(synonym.replace('_', ' '))  # Replace underscores in multi-word synonyms
        else:
            new_words.append(word)  # Keep the original word if no synonym found
    return ' '.join(new_words)

In [5]:
# Remove Duplicates
df = df.drop_duplicates(subset=['TweetText'])

# Limit Tweets Per User
max_tweets_per_user = 100
df = df.groupby('UserID').head(max_tweets_per_user).reset_index(drop=True)

# Remove Cities With Less Than 50 Tweets
city_counts = df['Closest_City'].value_counts()
threshold = 50  # Set a threshold, e.g., cities with less than 50 tweets
frequent_cities = city_counts[city_counts >= threshold].index

df = df[df['Closest_City'].isin(frequent_cities)]

# Drop columns
df = df.drop(['UserID', 'Timestamp', 'Latitude', 'Longitude'], axis=1)

df.head()

Unnamed: 0,TweetText,Closest_State,Closest_City
9,RT @USER_50b86a99: my nuts.. your mouth.&lt; #...,Alabama,Athens
10,"@USER_0ed17a6b yea mi mommy is buyn my ticket,...",Alabama,Athens
11,#irefuse to let a jawn get outta pocket! I run...,Alabama,Athens
12,#irefuse to fall in love wit dem! I jus spit g...,Alabama,Athens
13,@USER_2fd07650 yea dats him. Add da bul,Alabama,Athens


In [6]:
min_city_size = df['Closest_City'].value_counts().min()

balanced_df = df.groupby('Closest_City').apply(lambda x: x.sample(min_city_size)).reset_index(drop=True)

  balanced_df = df.groupby('Closest_City').apply(lambda x: x.sample(min_city_size)).reset_index(drop=True)


In [8]:
def remove_accents(text):
    # Normalize the text to decompose accented characters into their base forms
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return text

def remove_non_ascii(text):
    # Keep only ASCII characters
    return ''.join([char for char in text if ord(char) < 128])

def remove_special_characters(text):
    # Remove any special characters that are not letters, numbers, or spaces
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

def convert_emojis_to_text(text):
    # Convert emojis to their text representation
    return emoji.demojize(text)

def clean_tweet_text(text):
    # Convert to string to handle non-string values
    text = str(text)
    text = re.sub(r'http\S+|www.\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)              # remove mentions
    text = re.sub(r'#\w+', '', text)              # remove hashtags
    text = remove_accents(text)                   # Normalize accented characters
    text = remove_non_ascii(text)                 # Remove non-ASCII characters
    text = remove_special_characters(text)        # Remove special characters
    text = convert_emojis_to_text(text)          # Convert emojis to text
    return text.lower()                            # Lowercase text

# Apply the cleaning function to the TweetText column
balanced_df['TweetText'] = balanced_df['TweetText'].apply(clean_tweet_text)

# Apply text augmentation
balanced_df['TweetText'] = balanced_df['TweetText'].apply(synonym_replacement)

# Remove rows with NaN values in 'TweetText'
df = df.dropna(subset=['TweetText'])

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to the top 5000 features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['TweetText'])

# Convert TF-IDF matrix to a DataFrame (optional)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the cleaned data
balanced_df.head()

print(tfidf_df.head())

    00  000  005483   01   02   03   04  040127   05   06  ...  yuuup  zero  \
0  0.0  0.0     0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  ...    0.0   0.0   
1  0.0  0.0     0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  ...    0.0   0.0   
2  0.0  0.0     0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  ...    0.0   0.0   
3  0.0  0.0     0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  ...    0.0   0.0   
4  0.0  0.0     0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  ...    0.0   0.0   

   zone  zoo   ªã   ²à   ºâ   à¹  ì²ì   ï¼  
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 5000 columns]


In [9]:
balanced_df.to_csv('/content/drive/MyDrive/Colab Notebooks/pfas_project/data/balanced2_df.csv', index=False)