In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load the dataset from tweets.csv
df = pd.read_csv('tweets.csv')

# Keep only the 'Text' column
df = df[['Text']]

# Remove rows with missing text
df.dropna(subset=['Text'], inplace=True)

# Remove duplicate rows based on 'Text'
df.drop_duplicates(subset=['Text'], inplace=True)

# Define a set of negative words to retain during stopword removal
negative_words = {"not", "no", "nor", "never", "nothing", "nowhere",
                  "neither", "cannot", "n't", "without", "barely", "hardly", "scarcely"}

def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove special characters, numbers, and punctuations (keeping only letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords but retain the defined negative words
    tokens = [word for word in tokens if word not in stopwords.words('english') or word in negative_words]
    # Join tokens back into a cleaned string
    return ' '.join(tokens)

# Apply the cleaning function to the 'Text' column
df['cleaned_text'] = df['Text'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print(df.head())

# Save the cleaned dataset to a new CSV file (optional)
df.to_csv('preprocessed_tweets.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                                Text  \
0  Excited to have Larry Ellison &amp; Kathleen W...   
1  When one of the interns working on the simulat...   
2                           @5AllanLeVito Got it 😀🇺🇦   
3  Yes. Supercharger coverage will extend to 100%...   
4                     @demishassabis Congratulations   

                                        cleaned_text  
0  excited larry ellison amp kathleen wilsonthomp...  
1          one interns working simulation drops ball  
2                                                got  
3  yes supercharger coverage extend europe next y...  
4                                    congratulations  
