In [1]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from termcolor import colored
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
# Fetch the NLTK corpora used below: WordNet (for the lemmatizer) and the
# stopword lists. The last download's return value is left as the cell output.
download_results = [nltk.download(resource) for resource in ('wordnet', 'stopwords')]
download_results[-1]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Import datasets
# The training frame must come from the training file; it previously read
# '/content/test.csv', which made train_data an exact copy of test_data.
# NOTE(review): assumes the training file is uploaded as /content/train.csv,
# alongside test.csv — confirm the actual file name.
print("Loading data")
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

Loading data


In [4]:
# Build the English stopword set, then take "not" back out of it so that the
# negation survives stopword filtering in clean_tweet.
STOPWORDS = set()
STOPWORDS.update(stopwords.words('english'))
STOPWORDS.remove("not")

In [5]:
# Preprocessing function
def clean_tweet(data):
    """Add a normalised ``Clean_tweet`` column built from ``data['Tweet']``.

    Steps, in order: strip @handles and URLs, drop every character that is
    not an ASCII letter, apostrophe or space, lower-case, expand negative
    contractions ("don't" -> "do not"), drop single-character tokens, remove
    stopwords, lemmatize, stem, and join the tokens back with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Tweet`` column. Missing tweets are treated as empty
        strings and produce an empty ``Clean_tweet``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object; the ``Clean_tweet`` column is added (or
        overwritten) in place.

    Raises
    ------
    KeyError
        If ``data`` has no ``Tweet`` column.

    Notes
    -----
    Relies on the module-level ``STOPWORDS`` set and prints coloured progress
    messages via ``termcolor.colored``.
    """
    # Initialize Lemmatizer and Stemmer
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Define regular expressions
    user_handle_pattern = r"@[\w]*"
    url_pattern = r"(www\.[^\s]+)|(https?://[^\s]+)"
    special_char_pattern = r"[^a-zA-Z' ]"
    # Lookarounds instead of consuming the neighbouring spaces: otherwise, of
    # two adjacent single characters ("r u ok"), only the first would match.
    single_char_pattern = r"(?<!\S)\S(?!\S)"

    # Define contractions. Irregular forms come first; the generic "n't" rule
    # inserts a space so "don't" becomes "do not" rather than "donot".
    contractions = {
        "can't": "can not",
        "won't": "will not",
        "shan't": "shall not",
        "n't": " not",
    }

    # Function to expand contractions in a (lower-cased) tweet string
    def expand_contractions(tweet):
        for contraction, replacement in contractions.items():
            tweet = tweet.replace(contraction, replacement)
        return tweet

    # Function to filter stopwords, then lemmatize and stem each kept token
    def normalize_tokens(tokens):
        return ' '.join(
            stemmer.stem(lemmatizer.lemmatize(word))
            for word in tokens
            if word not in STOPWORDS
        )

    # Cleaning process
    print(colored("Starting tweet preprocessing...", "yellow"))

    # Missing tweets would otherwise become NaN tokens and break the
    # per-token processing below.
    tweets = data['Tweet'].fillna("").astype(str)

    # Remove user handles
    tweets = tweets.str.replace(user_handle_pattern, "", regex=True)

    # Remove URLs
    tweets = tweets.str.replace(url_pattern, "", regex=True)

    # Normalise typographic apostrophes so contractions keep their apostrophe
    # through the special-character filter.
    tweets = tweets.str.replace("\u2019", "'", regex=False)

    # Remove special characters and numbers
    tweets = tweets.str.replace(special_char_pattern, "", regex=True)

    # Lower-case, then expand contractions BEFORE stopword removal: the
    # stopword list itself contains forms such as "don't" and "isn't", so
    # filtering first would silently discard the negation.
    tweets = tweets.str.lower().map(expand_contractions)

    # Remove single characters
    tweets = tweets.str.replace(single_char_pattern, " ", regex=True)

    # Tokenize, drop stopwords, lemmatize + stem, and recombine into a string
    data['Clean_tweet'] = tweets.str.split().map(normalize_tokens)

    print(colored("Tweet preprocessing complete!", "green"))
    return data

In [7]:
# Preprocess and save cleaned data
# The status messages are built from the real output paths; they previously
# reported "data/clean_*.csv" while the files were written under /content.
CLEAN_TRAIN_PATH = '/content/clean_train.csv'
CLEAN_TEST_PATH = '/content/clean_test.csv'

print(colored("Processing train data...", "blue"))
train_data = clean_tweet(train_data)
train_data.to_csv(CLEAN_TRAIN_PATH, index=False)
print(colored(f"Train data processed and saved to {CLEAN_TRAIN_PATH}", "green"))

print(colored("Processing test data...", "blue"))
test_data = clean_tweet(test_data)
test_data.to_csv(CLEAN_TEST_PATH, index=False)
print(colored(f"Test data processed and saved to {CLEAN_TEST_PATH}", "green"))

Processing train data...
Starting tweet preprocessing...
Tweet preprocessing complete!
Train data processed and saved to data/clean_train.csv
Processing test data...
Starting tweet preprocessing...
Tweet preprocessing complete!
Test data processed and saved to data/clean_test.csv
