In [None]:
import pandas as pd

# Load the tweet dataset, decoding the file as ISO-8859-1 and keeping only
# the "text" and "sentiment" columns.
df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]

import nltk
# Fetch the NLTK data packages used by the preprocessing code in this cell:
# the stopword lists and the WordNet corpus.
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob

# Lower-case English contractions mapped to their expanded forms.
# Insertion order is kept as-is because the keys are joined, in order, into a
# regex alternation by expand_contractions_fun.
contractions = {
    # negations and first/second/third-person "be"
    "don't": "do not",
    "doesn't": "does not",
    "can't": "cannot",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    "couldn't": "could not",
    "shouldn't": "should not",
    # "have"
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    # "will"
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    # "is" after pronouns / question words
    "there's": "there is",
    "that's": "that is",
    "what's": "what is",
    "who's": "who is",
}

# Ordered (regex, replacement token) rules for text emoticons. The patterns
# are applied one after another, in this order, by the preprocessing code.
# NOTE(review): these patterns are unanchored, so e.g. "(:3)" also matches
# inside a clock time such as "5:30" -- confirm whether that is acceptable.
_EMOTICON_RULES = (
    (r"(:-\)|:\)|=\)|:\]|=])", "SMILE"),      # :-)  :)  =)  :]  =]
    (r"(;-?\)|;-?\])", "WINK"),               # ;)  ;-)  ;]  ;-]
    (r"(:D|=D|;D)", "LAUGH"),                 # :D  =D  ;D
    (r"(:\(|:-\(|=\[|:\[)", "SAD"),           # :(  :-(  =[  :[
    (r"(:\/|:-\/)", "SKEPTICAL"),             # :/  :-/
    (r"(<3)", "HEART"),                       # <3
    (r"(:3)", "CUTE"),                        # :3
    (r"(:P|:p|:-P|:-p|=P)", "PLAYFUL"),       # :P  :p  :-P  :-p  =P
    (r"(:=)", "CONFUSED"),                    # :=
)

# Mapping form of the rules above; dict preserves the tuple's order.
emoticon_dict = dict(_EMOTICON_RULES)

def expand_contractions_fun(text):
    """Replace every contraction listed in ``contractions`` with its expansion.

    Matching is case-sensitive and bounded by ``\\b`` on both sides, so only
    whole occurrences of the (lower-case) keys are rewritten.
    """
    alternatives = '|'.join(map(re.escape, contractions))
    matcher = re.compile(r'\b(' + alternatives + r')\b')

    def _expanded(match):
        # The matched text is always one of the dictionary keys.
        return contractions[match.group()]

    return matcher.sub(_expanded, text)

def reduce_elongation_fun(word):
    """Collapse runs of 3+ identical characters down to exactly two.

    Used to normalise expressive lengthening ("sooooo" -> "soo",
    "!!!!" -> "!!"). Runs of digits are left alone so that numbers keep
    their value ("1000" stays "1000" instead of becoming "100").

    Args:
        word: The string to normalise.

    Returns:
        The string with every over-long non-digit character run shortened
        to two characters.
    """
    # [^\d\n] is "any character '.' would match, except digits": newlines
    # stay excluded exactly as with '.', digits are excluded on purpose.
    return re.sub(r'([^\d\n])\1{2,}', r'\1\1', word)

def preprocessing(
    df,
    text_col="text",
    lowercase=True,
    expand_contractions=True,
    remove_urls=True,
    emoticon_normalization=True,
    remove_mentions=True,
    remove_punctuation=True,
    preserve_ellipsis=True,
    remove_numbers=False,
    remove_non_ascii=True,
    reduce_elongation=True,
    remove_stopwords=True,
    stemming=False,
    lemmatization=False,
    spelling_correction=False,
):
    """Clean the text column of ``df`` with a configurable pipeline.

    Each value of ``df[text_col]`` is cast to ``str`` and passed through the
    enabled steps, in this fixed order: lower-casing, contraction expansion,
    URL removal, emoticon normalisation, @mention removal, punctuation
    removal, digit removal, non-ASCII removal, elongation reduction,
    stopword removal, spelling correction, stemming, lemmatization.

    Args:
        df: DataFrame holding the text. It is modified in place.
        text_col: Name of the column to clean.
        lowercase: Lower-case the text.
        expand_contractions: Expand contractions via
            ``expand_contractions_fun``.
        remove_urls: Drop ``http://``/``https://`` links and ``www.`` hosts.
        emoticon_normalization: Replace emoticons with the tokens from
            ``emoticon_dict`` (matched case-insensitively).
        remove_mentions: Drop ``@name`` mentions.
        remove_punctuation: Delete every character in ``string.punctuation``.
        preserve_ellipsis: When punctuation is removed, keep ``...`` groups.
        remove_numbers: Delete digit runs.
        remove_non_ascii: Drop characters that cannot be encoded as ASCII.
        reduce_elongation: Apply ``reduce_elongation_fun`` to each token.
        remove_stopwords: Drop NLTK English stopwords (case-sensitive match).
        stemming: Apply the Porter stemmer to each token.
        lemmatization: Apply the WordNet lemmatizer to each token.
        spelling_correction: Run TextBlob's ``correct()`` on the text.

    Returns:
        The same ``df`` object, with ``df[text_col]`` replaced by the
        cleaned strings.
    """
    stop_words = set(stopwords.words("english")) if remove_stopwords else set()
    stemmer = PorterStemmer() if stemming else None
    lemmatizer = WordNetLemmatizer() if lemmatization else None

    # Row-invariant helpers, built once instead of once per row.
    punctuation_table = str.maketrans("", "", string.punctuation)
    keep_ellipsis = remove_punctuation and preserve_ellipsis
    emoticon_rules = (
        [
            (re.compile(pattern, flags=re.IGNORECASE), token)
            for pattern, token in emoticon_dict.items()
        ]
        if emoticon_normalization
        else []
    )

    def strip_punctuation(text):
        """Delete punctuation, keeping "..." groups when requested."""
        if not keep_ellipsis:
            return text.translate(punctuation_table)
        # Clean the pieces *between* ellipses and re-join them with "...".
        # A textual placeholder such as "<ELLIPSIS>" cannot be used here:
        # "<" and ">" are themselves punctuation, so the placeholder would be
        # mangled to the word "ELLIPSIS" and never restored.
        return "...".join(
            piece.translate(punctuation_table) for piece in text.split("...")
        )

    def shorten(word):
        """Reduce elongation without shrinking a preserved "..." to ".."."""
        if not keep_ellipsis:
            return reduce_elongation_fun(word)
        return "...".join(
            reduce_elongation_fun(piece) for piece in word.split("...")
        )

    def clean_text(text):
        if lowercase:
            text = text.lower()
        if expand_contractions:
            text = expand_contractions_fun(text)
        if remove_urls:
            # Require "://" after the scheme and a dot after "www" so that
            # ordinary words containing "www" (e.g. "awwww") are not eaten.
            text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)

        # NOTE(review): emoticon patterns are unanchored and may also match
        # inside other text (e.g. ":3" in "5:30") -- left unchanged here.
        for emoticon_re, token in emoticon_rules:
            text = emoticon_re.sub(token, text)

        if remove_mentions:  # maybe remove this
            text = re.sub(r"@\w+", "", text)

        if remove_punctuation:
            text = strip_punctuation(text)

        if remove_numbers:
            text = re.sub(r"\d+", "", text)
        if remove_non_ascii:
            text = text.encode("ascii", errors="ignore").decode()

        tokens = text.split()

        if reduce_elongation:
            tokens = [shorten(word) for word in tokens]
        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]

        if spelling_correction:
            # TextBlob works on whole strings, so join and re-split.
            tokens = str(TextBlob(" ".join(tokens)).correct()).split()

        if stemming:
            tokens = [stemmer.stem(word) for word in tokens]
        if lemmatization:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

        return " ".join(tokens)

    # astype(str) turns non-string cells (e.g. NaN) into their string form
    # before cleaning; the column is overwritten on the caller's DataFrame.
    df[text_col] = df[text_col].astype(str).apply(clean_text)
    return df


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/desjardins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/desjardins/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
