In [None]:
import re
from emoji import demojize
from nltk.tokenize import TweetTokenizer

class TweetNormalizer:
    """Normalize raw tweet text into one space-separated token string.

    The tweet is tokenized with NLTK's ``TweetTokenizer``. Each token is then
    mapped on its own (mentions, URLs, numbers, emoji, typographic
    punctuation, optional lowercasing), and the re-joined text is
    post-processed so contractions and "a.m."/"p.m." get a consistent shape.

    Args:
        lowercase: Lowercase ordinary tokens. Placeholders (``@USER``,
            ``HTTPURL``, ``NUMBER``) and mentions that are kept verbatim are
            never lowercased.
        remove_numbers: Replace all-digit tokens with ``NUMBER``.
        max_length: Number of leading characters of the raw tweet to keep;
            anything beyond is dropped before tokenization.
        preserve_case_entities: When true, ``@mentions`` are kept exactly as
            written; when false they are replaced with ``@USER``.
    """

    # Token prefixes treated as URLs (compared case-insensitively).
    _URL_PREFIXES = ("http", "www")

    # Typographic punctuation mapped to its ASCII equivalent:
    # U+2019 (right single quote / curly apostrophe) and U+2026 (ellipsis).
    _PUNCTUATION = {"\u2019": "'", "\u2026": "..."}

    # Contraction rewrites, applied in order to the space-joined token string.
    # Matching is case-insensitive and the replacement reuses the captured
    # text, so the tweet's casing survives when lowercasing is disabled.
    _CONTRACTION_RULES = (
        # "cannot" -> "can not" (whole word only).
        (re.compile(r"\b(can)(not)\b", re.IGNORECASE), r"\1 \2"),
        # "don't" / "do n't" / "don 't" -> "do n't".
        (re.compile(r"\s*(n)\s?('t)\b", re.IGNORECASE), r" \1\2"),
        # The rule above also splits "can't"/"ain't"; glue those back.
        (re.compile(r"\b(ca|ai) (n't)\b", re.IGNORECASE), r"\1\2"),
        # "it's" -> "it 's", "we'll" -> "we 'll", ... The trailing \b keeps
        # names such as "o'donnell" or "o'reilly" in one piece.
        (re.compile(r"(?<=\w)('(?:m|re|s|ll|d|ve))\b", re.IGNORECASE), r" \1"),
    )

    # Tokenizer output for "p.m."/"a.m." -> compact spelling.
    _TIME_FORMATS = (
        (" p . m .", " p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )

    def __init__(self, lowercase=True, remove_numbers=False, max_length=280, preserve_case_entities=True):
        # Case is always preserved while tokenizing; lowercasing is decided
        # per token in normalizeToken so that kept mentions retain their case.
        self.tokenizer = TweetTokenizer(preserve_case=True)
        self.lowercase = lowercase
        self.remove_numbers = remove_numbers
        self.max_length = max_length
        self.preserve_case_entities = preserve_case_entities

    def normalizeToken(self, token):
        """Return the normalized form of a single token.

        Rules are tried in this order: mention, URL, typographic punctuation,
        number, single-character emoji, then optional lowercasing.
        """
        # A lone "@" (as in "meet @ 5") is not a mention.
        if len(token) > 1 and token.startswith("@"):
            return token if self.preserve_case_entities else "@USER"
        if token.lower().startswith(self._URL_PREFIXES):
            return "HTTPURL"
        if token in self._PUNCTUATION:
            return self._PUNCTUATION[token]
        # Checked before the single-character branch so that one-digit
        # numbers are replaced as well.
        if self.remove_numbers and token.isdigit():
            return "NUMBER"
        # Only a non-ASCII single character can be an emoji; ASCII characters
        # skip the lookup and fall through to lowercasing.
        if len(token) == 1 and ord(token) > 127:
            emoji_name = demojize(token)
            if emoji_name != token:
                return emoji_name
        return token.lower() if self.lowercase else token

    def normalizeTweet(self, tweet):
        """Return *tweet* as a normalized, single-space-separated string.

        The raw text is first cut to ``max_length`` characters, typographic
        apostrophes/ellipses are converted to ASCII, the text is tokenized and
        every token is passed through ``normalizeToken``. Contractions and
        time abbreviations are then rewritten on the joined string.
        """
        text = tweet[:self.max_length]
        # Convert before tokenizing so "can’t" is seen as the ordinary
        # contraction "can't".
        for fancy, plain in self._PUNCTUATION.items():
            text = text.replace(fancy, plain)

        tokens = self.tokenizer.tokenize(text)
        normTweet = " ".join(self.normalizeToken(token) for token in tokens)

        for pattern, replacement in self._CONTRACTION_RULES:
            normTweet = pattern.sub(replacement, normTweet)

        for spaced, compact in self._TIME_FORMATS:
            normTweet = normTweet.replace(spaced, compact)

        # Collapse any runs of whitespace introduced by the rewrites.
        return " ".join(normTweet.split())

if __name__ == "__main__":
    # Demo: run one sample tweet through three differently configured
    # normalizers and print each result under a numbered heading.
    sample_tweet = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @PostAndCourier"

    # One keyword-argument set per normalizer, in display order.
    demo_configs = (
        {"lowercase": True, "remove_numbers": False, "max_length": 280, "preserve_case_entities": True},
        {"lowercase": False, "remove_numbers": True, "max_length": 40, "preserve_case_entities": False},
        {"lowercase": True, "remove_numbers": True, "max_length": 200, "preserve_case_entities": True},
    )
    pipelines = [TweetNormalizer(**config) for config in demo_configs]

    position = 0
    for pipeline in pipelines:
        position += 1
        print(f"\nNormalizer {position} output:")
        print(pipeline.normalizeTweet(sample_tweet))