<div style="
    background-color: white;
    padding: 40px 40px;
    border-radius: 12px;
    box-shadow: 0 4px 12px rgba(255, 0, 0, 0.1);
    text-align: center;
    font-family: Arial, sans-serif;
    margin-top: 30px;
">
    <h1 style="
        color: #000000;
        font-size: 45px;
        font-weight: bold;
        margin: 0;
    ">
        Customer Input Data Cleaning Pipeline
    </h1>
</div>

## Import Libraries

In [23]:
import streamlit as st
import pandas as pd
import re
import emoji
import html
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Fetch the NLTK resources the cleaning functions rely on.
# `word_tokenize` needs the Punkt tokenizer models: older NLTK releases load
# them from 'punkt', newer releases look up 'punkt_tab' instead, so both are
# requested to keep the notebook runnable across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stopwords as a set so the per-token membership test is O(1).
STOP_WORDS = set(stopwords.words('english'))

import warnings
# NOTE: this silences *every* warning for the rest of the session, including
# deprecation notices from pandas/NLTK; narrow the filter when debugging.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Dataset

In [15]:
# Load the raw tweet export into a DataFrame for a first look at its columns.
raw_csv_path = r"D:\crypto tweets dataset\crypto-query-tweets.csv"
data = pd.read_csv(raw_csv_path)

In [22]:
# Missing-value count per column (the cell's last expression is displayed).
data.isna().sum()

date_time                 0
username                  0
user_location          4858
user_description        992
verified                  0
followers_count           0
following_count           0
tweet_like_count          0
tweet_retweet_count       0
tweet_reply_count         0
tweet_quote_count         0
text                      0
dtype: int64

## Cleaning Functions

In [16]:
def remove_urls(text):
    """Strip web links from ``text``.

    A link is either a run that starts with ``http://`` / ``https://`` or a
    token that starts with ``www.``; in both cases it extends up to the next
    whitespace character. Requiring the ``://`` separator and a word boundary
    before ``www.`` keeps ordinary words that merely contain those letters
    (e.g. ``"awwwwesome"``, ``"httpserver"``) intact. Matching ignores case,
    so ``HTTPS://...`` is removed as well.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every link deleted. Whitespace around a removed link is
        left untouched.
    """
    return re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

def remove_mentions(text):
    """Delete every ``@`` followed by one or more word characters.

    Only the ``@name`` token itself is removed; surrounding whitespace and
    punctuation stay where they were.
    """
    mention_pattern = r'@\w+'
    without_mentions = re.sub(mention_pattern, '', text)
    return without_mentions

def remove_hashtags(text):
    """Drop the ``#`` marker from ``text`` while keeping the tag word itself."""
    hash_sign = re.compile(r'#')
    return hash_sign.sub('', text)

def remove_html_entities(text):
    """Decode HTML character references in ``text``.

    Despite the name, entities are not deleted: each one is converted to the
    character it stands for (``&amp;`` becomes ``&``, ``&lt;`` becomes ``<``).
    """
    decoded = html.unescape(text)
    return decoded

def remove_emojis(text):
    """Return ``text`` with every emoji replaced by the empty string.

    Delegates to ``emoji.replace_emoji``.
    """
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

# Contraction -> expansion table. Keys are written with the plain ASCII
# apostrophe; the compiled pattern below also accepts the typographic one.
_CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "didn't": "did not",
    "it's": "it is",
    "i'm": "i am",
    "they're": "they are",
    "isn't": "is not",
    "aren't": "are not"
}

# Either apostrophe form: ASCII ' or the right single quotation mark U+2019.
_APOSTROPHES = "['\u2019]"

# One alternation with a capture group per contraction, so the number of the
# group that matched identifies which expansion to insert. Building a single
# pattern once replaces the nine separate regex passes per call.
_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(
        "(" + _APOSTROPHES.join(re.escape(part) for part in short.split("'")) + ")"
        for short in _CONTRACTIONS
    )
    + r")\b",
    flags=re.IGNORECASE,
)
_EXPANSIONS = tuple(_CONTRACTIONS.values())


def expand_contractions(text):
    """Expand a fixed set of common English contractions.

    Matching is case-insensitive and restricted to whole words. The
    replacement is always the lower-case expansion from the table, so
    ``"It's"`` becomes ``"it is"``. Both the ASCII apostrophe and the
    typographic apostrophe (U+2019) are recognised, so ``"can’t"`` is
    expanded just like ``"can't"``.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every listed contraction replaced by its expansion;
        anything not in the table is left unchanged.
    """
    # lastindex is the 1-based number of the capture group that matched.
    return _CONTRACTION_RE.sub(lambda m: _EXPANSIONS[m.lastindex - 1], text)

def normalize_repeated_chars(text):
    """Shorten any run of three or more identical characters to two.

    Applies to every character except a newline, so digits and punctuation
    are shortened as well (``"!!!"`` becomes ``"!!"``).
    """
    run_of_three_or_more = r'(.)\1{2,}'
    keep_two = r'\1\1'
    return re.sub(run_of_three_or_more, keep_two, text)

def remove_non_alpha(text):
    """Keep only ASCII letters and whitespace.

    Every other character (digits, punctuation, accented letters, symbols)
    is deleted rather than replaced by a space, so words separated only by
    punctuation end up joined (``"BTC/USD"`` becomes ``"BTCUSD"``).
    """
    return re.sub(pattern=r'[^a-zA-Z\s]', repl='', string=text)

def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` with NLTK and drop tokens found in ``STOP_WORDS``.

    Tokens are compared as-is, without case folding, so callers should
    lowercase the text first (``clean_tweet`` does). The surviving tokens
    are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in STOP_WORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def clean_tweet(text):
    """Run one tweet through the full cleaning chain.

    Steps, in order: remove URLs, mentions and ``#`` markers; decode HTML
    entities; remove emojis; expand contractions; shorten repeated
    characters; drop non-letters; lowercase; tokenize and remove stopwords.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing value) is reported as a
            failed row instead of being processed.

    Returns:
        A tuple ``(cleaned_text, words_before, words_after, success)``.
        Word counts are whitespace-separated token counts of the raw and the
        cleaned text. On failure the tuple is ``("", 0, 0, False)``.

    Raises:
        LookupError: If a required NLTK resource is not installed. That is an
            environment problem affecting every row, so it is surfaced
            instead of being recorded as a per-row failure.
    """
    # Explicit guard instead of relying on ``.split()`` raising for NaN/None.
    if not isinstance(text, str):
        return "", 0, 0, False

    word_count_before = len(text.split())

    steps = (
        remove_urls,
        remove_mentions,
        remove_hashtags,
        remove_html_entities,
        remove_emojis,
        expand_contractions,
        normalize_repeated_chars,
        remove_non_alpha,
        str.lower,
        tokenize_and_remove_stopwords,
    )

    try:
        cleaned = text
        for step in steps:
            cleaned = step(cleaned)
    except LookupError as exc:
        # NLTK signals a missing resource with a plain LookupError. Swallowing
        # it would silently blank out every tweet, so let it propagate.
        # KeyError/IndexError subclasses are still treated as row failures.
        if type(exc) is LookupError:
            raise
        return "", 0, 0, False
    except Exception:
        # Best effort per row: a single bad tweet must not abort the batch.
        return "", 0, 0, False

    word_count_after = len(cleaned.split())
    return cleaned, word_count_before, word_count_after, True

## Main Pipeline

In [19]:

def process_crypto_tweets(input_path, output_path):
    """Clean the ``text`` column of a tweet CSV and write the result.

    Reads ``input_path``, applies ``clean_tweet`` to every value of the
    ``text`` column, and writes the original columns plus four new ones to
    ``output_path`` (without the index):

    * ``cleaned_text``     - the cleaned tweet
    * ``words_before``     - word count of the raw tweet
    * ``words_after``      - word count of the cleaned tweet
    * ``cleaning_success`` - ``False`` for rows that could not be cleaned

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write.

    Raises:
        ValueError: If the input has no ``text`` column.
    """
    data = pd.read_csv(input_path)

    if 'text' not in data.columns:
        raise ValueError("Dataset must contain 'text' column")

    result_columns = ['cleaned_text', 'words_before', 'words_after', 'cleaning_success']

    # clean_tweet returns a 4-tuple per row; unpack all rows in one pass
    # rather than re-walking the Series once per output column.
    results = data['text'].apply(clean_tweet)
    unpacked = pd.DataFrame(results.tolist(), index=data.index, columns=result_columns)
    for column in result_columns:
        data[column] = unpacked[column]

    data.to_csv(output_path, index=False)

    # Only claim success when every row was actually cleaned.
    failed_rows = len(data) - int(data['cleaning_success'].sum())
    if failed_rows:
        print(
            f"Crypto tweets processed: {failed_rows} of {len(data)} rows could not "
            "be cleaned (see 'cleaning_success')."
        )
    else:
        print("Crypto tweets cleaned successfully!")


In [31]:
if __name__ == "__main__":
    # Raw export to read, and where the cleaned copy is written.
    INPUT_FILE = r"D:\crypto tweets dataset\crypto-query-tweets.csv"
    OUTPUT_FILE = r"D:\crypto tweets dataset\cleaned_crypto_tweets.csv"

    process_crypto_tweets(input_path=INPUT_FILE, output_path=OUTPUT_FILE)

Crypto tweets cleaned successfully!
