In [1]:
import numpy as np
import pandas as pd

# Show full column content instead of truncating long strings.
pd.set_option("display.max_colwidth", None)

# Optional: control how many rows/columns are shown
# pd.set_option("display.max_rows", 100)   # pandas default is 60
# pd.set_option("display.max_columns", 100)

In [2]:
from datasets import load_dataset

# Fetch the "sentiment" configuration of the TweetEval benchmark.
ds = load_dataset(path="cardiffnlp/tweet_eval", name="sentiment")

In [3]:
# Display the DatasetDict repr (split names, features and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:
from datasets import load_dataset

# Load the TweetEval sentiment splits and convert each one to a pandas DataFrame.
ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")

train_df, val_df, test_df = (
    ds[split_name].to_pandas() for split_name in ("train", "validation", "test")
)

In [5]:
# Peek at the first five training rows (5 is the default for head()).
train_df.head()

Unnamed: 0,text,label
0,"""QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin""",2
1,"""Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ""",1
2,Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.,1
3,Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays,1
4,"@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017""",2


In [None]:
# preprocessing.py

import re
from typing import List
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
import emoji

# Load the small English spaCy pipeline once at module level; the parser and
# NER components are disabled at load time. SpacyPreprocessor reads this global.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

class SpacyPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans raw tweets and lemmatizes them.

    Each input is passed through :meth:`clean` and then through the
    module-level spaCy pipeline ``nlp``. Stop words, punctuation, number-like
    and URL-like tokens are dropped, and the remaining lemmas are joined with
    single spaces so a downstream vectorizer can split them again.

    Parameters
    ----------
    lower : bool, default=True
        Lower-case the text.
    remove_urls : bool, default=True
        Strip ``http...`` / ``www....`` substrings.
    remove_mentions : bool, default=True
        Strip ``@handle`` substrings.
    """

    # Patterns are compiled once at class level rather than in __init__, so
    # __init__ only stores constructor parameters (scikit-learn estimator
    # convention). They remain reachable as self.url_re / self.mention_re.
    url_re = re.compile(r"http\S+|www\.\S+")
    mention_re = re.compile(r"@\w+")
    _hashtag_re = re.compile(r"#")
    # Letters/underscore only: the previous "(.)" form also collapsed digit
    # runs and silently turned numbers such as "1000" into "100".
    _elongation_re = re.compile(r"([^\W\d])\1{2,}")
    _special_re = re.compile(r"[^a-zA-Z0-9_\s]")
    _whitespace_re = re.compile(r"\s+")

    def __init__(self, lower=True, remove_urls=True, remove_mentions=True):
        self.lower = lower
        self.remove_urls = remove_urls
        self.remove_mentions = remove_mentions

    def clean(self, text: str) -> str:
        """Return ``text`` normalized by the regex/emoji clean-up steps.

        Steps, in order: optional URL removal, optional mention removal,
        optional lower-casing, ``#`` removal (the hashtag word is kept),
        emoji-to-name conversion, collapsing of characters repeated three or
        more times down to two, removal of everything except ASCII letters,
        digits, underscores and whitespace, and whitespace normalization.
        """
        if self.remove_urls:
            text = self.url_re.sub("", text)
        if self.remove_mentions:
            text = self.mention_re.sub("", text)
        if self.lower:
            text = text.lower()

        # Remove the hashtag marker but keep the word itself.
        text = self._hashtag_re.sub("", text)

        # Convert emojis to their textual names, space-delimited.
        text = emoji.demojize(text, delimiters=(" ", " "))

        # Normalize elongated words (soooo -> soo): keep at most two repeats.
        text = self._elongation_re.sub(r"\1\1", text)

        # Drop remaining special characters.
        text = self._special_re.sub("", text)

        # Collapse runs of whitespace and trim both ends.
        return self._whitespace_re.sub(" ", text).strip()

    @staticmethod
    def _doc_tokens(doc) -> List[str]:
        """Return lemmas of ``doc`` minus stop/punctuation/number/URL tokens."""
        return [
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_punct or token.like_num or token.like_url)
        ]

    def spacy_tokenize(self, text: str) -> List[str]:
        """Lemmatize a single, already cleaned string with the ``nlp`` pipeline."""
        return self._doc_tokens(nlp(text))

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Clean and lemmatize every item of ``X``.

        Parameters
        ----------
        X : iterable
            Raw texts; each item is converted with ``str()`` before cleaning.

        Returns
        -------
        list of str
            One space-joined lemma string per input item, in input order.
        """
        cleaned = [self.clean(str(item)) for item in X]
        # nlp.pipe processes the texts in batches, which is much faster than
        # calling nlp() once per document and yields the same Docs in order.
        return [" ".join(self._doc_tokens(doc)) for doc in nlp.pipe(cleaned)]


In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Run the small English pipeline on one sample sentence and print which tokens
# each spaCy lexical flag selects.
nlp = spacy.load("en_core_web_sm")
text = "This is an example sentence, with numbers 123 and a URL https://example.com!"
doc = nlp(text)

token_filters = (
    ("All tokens:", lambda token: True),
    ("Stopwords:", lambda token: token.is_stop),
    ("Punctuation:", lambda token: token.is_punct),
    ("Numbers:", lambda token: token.like_num),
    ("URLs:", lambda token: token.like_url),
)
for label, keep in token_filters:
    print(label, [token.text for token in doc if keep(token)])

All tokens: ['This', 'is', 'an', 'example', 'sentence', ',', 'with', 'numbers', '123', 'and', 'a', 'URL', 'https://example.com', '!']
Stopwords: ['This', 'is', 'an', 'with', 'and', 'a']
Punctuation: [',', '!']
Numbers: ['123']
URLs: ['https://example.com']


In [2]:
import emoji

text = "I love this movie 😍🔥 but the ending was 😢"

# Convert emojis to text description
text_with_desc = emoji.demojize(text)
print(text_with_desc)
# Output: "I love this movie :smiling_face_with_heart-eyes::fire: but the ending was :crying_face:"
# (adjacent emojis are emitted back to back, with no space inserted between them)

# Remove emojis completely
text_removed = emoji.replace_emoji(text, replace='')
print(text_removed)

I love this movie :smiling_face_with_heart-eyes::fire: but the ending was :crying_face:
I love this movie  but the ending was 


In [None]:
import joblib
from datasets import load_dataset
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score

from config import MODEL_PATH, RANDOM_SEED
from preprocessing import SpacyPreprocessor
import pandas as pd


def load_data(path):
    """Return the TweetEval sentiment train and validation splits as one DataFrame.

    The data comes from ``load_dataset("cardiffnlp/tweet_eval", "sentiment")``
    (Hugging Face ``datasets``); no CSV file is read.

    Parameters
    ----------
    path : str
        Currently ignored. Kept so that existing callers which pass a path
        keep working.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by validation rows, with a fresh ``0..n-1`` index.
    """
    ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")

    frames = [ds[split_name].to_pandas() for split_name in ("train", "validation")]

    # ignore_index=True: each split carries its own 0-based index, so a plain
    # concat would leave duplicate index labels in the combined frame.
    return pd.concat(frames, axis=0, ignore_index=True)



def train(path_to_csv: str):
    """Grid-search three classifiers on TF-IDF features and save the best pipeline.

    The data returned by ``load_data`` is split 80/20 (stratified on the
    label). Logistic regression, random forest and XGBoost candidates are
    compared with 3-fold cross-validation on macro F1. The winner is evaluated
    on the held-out 20% and written to ``MODEL_PATH`` as a pipeline with steps
    ``pre`` -> ``tfidf`` -> ``clf``, so it accepts raw text at prediction time.

    Parameters
    ----------
    path_to_csv : str
        Forwarded unchanged to ``load_data``.
    """
    dataset = load_data(path_to_csv)

    X = dataset["text"]
    y = dataset["label"]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
    )

    # The spaCy preprocessing is by far the slowest step and none of its
    # parameters are part of the search. Run it once here instead of inside
    # the searched pipeline, where it would be repeated for every candidate
    # and every fold. NOTE(review): this assumes SpacyPreprocessor.fit learns
    # nothing from the data (true for the class shown in this notebook) --
    # confirm for the imported one.
    preprocessor = SpacyPreprocessor()
    X_train_clean = preprocessor.fit_transform(X_train, y_train)

    # Candidate classifiers.
    models = {
        "logreg": LogisticRegression(max_iter=1000, random_state=RANDOM_SEED),
        "rf": RandomForestClassifier(random_state=RANDOM_SEED),
        "xgb": XGBClassifier(
            eval_metric="mlogloss",
            use_label_encoder=False,
            random_state=RANDOM_SEED
        ),
    }

    # TF-IDF stays inside the searched pipeline so its vocabulary and IDF
    # weights are refit on each training fold only.
    search_pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=30000)),
        ("clf", LogisticRegression())  # placeholder, replaced by the grid
    ])

    param_grid = [
        {   # Logistic Regression
            "clf": [models["logreg"]],
            "clf__C": [0.1, 1.0, 5.0],
        },
        {   # Random Forest
            "clf": [models["rf"]],
            "clf__n_estimators": [100, 300],
            "clf__max_depth": [None, 20],
        },
        {   # XGBoost
            "clf": [models["xgb"]],
            "clf__n_estimators": [200, 500],
            "clf__learning_rate": [0.1, 0.3],
            "clf__max_depth": [6, 10],
        },
    ]

    gs = GridSearchCV(
        search_pipeline,
        param_grid,
        cv=3,
        scoring="f1_macro",
        n_jobs=-1,
        verbose=2
    )
    gs.fit(X_train_clean, y_train)

    # Re-attach the preprocessor in front of the fitted steps so the saved
    # model has the same pre -> tfidf -> clf layout and takes raw text.
    best_model = Pipeline([("pre", preprocessor)] + list(gs.best_estimator_.steps))

    print("Best params:", gs.best_params_)
    print("Best model:", best_model)

    preds = best_model.predict(X_val)
    print(classification_report(y_val, preds))
    print("Macro F1:", f1_score(y_val, preds, average="macro"))

    # Save best estimator
    joblib.dump(best_model, MODEL_PATH)
    print("Saved model to", MODEL_PATH)


if __name__ == "__main__":
    import sys

    # "text" is a placeholder argument; reading the path from sys.argv[1]
    # (e.g. `python train.py data/train.csv`) is currently disabled.
    train(path_to_csv="text")


Fitting 3 folds for each of 8 candidates, totalling 24 fits


KeyboardInterrupt: 

In [5]:
# NOTE(review): `test_X` is not assigned in any cell visible in this notebook,
# so this cell presumably relies on leftover kernel state and would fail after
# Restart & Run All -- confirm where it is defined.
test_X

0       "QT @user In the original draft of the 7th boo...
1       "Ben Smith / Smith (concussion) remains out of...
2       Sorry bout the stream last night I crashed out...
3       Chase Headley's RBI double in the 8th inning o...
4       @user Alciato: Bee will invest 150 million in ...
                              ...                        
1995    "LONDON (AP) "" Prince George celebrates his s...
1996    Harper's Worst Offense against Refugees may be...
1997    Hold on... Sam Smith may do the theme to Spect...
1998    Gonna watch Final Destination 5 tonight. I alw...
1999    "Interview with Devon Alexander \""""Speed Kil...
Name: text, Length: 47615, dtype: object

In [10]:
def load_data(path):
    """Return the TweetEval sentiment train and validation splits as one DataFrame.

    The data comes from ``load_dataset("cardiffnlp/tweet_eval", "sentiment")``
    (Hugging Face ``datasets``); no CSV file is read.

    Parameters
    ----------
    path : str
        Currently ignored. Kept so that existing callers which pass a path
        keep working.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by validation rows, with a fresh ``0..n-1`` index.
    """
    ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")

    frames = [ds[split_name].to_pandas() for split_name in ("train", "validation")]

    # ignore_index=True: each split carries its own 0-based index, so a plain
    # concat would leave duplicate index labels in the combined frame.
    return pd.concat(frames, axis=0, ignore_index=True)

# load_data does not use its argument; "test" is only a placeholder value.
df = load_data("test")

In [14]:
# Text of the first row whose label equals 0.
df.loc[df["label"] == 0, "text"].iloc[0]

'So disappointed in wwe summerslam! I want to see john cena wins his 16th title'