In [55]:
import pandas as pd
import re
import emoji
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

In [2]:
normal_data = pd.read_csv(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\data\normal_data.csv")
tfidf_data = pd.read_csv(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\data\tfidf_data.csv")

In [14]:
def count_hashtags(text):
    if isinstance(text, str):
        return len(re.findall(r'#\w+', text))
    return 0

def count_emojis(text):
    if isinstance(text, str):
        return len([char for char in text if char in emoji.EMOJI_DATA])
    return 0


# Apply the functions to your text column
normal_data['hashtag_count'] = normal_data['text'].apply(count_hashtags) #replace text_column
normal_data['emoji_count'] = normal_data['text'].apply(count_emojis) #replace text_column

# Display the DataFrame with the new columns
print(normal_data[['text', 'hashtag_count', 'emoji_count']].head())

                                                text  hashtag_count  \
0                    #rt  rt    axtg new intern ceo               1   
1                rt  #rt  rt    axtg new intern ceo               1   
2             axtg big otc  gainerrocketrocketrocket              0   
3  todays top penny stock gainers\r\n\r\nlttgf ax...              1   
4  rt  todays top penny stock gainers\r\n\r\nlttg...              3   

   emoji_count  
0            0  
1            0  
2            0  
3            0  
4            0  


In [33]:
data = normal_data[["text", "Known_Pumper"]]
print(data.head())

                                                text  Known_Pumper
0                    #rt  rt    axtg new intern ceo              0
1                rt  #rt  rt    axtg new intern ceo              0
2             axtg big otc  gainerrocketrocketrocket             0
3  todays top penny stock gainers\r\n\r\nlttgf ax...             0
4  rt  todays top penny stock gainers\r\n\r\nlttg...             0


In [60]:
def count_hashtags(text):
    if isinstance(text, str):
        return len(re.findall(r'#\w+', text))
    return 0

def count_emojis(text):
    if isinstance(text, str):
        return len([char for char in text if char in emoji.EMOJI_DATA])
    return 0

def preprocess_text(text):
    if isinstance(text, str):
        text = emoji.demojize(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'[^\w\s#]', '', text)
        text = re.sub(r'[^a-zA-Z\s#]', '', text)
        text = text.lower()
        return text
    else:
        return ""

def align_tfidf_columns(train_tfidf_df, test_tfidf_df):
    """Aligns TF-IDF columns between training and test DataFrames."""
    train_cols = set(train_tfidf_df.columns)
    test_cols = set(test_tfidf_df.columns)

    common_cols = list(train_cols & test_cols)
    missing_train_cols = list(test_cols - train_cols)
    missing_test_cols = list(train_cols - test_cols)

    for col in missing_train_cols:
        train_tfidf_df[col] = 0

    for col in missing_test_cols:
        test_tfidf_df[col] = 0

    return train_tfidf_df[common_cols + missing_test_cols], test_tfidf_df[common_cols + missing_train_cols]

def create_ml_pipeline_with_tfidf_oversampling(df, classifier, text_column='text', target_column='Known_Pumper', numerical_columns=["emoji_count", "hashtag_count"], oversampling=True, max_tfidf_features=100):
    """
    Creates a machine learning pipeline with TF-IDF, emoji/hashtag counts, and optional SMOTE oversampling.
    Returns metrics for both train and test sets.
    """

    # Feature Engineering (Emoji and Hashtag Counts)
    df['hashtag_count'] = df[text_column].apply(count_hashtags)
    df['emoji_count'] = df[text_column].apply(count_emojis)

    # Text Preprocessing for TF-IDF
    df[text_column] = df[text_column].apply(preprocess_text)

    # Separate features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=max_tfidf_features, stop_words='english')
    train_tfidf_matrix = vectorizer.fit_transform(X_train[text_column])
    test_tfidf_matrix = vectorizer.transform(X_test[text_column])

    train_tfidf_df = pd.DataFrame(train_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    test_tfidf_df = pd.DataFrame(test_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    # Align TF-IDF Columns
    train_tfidf_df, test_tfidf_df = align_tfidf_columns(train_tfidf_df, test_tfidf_df)

    # Feature Concatenation
    X_train = X_train.reset_index(drop=True)
    train_tfidf_df = train_tfidf_df.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    test_tfidf_df = test_tfidf_df.reset_index(drop=True)
    
    X_train = pd.concat([X_train.drop(text_column, axis=1), train_tfidf_df], axis=1)
    X_test = pd.concat([X_test.drop(text_column, axis=1), test_tfidf_df], axis=1)

    # Oversampling using SMOTE (if enabled)
    if oversampling:
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    # Identify TF-IDF columns
    tfidf_columns = [col for col in X_train.columns if col not in numerical_columns and col != 'hashtag_count' and col != 'emoji_count']

    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', MinMaxScaler(), numerical_columns + ['hashtag_count', 'emoji_count']),
            ('tfidf', 'passthrough', tfidf_columns)
        ],
        remainder='drop'
    )

    # Full pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    # Train the pipeline
    pipeline.fit(X_train_resampled, y_train_resampled)

    # Make predictions
    y_train_pred = pipeline.predict(X_train_resampled)
    y_test_pred = pipeline.predict(X_test)

    # Calculate metrics
    train_metrics = {
        'accuracy': accuracy_score(y_train_resampled, y_train_pred),
        'precision': precision_score(y_train_resampled, y_train_pred),
        'recall': recall_score(y_train_resampled, y_train_pred),
        'f1': f1_score(y_train_resampled, y_train_pred),
        'roc_auc': roc_auc_score(y_train_resampled, pipeline.predict_proba(X_train_resampled)[:, 1]),
        'confusion_matrix': confusion_matrix(y_train_resampled, y_train_pred)
    }

    test_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'precision': precision_score(y_test, y_test_pred),
        'recall': recall_score(y_test, y_test_pred),
        'f1': f1_score(y_test, y_test_pred),
        'roc_auc': roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1]),
        'confusion_matrix': confusion_matrix(y_test, y_test_pred)
    }
    
    return pipeline, vectorizer, train_metrics, test_metrics


In [64]:
from sklearn.linear_model import LogisticRegression
pipeline, vectorizer, train_metrics, test_metrics = create_ml_pipeline_with_tfidf_oversampling(data, classifier = LogisticRegression())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtag_count'] = df[text_column].apply(count_hashtags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['emoji_count'] = df[text_column].apply(count_emojis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df[text_column].apply(preprocess_text)


In [68]:
from sklearn.ensemble import GradientBoostingClassifier
pipeline, vectorizer, train_metrics, test_metrics = create_ml_pipeline_with_tfidf_oversampling(data, classifier = GradientBoostingClassifier())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtag_count'] = df[text_column].apply(count_hashtags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['emoji_count'] = df[text_column].apply(count_emojis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df[text_column].apply(preprocess_text)


In [69]:
train_metrics

{'accuracy': 0.8439871382636656,
 'precision': 0.8258009501766354,
 'recall': 0.8718971061093248,
 'f1': 0.8482232232232232,
 'roc_auc': np.float64(0.9249220748338003),
 'confusion_matrix': array([[6345, 1430],
        [ 996, 6779]])}

In [70]:
test_metrics

{'accuracy': 0.7966269841269841,
 'precision': 0.11316397228637413,
 'recall': 0.6533333333333333,
 'f1': 0.19291338582677164,
 'roc_auc': np.float64(0.8173278378842521),
 'confusion_matrix': array([[1557,  384],
        [  26,   49]])}

In [67]:
pipeline

In [72]:
data["Known_Pumper"].sum()

np.int64(360)