In [1]:
import pandas as pd
import re
import emoji
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

In [2]:
normal_data = pd.read_csv(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\data\normal_data.csv")
tfidf_data = pd.read_csv(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\data\tfidf_data.csv")

In [3]:
def count_hashtags(text):
    if isinstance(text, str):
        return len(re.findall(r'#\w+', text))
    return 0

def count_emojis(text):
    if isinstance(text, str):
        return len([char for char in text if char in emoji.EMOJI_DATA])
    return 0


# Apply the functions to your text column
normal_data['hashtag_count'] = normal_data['text'].apply(count_hashtags) #replace text_column
normal_data['emoji_count'] = normal_data['text'].apply(count_emojis) #replace text_column

# Display the DataFrame with the new columns
print(normal_data[['text', 'hashtag_count', 'emoji_count']].head())

                                                text  hashtag_count  \
0                    #rt  rt    axtg new intern ceo               1   
1                rt  #rt  rt    axtg new intern ceo               1   
2             axtg big otc  gainerrocketrocketrocket              0   
3  todays top penny stock gainers\r\n\r\nlttgf ax...              1   
4  rt  todays top penny stock gainers\r\n\r\nlttg...              3   

   emoji_count  
0            0  
1            0  
2            0  
3            0  
4            0  


In [4]:
data = normal_data[["text", "Known_Pumper"]]
print(data.head())

                                                text  Known_Pumper
0                    #rt  rt    axtg new intern ceo              0
1                rt  #rt  rt    axtg new intern ceo              0
2             axtg big otc  gainerrocketrocketrocket             0
3  todays top penny stock gainers\r\n\r\nlttgf ax...             0
4  rt  todays top penny stock gainers\r\n\r\nlttg...             0


In [18]:
import pandas as pd
import re
import emoji
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, chi2  # Add feature selection

def count_hashtags(text):
    if isinstance(text, str):
        return len(re.findall(r'#\w+', text))
    return 0

def count_emojis(text):
    if isinstance(text, str):
        return len([char for char in text if char in emoji.EMOJI_DATA])
    return 0

def preprocess_text(text):
    if isinstance(text, str):
        text = emoji.demojize(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'[^\w\s#]', '', text)
        text = re.sub(r'[^a-zA-Z\s#]', '', text)
        text = text.lower()
        return text
    else:
        return ""

def align_tfidf_columns(train_tfidf_df, test_tfidf_df):
    """Aligns TF-IDF columns between training and test DataFrames."""
    train_cols = set(train_tfidf_df.columns)
    test_cols = set(test_tfidf_df.columns)

    common_cols = list(train_cols & test_cols)
    missing_train_cols = list(test_cols - train_cols)
    missing_test_cols = list(train_cols - test_cols)

    for col in missing_train_cols:
        train_tfidf_df[col] = 0

    for col in missing_test_cols:
        test_tfidf_df[col] = 0

    return train_tfidf_df[common_cols + missing_test_cols], test_tfidf_df[common_cols + missing_train_cols]

def create_ml_pipeline_with_tfidf_oversampling(df, classifier, text_column='text', target_column='Known_Pumper', numerical_columns=["emoji_count", "hashtag_count"], oversampling=True, max_tfidf_features=1500, param_grid=None): #reduced max_tfidf_features
    """
    Creates a machine learning pipeline with TF-IDF, emoji/hashtag counts, and optional SMOTE oversampling.
    Returns metrics for both train and test sets.
    """

    df['hashtag_count'] = df[text_column].apply(count_hashtags)
    df['emoji_count'] = df[text_column].apply(count_emojis)
    df[text_column] = df[text_column].apply(preprocess_text)

    X = df.drop(target_column, axis=1)
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    vectorizer = TfidfVectorizer(max_features=max_tfidf_features, stop_words='english', ngram_range=(1, 2)) #added ngram
    train_tfidf_matrix = vectorizer.fit_transform(X_train[text_column])
    test_tfidf_matrix = vectorizer.transform(X_test[text_column])

    train_tfidf_df = pd.DataFrame(train_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    test_tfidf_df = pd.DataFrame(test_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    train_tfidf_df, test_tfidf_df = align_tfidf_columns(train_tfidf_df, test_tfidf_df)

    X_train = X_train.reset_index(drop=True)
    train_tfidf_df = train_tfidf_df.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    test_tfidf_df = test_tfidf_df.reset_index(drop=True)
    
    X_train = pd.concat([X_train.drop(text_column, axis=1), train_tfidf_df], axis=1)
    X_test = pd.concat([X_test.drop(text_column, axis=1), test_tfidf_df], axis=1)


    tfidf_columns = [col for col in X_train.columns if col not in numerical_columns and col != 'hashtag_count' and col != 'emoji_count']

    # Feature selection
    feature_selector = SelectKBest(chi2, k=min(200, len(tfidf_columns))) #added feature selection
    X_train_selected = feature_selector.fit_transform(X_train[tfidf_columns], y_train)
    X_test_selected = feature_selector.transform(X_test[tfidf_columns])

    selected_tfidf_columns = [tfidf_columns[i] for i in feature_selector.get_support(indices=True)]
    X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_tfidf_columns)
    X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_tfidf_columns)

    X_train_final = pd.concat([X_train[numerical_columns].reset_index(drop=True), X_train_selected_df.reset_index(drop=True)], axis=1)
    X_test_final = pd.concat([X_test[numerical_columns].reset_index(drop=True), X_test_selected_df.reset_index(drop=True)], axis=1)

    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', MinMaxScaler(), numerical_columns + ['hashtag_count', 'emoji_count']),
            ('tfidf', 'passthrough', selected_tfidf_columns) #use selected columns
        ],
        remainder='drop'
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    if param_grid is not None:
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose=2, n_jobs=3)
        grid_search.fit(X_train_final, y_train)
        pipeline = grid_search.best_estimator_

    else:
        pipeline.fit(X_train_final, y_train)

    y_train_pred = pipeline.predict(X_train_final)
    y_test_pred = pipeline.predict(X_test_final)

    train_metrics = {
        'accuracy': accuracy_score(y_train, y_train_pred),
        'precision': precision_score(y_train, y_train_pred),
        'recall': recall_score(y_train, y_train_pred),
        'f1': f1_score(y_train, y_train_pred),
        'roc_auc': roc_auc_score(y_train, pipeline.predict_proba(X_train_final)[:, 1]),
        'confusion_matrix': confusion_matrix(y_train, y_train_pred)
    }

    test_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'precision': precision_score(y_test, y_test_pred),
        'recall': recall_score(y_test, y_test_pred),
        'f1': f1_score(y_test, y_test_pred),
        'roc_auc': roc_auc_score(y_test, pipeline.predict_proba(X_test_final)[:, 1]),
        'confusion_matrix': confusion_matrix(y_test, y_test_pred)
    }
    
    return pipeline, vectorizer, train_metrics, test_metrics

In [19]:
data.columns

Index(['text', 'Known_Pumper', 'hashtag_count', 'emoji_count'], dtype='object')

In [20]:
from sklearn.linear_model import LogisticRegression

param_grid_logistic = {
    'classifier__penalty': ['l1', 'l2'],  # Regularization type
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'classifier__solver': ['liblinear', 'saga'],  # Solver algorithm
    'classifier__class_weight': [None, 'balanced'] #Handle Imbalance
}

pipeline, vectorizer, train_metrics, test_metrics = create_ml_pipeline_with_tfidf_oversampling(data, classifier = LogisticRegression(), param_grid = param_grid_logistic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtag_count'] = df[text_column].apply(count_hashtags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['emoji_count'] = df[text_column].apply(count_emojis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df[text_column].apply(preprocess_text)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [21]:
train_metrics

{'accuracy': 0.8665594855305466,
 'precision': 0.8740157480314961,
 'recall': 0.8565916398713826,
 'f1': 0.8652159792140305,
 'roc_auc': np.float64(0.9445279928867567),
 'confusion_matrix': array([[6815,  960],
        [1115, 6660]])}

In [22]:
test_metrics

{'accuracy': 0.8571428571428571,
 'precision': 0.1461794019933555,
 'recall': 0.5866666666666667,
 'f1': 0.23404255319148937,
 'roc_auc': np.float64(0.8131616005495448),
 'confusion_matrix': array([[1684,  257],
        [  31,   44]])}

In [24]:
import os
import pickle
pipeline_path = os.path.join(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\model_training\tfidf_training", "lr_pipeline.pkl")
vectorizer_path = os.path.join(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\model_training\tfidf_training", "lr_vectorizer.pkl")

with open(pipeline_path, 'wb') as p:
    pickle.dump(pipeline, p)

with open(vectorizer_path, 'wb') as v:
    pickle.dump(vectorizer, v)

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid_gradient_boosting = {
    'classifier__n_estimators': [100, 200],  # Reduced number of estimators
    'classifier__learning_rate': [0.05, 0.1],  # Reduced learning rate options
    'classifier__max_depth': [3, 4],  # Reduced depth options
    'classifier__subsample': [0.9, 1.0],  # Reduced subsample options
    'classifier__max_features': ['sqrt', None] # Reduced number of features to consider.
}


pipeline, vectorizer, train_metrics, test_metrics = create_ml_pipeline_with_tfidf_oversampling(data, classifier = GradientBoostingClassifier(), param_grid=param_grid_gradient_boosting)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtag_count'] = df[text_column].apply(count_hashtags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['emoji_count'] = df[text_column].apply(count_emojis)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df[text_column].apply(preprocess_text)


Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [26]:
train_metrics

{'accuracy': 0.934983922829582,
 'precision': 0.9341463414634147,
 'recall': 0.9359485530546624,
 'f1': 0.9350465788628333,
 'roc_auc': np.float64(0.9809925455692147),
 'confusion_matrix': array([[7262,  513],
        [ 498, 7277]])}

In [27]:
test_metrics

{'accuracy': 0.9047619047619048,
 'precision': 0.193717277486911,
 'recall': 0.49333333333333335,
 'f1': 0.2781954887218045,
 'roc_auc': np.float64(0.8406113687102867),
 'confusion_matrix': array([[1787,  154],
        [  38,   37]])}

In [28]:
import os
import pickle
pipeline_path = os.path.join(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\model_training\tfidf_training", "gb_pipeline.pkl")
vectorizer_path = os.path.join(r"C:\Users\limti\PycharmProjects\DSA4263_StockTweets\model_training\tfidf_training", "gb_vectorizer.pkl")

with open(pipeline_path, 'wb') as p:
    pickle.dump(pipeline, p)

with open(vectorizer_path, 'wb') as v:
    pickle.dump(vectorizer, v)

In [29]:
data["Known_Pumper"]

0        0
1        0
2        0
3        0
4        0
        ..
10071    0
10072    0
10073    0
10074    0
10075    0
Name: Known_Pumper, Length: 10076, dtype: int64

In [None]:
from sklearn.ensemble import RandomFor