## Recreating the old TF-IDF vectorizer used for model training

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import emoji

def preprocess_text(text):
    """
    Normalise a tweet so that only lowercase ASCII letters, whitespace and '#' remain.

    Steps, in order: emojis are spelled out with ``emoji.demojize`` (they are
    converted to text, not dropped), URLs and @mentions are deleted, and every
    character that is not an ASCII letter, whitespace or '#' is stripped.
    The result is lowercased.

    Returns "" for any input that is not a ``str``.
    """
    # Guard clause: non-string values are mapped to an empty document.
    if not isinstance(text, str):
        return ""

    cleaned = emoji.demojize(text)
    # URLs go first so their punctuation does not survive as stray letters.
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    # Mentions, then punctuation (hashtags kept), then anything non-alphabetic.
    for pattern in (r'@\w+', r'[^\w\s#]', r'[^a-zA-Z\s#]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

def create_tfidf_features(df, text_column='text', max_features=100):
    """
    Fit a TF-IDF vectorizer on a text column and append its features as columns.

    The input frame is left untouched: preprocessing is applied to an internal
    copy, so callers do not need to pass ``df.copy()`` to protect their data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``text_column``.
    text_column : str, default 'text'
        Name of the column holding the raw text.
    max_features : int, default 100
        Vocabulary size limit forwarded to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(features_df, vectorizer)`` where ``features_df`` is the input frame
        (index reset, ``text_column`` replaced by its preprocessed version)
        followed by one column per vocabulary term, and ``vectorizer`` is the
        fitted ``TfidfVectorizer``.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.

    Notes
    -----
    The TF-IDF columns are named after the vocabulary terms; a term equal to an
    existing column name results in duplicate column labels.
    """
    # Clean the text without writing back into the caller's frame.
    cleaned_text = df[text_column].apply(preprocess_text)

    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words='english',
        token_pattern=r'\b\w+\b|\B#\w+\b'  # Include hashtags
    )
    tfidf_matrix = vectorizer.fit_transform(cleaned_text)
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    # Work on a copy so the preprocessed text only appears in the returned frame.
    base = df.copy()
    base[text_column] = cleaned_text
    # Both sides get a fresh 0..n-1 index so concat aligns rows positionally.
    base = base.reset_index(drop=True)

    return pd.concat([base, tfidf_df.reset_index(drop=True)], axis=1), vectorizer


# Load the scored tweets that the TF-IDF vocabulary is fitted on.
df = pd.read_csv("../data/scored_tweets_total.csv")

# A copy is passed so `df` itself keeps the raw, unprocessed text.
df_with_tfidf_features,vectorizer = create_tfidf_features(df.copy())

# NOTE(review): this prints the entire frame; `.head()` would keep the output short.
print(df_with_tfidf_features)

       Stock                   Datetime user.screen_name  \
0       AXTG  2021-03-25 17:50:13+00:00          UCitnow   
1       AXTG  2021-03-25 17:50:48+00:00          UCitnow   
2       AXTG  2021-03-25 18:47:48+00:00           Rad7RR   
3       AXTG  2021-03-25 20:02:03+00:00         GetScanz   
4       AXTG  2021-03-25 20:02:48+00:00  christinebarnum   
...      ...                        ...              ...   
10071  EEENF  2021-04-09 01:00:55+00:00      TVTVentures   
10072  EEENF  2021-04-09 01:06:47+00:00    BuyLowSell420   
10073  EEENF  2021-04-09 01:06:56+00:00      superlars34   
10074  EEENF  2021-04-09 01:07:55+00:00    DaveWhitman12   
10075  EEENF  2021-04-09 01:14:21+00:00       jerocker79   

                     id_str  \
0      1375142994920271872a   
1      1375143141058080768a   
2      1375157484063584261a   
3      1375176172099747845a   
4      1375176361560604679a   
...                     ...   
10071  1380324815689539585a   
10072  1380326289735708672a   


### Saving the vectorizer

In [3]:
import joblib

# Destination file for the fitted vectorizer.
# NOTE(review): presumably ../model_training/models already exists — confirm before running.
vectorizer_path = "../model_training/models/tfidf_vectorizer.joblib"

# Persist the fitted vectorizer so the same vocabulary and IDF weights can be reloaded later.
joblib.dump(vectorizer, vectorizer_path)

['../model_training/models/tfidf_vectorizer.joblib']

## Applying it to a new dataset

In [4]:
def create_tfidf_features_from_saved(df, vectorizer, text_column='Tweet'):
    """
    Append TF-IDF features computed with an already-fitted vectorizer.

    The input frame is left untouched: preprocessing is applied to an internal
    copy, so callers do not need to pass ``df.copy()`` to protect their data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``text_column``.
    vectorizer : fitted vectorizer
        Object exposing ``transform`` and ``get_feature_names_out``; it is only
        used to transform, never refitted.
    text_column : str, default 'Tweet'
        Name of the column holding the raw text.

    Returns
    -------
    pandas.DataFrame
        The input frame (index reset, ``text_column`` replaced by its
        preprocessed version) followed by one column per vocabulary term.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.

    Notes
    -----
    The TF-IDF columns are named after the vocabulary terms; a term equal to an
    existing column name results in duplicate column labels.
    """
    # Same cleaning as at fit time, without writing back into the caller's frame.
    cleaned_text = df[text_column].apply(preprocess_text)

    tfidf_matrix = vectorizer.transform(cleaned_text)
    tfidf_array = tfidf_matrix.toarray()  # Convert sparse matrix to dense array
    tfidf_df = pd.DataFrame(tfidf_array, columns=vectorizer.get_feature_names_out())

    # Work on a copy so the preprocessed text only appears in the returned frame.
    base = df.copy()
    base[text_column] = cleaned_text
    # Both sides get a fresh 0..n-1 index so concat aligns rows positionally.
    base = base.reset_index(drop=True)

    # Concatenate the original columns with the TF-IDF features
    return pd.concat([base, tfidf_df.reset_index(drop=True)], axis=1)

In [5]:
# Reload the vectorizer persisted earlier in this notebook.
# joblib files are pickle-based: only load files from a trusted source.
loaded_vectorizer = joblib.load('../model_training/models/tfidf_vectorizer.joblib')

# Load the new dataset; its text lives in the 'Tweet' column, the default
# text_column of create_tfidf_features_from_saved.
new_df = pd.read_csv("../data/stock_tweets_test_on_real_data.csv")

# Create TF-IDF features for the new dataset using the loaded vectorizer
new_df_with_tfidf = create_tfidf_features_from_saved(new_df.copy(), loaded_vectorizer) # A copy is passed so new_df keeps the raw tweets

# new_df_with_tfidf holds the columns of new_df (with 'Tweet' preprocessed)
# followed by one TF-IDF column per vocabulary term.
print(new_df_with_tfidf.head())

                        Date  \
0  2022-09-29 23:41:16+00:00   
1  2022-09-29 23:24:43+00:00   
2  2022-09-29 23:18:08+00:00   
3  2022-09-29 22:40:07+00:00   
4  2022-09-29 22:27:05+00:00   

                                               Tweet Stock Name Company Name  \
0  mainstream media has done an amazing job at br...       TSLA  Tesla, Inc.   
1  tesla delivery estimates are at around k from ...       TSLA  Tesla, Inc.   
2   even if i include m unvested rsus as of  addi...       TSLA  Tesla, Inc.   
3     hahaha why are you still trying to stop tes...       TSLA  Tesla, Inc.   
4    stop trying to kill kids you sad deranged ol...       TSLA  Tesla, Inc.   

   #solana  aabb  ada       amp  athdoubleexclamationmark  azfl  ...  \
0      0.0   0.0  0.0  0.835086                       0.0   0.0  ...   
1      0.0   0.0  0.0  0.000000                       0.0   0.0  ...   
2      0.0   0.0  0.0  0.000000                       0.0   0.0  ...   
3      0.0   0.0  0.0  0.000000       

In [6]:
# Export the frame built above (original columns plus TF-IDF features) to CSV.

output_file_path = '../data/tfidf_data_with_new_tweets.csv'  # Destination CSV path
new_df_with_tfidf.to_csv(output_file_path, index=False)  # index=False keeps the row index out of the file

# Confirm where the file was written.
print(f"The new DataFrame with TF-IDF features has been saved to: {output_file_path}")

The new DataFrame with TF-IDF features has been saved to: ../data/tfidf_data_with_new_tweets.csv
