### Data importing

In [15]:
import networkx as nx
import pandas as pd
import numpy as np

In [19]:
# Load the tweet dataset; the cells below read its 'text' column.
data = pd.read_csv("data/scored_tweets_final_translated.csv")

### Sentiment analysis

In [20]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def ensure_vader_lexicon():
    """Ensures the VADER lexicon is available, downloading it only if missing.

    NLTK stores the lexicon under the ``sentiment/`` category as a zip file,
    so the lookup has to use that full resource path. Looking up the bare
    package name always raises LookupError, which forces a download attempt
    on every run.
    """
    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        print("Downloading VADER lexicon...")
        # nltk.download() reports failure through its return value rather
        # than by raising, so only announce success when it returned True.
        if nltk.download('vader_lexicon'):
            print("VADER lexicon downloaded successfully.")
        else:
            print("VADER lexicon download failed.")

ensure_vader_lexicon() # make sure the lexicon is available before the analyzer is built

# Module-level analyzer instance; get_sentiment_score() reads this global.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Returns the VADER compound score of ``text``.

    Anything that is not a string (NaN, None, numbers, ...) is scored as 0
    without consulting the analyzer.
    """
    if not isinstance(text, str):
        return 0  # default for missing / non-text values
    return analyzer.polarity_scores(text)['compound']

def categorize_sentiment(score):
    """Maps a compound score to 'positive', 'negative' or 'neutral'.

    Scores of 0.05 and above are positive, scores of -0.05 and below are
    negative, and everything in between is neutral.
    """
    if score >= 0.05:
        return 'positive'
    return 'negative' if score <= -0.05 else 'neutral'

def analyze_sentiment_dataframe(df, text_column='text'):
    """Adds sentiment scores and categories to a DataFrame.

    Args:
        df: DataFrame holding the texts. It is modified in place: the columns
            'sentiment_score' and 'sentiment_category' are added (or
            overwritten). Pass a copy to keep the original untouched.
        text_column: Name of the column containing the text to score.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If ``text_column`` is not a column of ``df``.
    """

    # Ensure the provided column exists
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame.")

    # Walk the text column directly: iterrows() would build a full row Series
    # for every record just to read a single value from it.
    sentiment_scores = [get_sentiment_score(text) for text in df[text_column]]

    df['sentiment_score'] = sentiment_scores #numerical sentiment
    df['sentiment_category'] = [
        categorize_sentiment(score) for score in sentiment_scores
    ] #categorical sentiment

    return df


# analyze_sentiment_dataframe() adds its columns to the frame it is given, so
# score a copy: `data` stays the raw load and `df` is a separate frame, in
# line with the later steps, which also pass copies.
df = analyze_sentiment_dataframe(data.copy(), text_column='text')  # Use your actual DataFrame and column name

print(df)

Downloading VADER lexicon...
VADER lexicon downloaded successfully.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dominiccheong/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


       Stock                   Datetime user.screen_name  \
0       AXTG  2021-03-25 17:50:13+00:00          UCitnow   
1       AXTG  2021-03-25 17:50:48+00:00          UCitnow   
2       AXTG  2021-03-25 18:47:48+00:00           Rad7RR   
3       AXTG  2021-03-25 20:02:03+00:00         GetScanz   
4       AXTG  2021-03-25 20:02:48+00:00  christinebarnum   
...      ...                        ...              ...   
10071  EEENF  2021-04-09 01:00:55+00:00      TVTVentures   
10072  EEENF  2021-04-09 01:06:47+00:00    BuyLowSell420   
10073  EEENF  2021-04-09 01:06:56+00:00      superlars34   
10074  EEENF  2021-04-09 01:07:55+00:00    DaveWhitman12   
10075  EEENF  2021-04-09 01:14:21+00:00       jerocker79   

                     id_str  \
0      1375142994920271872a   
1      1375143141058080768a   
2      1375157484063584261a   
3      1375176172099747845a   
4      1375176361560604679a   
...                     ...   
10071  1380324815689539585a   
10072  1380326289735708672a   


### Adding TF-IDF features

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import emoji

def preprocess_text(text):
    """
    Normalizes a tweet for vectorization.

    Emojis are first replaced by their text descriptions, then URLs, @mentions,
    punctuation and every remaining character that is not an ASCII letter,
    whitespace or '#' are stripped, and the result is lower-cased.
    Non-string input (e.g. NaN) yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, flags) pairs, applied in order; every match is deleted.
    removals = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'@\w+', 0),                                # mentions
        (r'[^\w\s#]', 0),                            # punctuation (hashtags kept)
        (r'[^a-zA-Z\s#]', 0),                        # digits, underscores, other symbols
    )

    cleaned = emoji.demojize(text)  # emojis become text descriptions
    for pattern, flags in removals:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned.lower()

def create_tfidf_features(df, text_column='text', max_features=100):
    """
    Creates TF-IDF features from the specified text column with improved preprocessing.

    Args:
        df: Input DataFrame. It is not modified; the work is done on a derived
            frame.
        text_column: Column holding the raw text.
        max_features: Maximum number of TF-IDF terms to keep.

    Returns:
        A tuple ``(features_df, vectorizer)``. ``features_df`` has a fresh
        0..n-1 index, its ``text_column`` holds the *preprocessed* text (URLs,
        mentions and punctuation already stripped), and one column per TF-IDF
        term is appended. ``vectorizer`` is the fitted TfidfVectorizer.
    """

    # Preprocess into a new frame instead of overwriting the caller's column.
    cleaned_text = df[text_column].apply(preprocess_text)
    prepared = df.assign(**{text_column: cleaned_text})

    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words='english',
        token_pattern=r'\b\w+\b|\B#\w+\b'  # Include hashtags
    )
    tfidf_matrix = vectorizer.fit_transform(prepared[text_column])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    # Both frames are re-indexed so that the concat aligns rows by position.
    result = pd.concat([prepared.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

    return result, vectorizer

# NOTE: the 'text' column of the returned frame holds the preprocessed text
# (URLs, @mentions and punctuation removed), not the original tweets.
df_with_tfidf_features,vectorizer = create_tfidf_features(df.copy())
print(df_with_tfidf_features)

       Stock                   Datetime user.screen_name  \
0       AXTG  2021-03-25 17:50:13+00:00          UCitnow   
1       AXTG  2021-03-25 17:50:48+00:00          UCitnow   
2       AXTG  2021-03-25 18:47:48+00:00           Rad7RR   
3       AXTG  2021-03-25 20:02:03+00:00         GetScanz   
4       AXTG  2021-03-25 20:02:48+00:00  christinebarnum   
...      ...                        ...              ...   
10071  EEENF  2021-04-09 01:00:55+00:00      TVTVentures   
10072  EEENF  2021-04-09 01:06:47+00:00    BuyLowSell420   
10073  EEENF  2021-04-09 01:06:56+00:00      superlars34   
10074  EEENF  2021-04-09 01:07:55+00:00    DaveWhitman12   
10075  EEENF  2021-04-09 01:14:21+00:00       jerocker79   

                     id_str  \
0      1375142994920271872a   
1      1375143141058080768a   
2      1375157484063584261a   
3      1375176172099747845a   
4      1375176361560604679a   
...                     ...   
10071  1380324815689539585a   
10072  1380326289735708672a   


#### Saving tfidf_vectorizer

In [25]:
import joblib
from pathlib import Path

# joblib.dump() does not create missing directories, so make sure the target
# folder exists before writing the fitted vectorizer.
Path('model_training/models').mkdir(parents=True, exist_ok=True)

joblib.dump(vectorizer, 'model_training/models/tfidf_vectorizer.joblib')

['model_training/models/tfidf_vectorizer.joblib']

### Adding centrality 

#### Mentioned handles

In [22]:
def extract_twitter_handles_from_dataframe(df):
  """
  Adds a 'mentioned_handles' column with the @-handles found in each tweet.

  Args:
    df: The pandas DataFrame; tweet text is read from its 'text' column.
      The frame is modified in place.

  Returns:
    The same DataFrame, with 'mentioned_handles' holding a list of handles
    (without the leading '@') per row, or None when the text is not a string
    or mentions nobody.
  """
  mention_pattern = r"@([a-zA-Z0-9_]+)"

  def extract_handles(text):
    # Non-string cells (e.g. NaN) cannot contain mentions.
    if not isinstance(text, str):
      return None
    found = re.findall(mention_pattern, text)
    return found if found else None

  df['mentioned_handles'] = df['text'].apply(extract_handles)
  return df

# Mentions have to be read from the raw tweets in `df`: the 'text' column of
# df_with_tfidf_features was replaced by preprocess_text() output, which has
# every @mention stripped, so extracting from it would always yield None.
raw_mentions = extract_twitter_handles_from_dataframe(df[['text']].copy())['mentioned_handles']

twitter_df_with_handles = df_with_tfidf_features.copy()
# df_with_tfidf_features was re-indexed to 0..n-1 with the row order unchanged,
# so attach the handles by position rather than by index label.
twitter_df_with_handles['mentioned_handles'] = raw_mentions.to_numpy()

In [27]:
# Export sentiment, TF-IDF and mention columns together. 'mentioned_handles'
# holds Python lists, which CSV stores as text — presumably they need parsing
# again (e.g. ast.literal_eval) when the file is read back; verify in the consumer.
twitter_df_with_handles.to_csv("data/scored_tweets_final_translated_with_TFIDF.csv",index = False)