## Text Preprocessing for Sentiment Analysis
This Python script performs essential text preprocessing on social media data collected from the StockTwits platform. The preprocessing includes tasks such as punctuation removal, lowercase conversion, handling contractions, removing tickers and user mentions, eliminating URLs, handling repeated characters, and removing extra whitespaces. The preprocessed data is prepared for sentiment analysis using the FinBERT sentiment scoring model, enabling accurate and insightful analysis of social media sentiment's impact on stock prices.

## This is working fine 

In [70]:
import os 
import re
import string
import contractions
import pandas as pd

def preprocess_text(text):
    # Remove cashtags and ticker symbols (e.g., $PG)
    text = re.sub(r'[$][A-Za-z][\S]*', '', text)
    
    # Remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # Lowercase
    text = text.lower()

    # Handle apostrophes and contractions
    text = contractions.fix(text)


    # Remove user mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove all emojis using regular expression
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Handle repeated characters (simple spell corrector)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Input file path
input_file_path = r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\NVDA\NVDA_4\sorted_Nvidia_4.csv'

# Output file name
output_file_name = 'processed_Nvidia_4.csv'

# Get the folder path of the input file
folder_path = os.path.dirname(input_file_path)

# Full path for the output file
output_file_path = os.path.join(folder_path, output_file_name)

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(input_file_path)

# Apply the preprocess_text function to the 'text' column after replacing missing values with an empty string
df['processed_text'] = df['text'].fillna('').apply(preprocess_text)

# Save the updated DataFrame to a new CSV file in the same folder
df.to_csv(output_file_path, index=False)

print(f"Preprocessing completed. The processed data has been saved to '{output_file_path}'.")


Preprocessing completed. The processed data has been saved to 'C:\Users\sahma\Desktop\Thises\Stocks\stocks\NVDA\NVDA_4\processed_Nvidia_4.csv'.


In [3]:
import pandas as pd
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\NVDA\NVDA_4\processed_Nvidia_4.csv')


In [5]:
df.head()

Unnamed: 0,id,text,time,sentiment,processed_text
0,440095047,Sweep Options Activity: $NVDA is the #18 ticke...,2022-02-28,,sweep options activity is the 18 ticker with s...
1,440090156,$NVDA I’m so bullish,2022-02-28,Bullish,i am so bullish
2,440090065,$NVDA boys I’m so bullish,2022-02-28,Bullish,boys i am so bullish
3,440086959,$PLUG $tsla $arkk $f $NVDA \n \nIf Putin will...,2022-02-28,,if putin will kill his own brothers then what ...
4,440086726,$NVDA give me a 10% day tmr pls 👀,2022-02-28,Bullish,give me a 10 day tomorrow pls


In [6]:
df.tail()

Unnamed: 0,id,text,time,sentiment,processed_text
102323,496644313,$NVDA amazing this went green at all with reve...,2022-11-16,Bearish,amazing this went green at all with revenues d...
102324,496629362,$NVDA 🤣🤣🤣 Nothing burger???,2022-11-16,,nothing burger
102325,496644495,$NVDA how is this seriously not down $20?,2022-11-16,,how is this seriously not down 20
102326,496644485,$NVDA sell before 162.00 breakdown,2022-11-16,,sell before 16200 breakdown
102327,496644269,$NVDA They missed on EPS and reduced guidance....,2022-11-16,Bearish,they missed on eps and reduced guidance gltu n...


In [21]:
import pandas as pd
from tabulate import tabulate
from colorama import Fore, Style


# Read the CSV file into a pandas DataFrame
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\NVDA\NVDA_4\processed_Nvidia_4.csv')

# Count the number of None values in the 'processed_text' column
num_none_values = df['processed_text'].isnull().sum()

# Get the total number of values in the 'processed_text' column
total_values = len(df['processed_text'])

# Calculate the percentage of None values
percentage_none_values = (num_none_values / total_values) * 100

# Calculate the percentage of None values
after_removing_none = (total_values - num_none_values)

# Create a table with the information
table = [
    ["Total", total_values],
    ["Number of None", num_none_values],
    ["Percentage of None", f"{percentage_none_values:.2f}%"],
    ["After removing None values", after_removing_none]
]

# Print the table with colored output
print(Fore.BLUE + tabulate(table, headers=["Description (values in column 'processed_text')", " Values"], tablefmt="fancy_grid") + Style.RESET_ALL)




[34m╒═══════════════════════════════════════════════════╤═══════════╕
│ Description (values in column 'processed_text')   │  Values   │
╞═══════════════════════════════════════════════════╪═══════════╡
│ Total                                             │ 102328    │
├───────────────────────────────────────────────────┼───────────┤
│ Number of None                                    │ 4476      │
├───────────────────────────────────────────────────┼───────────┤
│ Percentage of None                                │ 4.37%     │
├───────────────────────────────────────────────────┼───────────┤
│ After removing None values                        │ 97852     │
╘═══════════════════════════════════════════════════╧═══════════╛[0m


## Devide file into multi files equally to reduce the row count for processing

In [49]:
import pandas as pd

# Read the original CSV file
input_file = r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\TSLA\TESLA_4\processed_Tesla_4.csv'
data = pd.read_csv(input_file)

# Split the data into parts
num_rows = len(data)
rows_per_file = num_rows // 7

for i in range(7):
    start_idx = i * rows_per_file
    end_idx = (i + 1) * rows_per_file if i < 6 else num_rows
    subset_data = data.iloc[start_idx:end_idx]
    
    output_file = f'processed_Tesla_4.{i + 1}.csv'
    subset_data.to_csv(output_file, index=False)
    print(f'Saved {output_file}')

print("Files processed and saved successfully.")



Saved processed_Tesla_4.1.csv
Saved processed_Tesla_4.2.csv
Saved processed_Tesla_4.3.csv
Saved processed_Tesla_4.4.csv
Saved processed_Tesla_4.5.csv
Saved processed_Tesla_4.6.csv
Saved processed_Tesla_4.7.csv
Files processed and saved successfully.


In [50]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\TSLA\TESLA_4\processed_Tesla_4.csv')
df.head()
len(df)

678068

In [46]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\TSLA\TESLA_3\processed_Tesla_3.1.csv')
df.head()
len(df)

128138

In [47]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\TSLA\TESLA_3\processed_Tesla_3.7.csv')
df.head()
len(df)

128141

In [17]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\TSLA\TESLA_2\processed_Tesla_2.4.csv')
df.head()

Unnamed: 0,id,text,time,sentiment,processed_text
0,463680229,$TSLA shows a strong growth in Revenue. In the...,02-06-2022,,shows a strong growth in revenue in the last y...
1,463680133,$TSLA this is headed to R2 big resistance folk...,02-06-2022,Bullish,this is headed to r2 big resistance folks 7882
2,463678141,$TSLA will be 840 tomorrow. Mark this,02-06-2022,Bullish,will be 840 tomorrow mark this
3,463680065,$TSLA reverse h/s pattern on the daily is a me...,02-06-2022,,reverse hs pattern on the daily is a measured ...
4,463679574,$TSLA got in on a whim in the 600s last week a...,02-06-2022,,got in on a whim in the 600s last week after t...


In [18]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\TSLA\TESLA_2\processed_Tesla_2.5.csv')
df.head()

Unnamed: 0,id,text,time,sentiment,processed_text
0,475147672,$TSLA don’t buy at the very top!! Use your bra...,01-08-2022,Bearish,do not buy at the very top use your brain tesl...
1,475147569,$TSLA squeeze inbound. $930 today $970 tmmrw g...,01-08-2022,Bullish,squeeze inbound 930 today 970 tmmrw go night n...
2,475147499,$TSLA sold 100%,01-08-2022,Bearish,sold 100
3,475147459,$TSLA this is a call option pump and will rall...,01-08-2022,,this is a call option pump and will rally into...
4,475147451,$TSLA 🤮🤮🤮,01-08-2022,,


In [32]:
df = pd.read_csv(r'C:\Users\sahma\Desktop\Thises\Stocks\stocks\TSLA\TESLA_2\finbert_processed_Tesla_2.3.csv')

In [33]:
df.head()

Unnamed: 0,id,text,time,sentiment,processed_text,finbert_sentiment,finbert_score
0,451610840,@MostlyBullish95 running around talking trash ...,12-04-2022,Bullish,mostlybullish95 running around talking trash a...,Positive,0.999563
1,451610854,"$TSLA I gotta say I&#39;m getting a kick, watc...",12-04-2022,,i got to say i39m getting a kick watching your...,Positive,0.999986
2,451610959,$TSLA markets expecting the worse from cpi dat...,12-04-2022,,markets expecting the worse from cpi data so m...,Negative,0.990494
3,451610963,$TSLA wait till y’all find out Tesla couldn’t ...,12-04-2022,,wait till you all find out tesla could not mak...,Neutral,0.994407
4,451607823,$TSLA LFG,12-04-2022,Bullish,lfg,Neutral,0.998997


In [34]:
df.tail()

Unnamed: 0,id,text,time,sentiment,processed_text,finbert_sentiment,finbert_score
108474,463678210,VIDEO: Broad Market Technical Analysis Chart 6...,02-06-2022,,video broad market technical analysis chart 62...,Neutral,0.999999
108475,463681035,$ABML anyone here talking/speculating about $T...,02-06-2022,Bullish,anyone here talkingspeculating about battery f...,Negative,0.872641
108476,463680753,$TSLA poe smoked and lonely,02-06-2022,,poe smoked and lonely,Neutral,0.995271
108477,463680536,$TSLA nice 1% move AH haven’t seen this streng...,02-06-2022,Bullish,nice 1 move ah have not seen this strength in ...,Negative,0.82982
108478,463680390,$TSLA I feel like fucking Neo,02-06-2022,Bullish,i feel like fucking neo,Neutral,0.820125
