<a href="https://colab.research.google.com/github/DDave94/Stock-Prediction-DL/blob/main/ExtractingTwitterSentiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter Data Cleansing and Sentiment Analysis
##### *Step 1: Cleaning Date Column. Creating Date column with YYYY-MM-DD format*
##### *Step 2: Cleaning Tweet Content. Removing links, mentions, hashtags, and other special characters from tweets*
##### *Step 3: Sentiment Calculations. Calculating a compound sentiment score for each tweet*
##### *Step 4: Aggregating sentiment scores for each day (mean average) to obtain overall sentiment for each day*
##### *Step 5: Creating final Twitter dataframe with data from all years of tweets and sentiment scores*


In [None]:
#Installing and importing important libs
!pip install flair
import pandas as pd
import re
from datetime import datetime
import flair

# Widen the column display limit to 100 characters so printed tweets are not cut short.
pd.options.display.max_colwidth = 100

In [None]:
# Raw Microsoft-stock tweet exports, one CSV per calendar year.
# Later cells read the 'Datetime' and 'Text' columns from these frames.
df_2017, df_2018, df_2019, df_2020 = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/DDave94/Stock-Prediction-DL/main/datasets/raw/text-query-tweets-2017.csv",
        "https://raw.githubusercontent.com/DDave94/Stock-Prediction-DL/main/datasets/raw/text-query-tweets-2018.csv",
        "https://raw.githubusercontent.com/DDave94/Stock-Prediction-DL/main/datasets/raw/text-query-tweets-2019.csv",
        "https://raw.githubusercontent.com/DDave94/Stock-Prediction-DL/main/datasets/raw/text-query-tweets-2020.csv",
    )
)

# Step 1: Cleaning Dates
###### *Creating Date column by removing time component, since we want to aggregate sentiment scores on a daily basis*

In [None]:
def date_generation(date):
    """Return the calendar date of a timestamp string.

    Args:
        date: Timestamp text in the exact form ``YYYY-MM-DD HH:MM:SS+00:00``.

    Returns:
        A ``datetime.date`` with the time-of-day component dropped.

    Raises:
        ValueError: If ``date`` does not match the expected format.
    """
    return datetime.strptime(date, "%Y-%m-%d %H:%M:%S+00:00").date()

def date_cleanse(input_df):
    """Add a day-level 'Date' column derived from the 'Datetime' column.

    The frame is modified in place and the same object is returned.

    Args:
        input_df: DataFrame with a 'Datetime' column of timestamp strings.

    Returns:
        ``input_df`` itself, now carrying the extra 'Date' column.
    """
    date_only = input_df['Datetime'].apply(date_generation)
    input_df['Date'] = date_only
    return input_df

# Add the day-level 'Date' column to each yearly frame.
df_2017, df_2018, df_2019, df_2020 = [
    date_cleanse(yearly_df)
    for yearly_df in (df_2017, df_2018, df_2019, df_2020)
]

# Step 2: Cleaning Tweet content
###### *Removing links, mentions, hashtags etc.*

In [None]:
def tweet_cleanse(text):
    """Strip links, cashtags, mentions and punctuation noise from a raw tweet.

    The substitutions run in the order written below. Hyperlinks are removed
    first, so an '=' inside a URL query string never reaches the '=' rule.

    Args:
        text: Raw tweet text as a ``str``.

    Returns:
        The cleaned tweet. Each run of removed punctuation/whitespace becomes
        a single space, so leading or trailing spaces may remain.
    """
    # Drop hyperlinks (http/https up to the next whitespace).
    text = re.sub(r'https?:\/\/\S+', '', text)

    # Drop '$' tokens such as "$MSFT" or "$100".
    # The class must be A-Za-z: the range A-z also covers the punctuation
    # between 'Z' and 'a' ([ \ ] ^ _ `) and would swallow adjacent text.
    text = re.sub(r'\$[A-Za-z0-9]+', '', text)

    # Drop the "Read more:" boilerplate and bare MSFT tokens.
    text = re.sub(r'Read more:|MSFT', '', text)

    # Drop @mentions. Underscores are included so that a handle such as
    # "@john_doe" is removed whole instead of leaving "doe" in the tweet.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)

    # Drop an '=' symbol together with everything after it (newlines included).
    text = re.sub(r'=[\S\D\s]+', '', text)

    # Replace every remaining run of characters other than letters, digits,
    # '?', '!' and the apostrophe with a single space.
    text = re.sub(r"[^A-Za-z0-9?!']+", ' ', text)

    return text

def gen_tweets(input_df):
    """Add a 'Tweet' column holding the cleaned version of each 'Text' entry.

    The frame is modified in place and the same object is returned.

    Args:
        input_df: DataFrame with a 'Text' column of raw tweet strings.

    Returns:
        ``input_df`` itself, now carrying the extra 'Tweet' column.
    """
    cleaned = input_df['Text'].apply(tweet_cleanse)
    input_df['Tweet'] = cleaned
    return input_df

# Build the cleaned 'Tweet' column for each yearly frame.
df_2017, df_2018, df_2019, df_2020 = [
    gen_tweets(yearly_df)
    for yearly_df in (df_2017, df_2018, df_2019, df_2020)
]

# Step 3: Tweet Sentiment Calculations
###### *Generating the compound sentiment score for each tweet using the DistilBERT sentiment classifier from Flair*

In [None]:
# Load Flair's pre-trained 'en-sentiment' text classifier once; get_sentiment reuses it.
sentiment_model = flair.models.TextClassifier.load('en-sentiment')


def get_sentiment(tweet):
    """Return a signed sentiment confidence score for one tweet.

    Args:
        tweet: Cleaned tweet text.

    Returns:
        The confidence score of the first predicted label, negated when that
        label is 'NEGATIVE' and returned unchanged otherwise.
    """
    # Wrap the text in a Flair Sentence and let the classifier label it in place.
    sentence = flair.data.Sentence(tweet)
    sentiment_model.predict(sentence)

    # Only the first label is consulted.
    top_label = sentence.labels[0]
    if top_label.value != 'NEGATIVE':
        return top_label.score
    return -1 * top_label.score

def sentiment_generation(input_df):
    """Add a 'TwitterSentiment' column with one score per cleaned tweet.

    The frame is modified in place and the same object is returned.

    Args:
        input_df: DataFrame with a 'Tweet' column of cleaned tweet strings.

    Returns:
        ``input_df`` itself, now carrying the extra 'TwitterSentiment' column.
    """
    scores = input_df['Tweet'].apply(get_sentiment)
    input_df['TwitterSentiment'] = scores
    return input_df

# Score every cleaned tweet, one yearly frame at a time.
df_2017, df_2018, df_2019, df_2020 = [
    sentiment_generation(yearly_df)
    for yearly_df in (df_2017, df_2018, df_2019, df_2020)
]

# Preview the first ten cleaned 2017 tweets next to their scores.
print(df_2017[['Tweet', 'TwitterSentiment']].head(10))


2021-03-20 02:53:02,664 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt
                                                                                          Tweet  TwitterSentiment
0         Lumia 550 and 650 discounted at online Microsoft Store if you can find them in stock           0.768637
1                                                            Microsoft Dividend Stock Analysis           0.574826
2  Tesco still has Microsoft Lumia devices in stock as well as its approved partners in the UK           0.954907
3                                              Why is the Microsoft Store pruning Lumia stock?          -0.999738
4                                              Why is the Microsoft Store pruning Lumia stock?          -0.999738
5                                  3 Strengths That Make Microsoft Corporation Stock A Buy Now           0.998628
6                                  3 Strengths That Make Microsoft Corporation Stock A Buy Now           0.9986

# Step 4: Sentiment Score Aggregation
###### *Aggregating the final sentiment scores to gather scores for each day*

In [None]:
def aggregated_df(input_df):
    """Return the mean sentiment score for each date.

    Args:
        input_df: DataFrame with 'Date' and 'TwitterSentiment' columns.

    Returns:
        A new DataFrame with one row per distinct 'Date' and two columns,
        'Date' and 'TwitterSentiment', where the latter is the mean of that
        day's scores. ``input_df`` is not modified.

    Raises:
        KeyError: If either required column is missing.
    """
    # as_index=False keeps 'Date' as a regular column rather than the index.
    return input_df.groupby("Date", as_index=False)["TwitterSentiment"].mean()

# Collapse each yearly frame to one mean sentiment score per day.
final_2017, final_2018, final_2019, final_2020 = [
    aggregated_df(yearly_df)
    for yearly_df in (df_2017, df_2018, df_2019, df_2020)
]

# Step 5: Creating full input dataframe for further analysis and stock price predictions
###### *Combining Twitter data from 2017, 2018, 2019, and 2020*

In [None]:
# Stack the four yearly daily-sentiment frames into one table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Each frame keeps its own row index (no reset), which
# matches what append produced.
twitter_data = pd.concat([final_2017, final_2018, final_2019, final_2020])
print(twitter_data.shape)
print(twitter_data.head())
print(twitter_data.tail())

# Save the combined table without the index column, encoded as UTF-8 with a BOM.
twitter_data.to_csv('/content/twitter_sentiment_data.csv', index=False, encoding='utf-8-sig')

(1340, 2)
         Date  TwitterSentiment
0  2017-01-01          0.768637
1  2017-01-02         -0.117436
2  2017-01-03          0.499253
3  2017-01-04         -0.995186
4  2017-01-05          0.318698
           Date  TwitterSentiment
359  2020-12-25          0.016382
360  2020-12-26          0.979240
361  2020-12-27          0.408995
362  2020-12-28          0.544929
363  2020-12-29         -0.373856
