### Process Twitter Dataset

In [1]:
import pandas as pd
import numpy as np

import preprocessor as preprocessor_model
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
def restrict_data(file_path, number_of_files):
    """Load numbered tweet CSVs and keep only the day and the tweet text.

    Files are located by appending an index (0 .. number_of_files - 1) and
    the '.csv' suffix to ``file_path``, so ``file_path`` is a path *prefix*.

    Parameters
    ----------
    file_path : str
        Prefix shared by every input file.
    number_of_files : int
        How many consecutively numbered files to read.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file with the columns 'Date' (the part of the source
        ``date`` value before the first space) and 'Tweet'.
    """
    frames = []
    for index in range(number_of_files):
        csv_path = file_path + str(index) + '.csv'
        print('Processing {}'.format(csv_path))
        raw_frame = pd.read_csv(csv_path)
        # Drop everything after the first space so only the leading token remains.
        days = [stamp.split(' ')[0] for stamp in raw_frame['date'].tolist()]
        texts = raw_frame['tweet'].tolist()
        frames.append(pd.DataFrame({'Date': days, 'Tweet': texts}))
    return frames

In [3]:
def concatenate_dfs(dfs):
    """Stack the per-file tweet frames into a single frame.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames that each have a 'Date' column (strings whose first
        '-'-separated token is printed as a progress label) and a 'Tweet'
        column.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns 'Date' and 'Tweet' holding the rows of all
        input frames in order, on a fresh 0..n-1 index. Empty when ``dfs``
        is empty or contains only empty frames.
    """
    dates = []
    tweets = []
    for df in dfs:
        frame_dates = df['Date'].tolist()
        if not frame_dates:
            # An empty file has no first row to read the label from; indexing
            # [0] here used to raise IndexError and abort the whole run.
            print("Skipping empty dataframe")
            continue
        print("Adding data for {}".format(frame_dates[0].split('-')[0]))
        dates.extend(frame_dates)
        tweets.extend(df['Tweet'].tolist())
    return pd.DataFrame({'Date': dates, 'Tweet': tweets})

In [4]:
# Shared analyzer instance, created on first use by get_sentiment.
_SENTIMENT_MODEL = None


def get_sentiment(tweet):
    """Return the VADER compound sentiment score for a piece of text.

    The analyzer is built once and reused: constructing a
    SentimentIntensityAnalyzer re-reads its lexicon, and this function is
    called once per tweet, so building it on every call dominated run time.

    Parameters
    ----------
    tweet : str
        Text to score.

    Returns
    -------
    float
        The 'compound' entry of ``polarity_scores(tweet)``.
    """
    global _SENTIMENT_MODEL
    if _SENTIMENT_MODEL is None:
        _SENTIMENT_MODEL = SentimentIntensityAnalyzer()
    return _SENTIMENT_MODEL.polarity_scores(tweet)['compound']

def get_cleaned_text(text):
    """Run ``text`` through ``preprocessor_model.clean`` and return the result."""
    return preprocessor_model.clean(text)



In [5]:
def interpolate_sentiment(stock_df, sentiment_df):
    """Align daily sentiment scores to the stock dates and fill the gaps.

    Each stock date is looked up in ``sentiment_df``; dates without a score
    become NaN and are then filled by linear interpolation. Leading and
    trailing gaps are filled from the nearest available score. If no date
    matches at all, every score stays NaN.

    Only the score column is interpolated. Interpolating the whole frame
    also touched the object-dtype 'Date' column, which newer pandas
    deprecates.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Must contain a 'Date' column; its values define the output rows.
    sentiment_df : pandas.DataFrame
        Must contain 'Date' and 'Sentiment Score' columns. If a date occurs
        more than once, the last occurrence wins.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' (the stock dates, in order) and 'Sentiment Score',
        on a fresh 0..n-1 index. The inputs are not modified.
    """
    stock_dates = stock_df['Date'].tolist()
    score_by_date = dict(zip(sentiment_df['Date'].tolist(),
                             sentiment_df['Sentiment Score'].tolist()))

    # NaN marks the stock dates that have no tweet-derived score yet.
    aligned_scores = pd.Series(
        [score_by_date.get(day, np.nan) for day in stock_dates],
        dtype='float64',
    )

    # 'both' fills interior, trailing and leading gaps in a single pass,
    # matching the former forward-then-backward pair of calls.
    filled_scores = aligned_scores.interpolate(method='linear',
                                               limit_direction='both')

    return pd.DataFrame({'Date': stock_dates,
                         'Sentiment Score': filled_scores})

In [6]:
def create_dataset(stocks_path, tweets_path, number_of_files, save_path):
    """Build the stock + tweet-sentiment dataset and write it to CSV.

    Steps: load and stack the tweet files, clean each tweet, score it,
    average the scores per date, align them to the stock dates (filling
    gaps by interpolation), attach them to the stock table as the
    'Sentiment Score' column, print a sample and save the result.

    Parameters
    ----------
    stocks_path : str
        CSV of stock data; must contain a 'Date' column.
    tweets_path : str
        Path prefix of the tweet CSVs, as expected by ``restrict_data``.
    number_of_files : int
        Number of tweet files to read.
    save_path : str
        Destination of the combined CSV (UTF-8, no index column).
    """
    tweets_df = concatenate_dfs(restrict_data(tweets_path, number_of_files))

    tweets_df['Cleaned Tweet'] = [
        get_cleaned_text(raw_text) for raw_text in tweets_df['Tweet'].tolist()
    ]
    tweets_df['Sentiment Score'] = [
        get_sentiment(clean_text) for clean_text in tweets_df['Cleaned Tweet'].tolist()
    ]

    # One averaged score per calendar date.
    daily_sentiment_df = (
        tweets_df[['Date', 'Sentiment Score']]
        .groupby(['Date'], as_index=False)
        .mean()
    )

    stock_df = pd.read_csv(stocks_path)

    print('Interpolating Sentiment Scores')
    filled_sentiment_df = interpolate_sentiment(stock_df, daily_sentiment_df)

    # Both frames carry the default 0..n-1 index, so the scores line up row by row.
    stock_sentiment_df = stock_df.assign(
        **{"Sentiment Score": filled_sentiment_df["Sentiment Score"]}
    )

    print("Sample of Data - \n\n{}\n\n".format(stock_sentiment_df.head()))

    print('Writing Data to CSV - {}'.format(save_path))
    stock_sentiment_df.to_csv(save_path, encoding='utf-8', index=False)
    

In [7]:
# Stock CSV read by create_dataset; it must provide a 'Date' column.
stocks_path = "data/processed_dataset/Bound_TSLA.csv"
# Path prefix, not a complete file name: restrict_data appends the file index
# and '.csv', so indices 0-9 resolve to .../2010.csv through .../2019.csv.
tweets_path = 'data/raw/Elon_Musk_tweets/201'
# Number of consecutively numbered tweet files to read (indices 0..9).
number_of_files = 10
# Destination of the combined stock + sentiment CSV written by create_dataset.
save_path = "data/processed_dataset/dataset_p1.csv"


In [8]:
# Run the full pipeline: reads the tweet and stock CSVs, prints progress and a
# sample of the result, and writes the combined dataset to save_path.
create_dataset(stocks_path, tweets_path, number_of_files, save_path)

Processing data/raw/Elon_Musk_tweets/2010.csv
Processing data/raw/Elon_Musk_tweets/2011.csv
Processing data/raw/Elon_Musk_tweets/2012.csv
Processing data/raw/Elon_Musk_tweets/2013.csv
Processing data/raw/Elon_Musk_tweets/2014.csv
Processing data/raw/Elon_Musk_tweets/2015.csv
Processing data/raw/Elon_Musk_tweets/2016.csv
Processing data/raw/Elon_Musk_tweets/2017.csv
Processing data/raw/Elon_Musk_tweets/2018.csv
Processing data/raw/Elon_Musk_tweets/2019.csv
Adding data for 2010
Adding data for 2011
Adding data for 2012
Adding data for 2013
Adding data for 2014
Adding data for 2015
Adding data for 2016
Adding data for 2017
Adding data for 2018
Adding data for 2019
Interpolating Sentiment Scores
Sample of Data - 

         Date       Open   High        Low      Close  Adj Close    Volume  \
0  2010-06-29  19.000000  25.00  17.540001  23.889999  23.889999  18766300   
1  2010-06-30  25.790001  30.42  23.299999  23.830000  23.830000  17187100   
2  2010-07-01  25.000000  25.92  20.270000  21