# Data Pre-Processing

## Read in data

In [1]:
# Packages
import pandas as pd
import numpy as np
import datetime
import re

In [2]:
# Load the two tweet CSV files; the paths are relative to the notebook's working directory.
tesla = pd.read_csv('../Data/Tesla_tweets.csv')
# NOTE(review): `tesla_hash` is not used in the cells visible here; presumably consumed later (confirm).
tesla_hash = pd.read_csv('../Data/@Tesla_tweets.csv')

In [3]:
# Summarize column dtypes and non-null counts to check for missing values.
tesla.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3228 entries, 0 to 3227
Data columns (total 3 columns):
id            3228 non-null int64
created_at    3228 non-null object
text          3228 non-null object
dtypes: int64(1), object(2)
memory usage: 75.7+ KB


In [4]:
# Preview the first rows of the raw data.
tesla.head()

Unnamed: 0,id,created_at,text
0,1009896570928521216,2018-06-21 20:31:20,b'@Barraco_Bama accelerator pedal'
1,1009891272440492032,2018-06-21 20:10:16,b'@A_MrNoodle Model X P100D'
2,1009890165907582976,2018-06-21 20:05:53,b'\xe2\x98\x80\xef\xb8\x8f https://t.co/J6mdP7...
3,1009837611932737537,2018-06-21 16:37:03,b'@Polintweet Yes - two Powerwalls will give y...
4,1009616153281323008,2018-06-21 01:57:03,b'@thegreat35t we support the protestors and a...


## Training-test split

## Pre-Processing: Getting Necessary Variables

First we deal with the 'tesla' data. There are several things that we should notice/want to achieve:

1. the "created_at" variable is not a datetime object 

2. We need the year-month-day extracted

3. We need a dummy variable indicating whether a tweet was posted on a weekend, when the stock market is closed.

4. We need a variable that counts the number of tweets each day.

In [5]:
# Convert 'created_at' from strings to datetime values.
# pd.to_datetime is vectorized and, unlike a per-row strptime, does not fail when this
# cell is re-run after the column has already been converted.
tesla['created_at'] = pd.to_datetime(tesla['created_at'], format='%Y-%m-%d %H:%M:%S')

# Define a function to extract the ymd information
def get_ymd(date_time):
    """
    This function takes in a date_time value and truncates it to its calendar day.
    
    ---Parameters
    `date_time` (datetime): the date time value; any object exposing integer
        `year`, `month` and `day` attributes works
    
    ---Returns
    a `datetime.datetime` with the same year-month-day and the time set to 00:00:00
    
    ---Raises
    AttributeError if `date_time` has no `year`, `month` or `day` attribute (e.g. None)
    
    """
    
    # Build the result directly from the date fields. Formatting to a string and
    # parsing it back is unnecessary and breaks for years with fewer than four digits.
    return datetime.datetime(date_time.year, date_time.month, date_time.day)

# Derive the calendar day of each tweet from its timestamp
tesla['ymd'] = tesla['created_at'].apply(get_ymd)

In [6]:
# Preview the data with the new 'ymd' column.
tesla.head()

Unnamed: 0,id,created_at,text,ymd
0,1009896570928521216,2018-06-21 20:31:20,b'@Barraco_Bama accelerator pedal',2018-06-21
1,1009891272440492032,2018-06-21 20:10:16,b'@A_MrNoodle Model X P100D',2018-06-21
2,1009890165907582976,2018-06-21 20:05:53,b'\xe2\x98\x80\xef\xb8\x8f https://t.co/J6mdP7...,2018-06-21
3,1009837611932737537,2018-06-21 16:37:03,b'@Polintweet Yes - two Powerwalls will give y...,2018-06-21
4,1009616153281323008,2018-06-21 01:57:03,b'@thegreat35t we support the protestors and a...,2018-06-21


Next we will calculate the tweet count 

In [7]:
# Number of distinct tweet ids per calendar day, one row per day ordered by date.
tweet_count = (
    tesla.groupby('ymd')['id']
    .nunique()
    .reset_index(name='count')
    .sort_values('ymd')
)
tweet_count.head()

Unnamed: 0,ymd,count
0,2014-03-06,2
1,2014-03-07,17
2,2014-03-08,4
3,2014-03-09,1
4,2014-03-10,11


In [8]:
# Attach each day's tweet count to every tweet from that day; the left join keeps all tweets.
tb1 = tesla.merge(tweet_count, on='ymd', how='left')
tb1.head()

Unnamed: 0,id,created_at,text,ymd,count
0,1009896570928521216,2018-06-21 20:31:20,b'@Barraco_Bama accelerator pedal',2018-06-21,6
1,1009891272440492032,2018-06-21 20:10:16,b'@A_MrNoodle Model X P100D',2018-06-21,6
2,1009890165907582976,2018-06-21 20:05:53,b'\xe2\x98\x80\xef\xb8\x8f https://t.co/J6mdP7...,2018-06-21,6
3,1009837611932737537,2018-06-21 16:37:03,b'@Polintweet Yes - two Powerwalls will give y...,2018-06-21,6
4,1009616153281323008,2018-06-21 01:57:03,b'@thegreat35t we support the protestors and a...,2018-06-21,6


## Pre-Processing: Extracting Features from the text

In this step, we will extract features from the text data (in our case, tweets).

Ultimately, we would like to achieve the following:

1. From the training data, choose the influential terms (uni-grams and bi-grams) and save them in a set.

2. For the test data, develop a function that checks whether each text contains the terms found in step 1 and one-hot encodes them.

Assuming that we find $p$ terms in step 1, the final training data will have dimension $n \times (p + 2)$: one indicator column per term, plus the closing prices for the past two days.

First we start with defining necessary functions

In [11]:
def words_in_text(words, text):
    """This function takes in a list of words and text content. 
    It outputs a NumPy array containing either a 0 or a 1 for each word in the list. 
    The value is 0 if the word doesn't appear among the tokens of the text and 1 if it does.
    Matching is exact (case-sensitive) against the tokens produced by `nltk.word_tokenize`.
    
    NOTE(review): terms containing whitespace (e.g. bi-grams) presumably never match,
    because each term is compared against individual tokens; confirm against the tokenizer.
    
    ---Args:
        `words` (list of str): words to find
        `text` (str): string to search in
    
    ---Returns:
        NumPy integer array containing either 0 or 1 for each word in words
        (0 if the word is not in text, 1 if the word is).
    """
    
    import nltk
    
    # A set gives constant-time lookups and avoids `x in ndarray`, which compares
    # elementwise and depends on NumPy str-vs-float comparison when text has no tokens.
    tokens = set(nltk.word_tokenize(text))
    
    return np.array([word in tokens for word in words], dtype=int)

# Sanity checks: a word present in the text maps to 1, an absent word maps to 0,
# and a word that occurs more than once still yields a single 1.
assert np.allclose(words_in_text(['hello'], 'hello world'),
                   [1])
assert np.allclose(words_in_text(['hello', 'bye', 'world'], 'hello world hello'),
                   [1, 0, 1])

In [15]:
def words_in_texts(words, texts):
    """This function takes in a list of words and a pandas Series of texts. 
    It outputs a 2-dimensional NumPy matrix containing one row for each text
    and one column for each word.
    
    ---Args:
        `words` (list of str): words to find
        `texts` (Series of str): strings to search in
    
    ---Returns:
        NumPy array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words.
    
    ---Raises:
        TypeError if `texts` is not a pandas Series
    """
    # isinstance accepts Series subclasses and avoids building a throwaway
    # empty Series on every call just to obtain its type.
    if not isinstance(texts, pd.Series):
        raise TypeError ('texts should be a pandas series')
    
    rows = [words_in_text(words, text) for text in texts]
    
    # Reshape so that an empty Series still yields a 2-D (0, p) matrix.
    return np.array(rows, dtype=int).reshape(len(rows), len(words))

# Sanity check: the result has one row per text and one column per word.
# If this does not raise, the function produces the expected output for this example.
assert np.allclose(words_in_texts(['hello', 'bye', 'world'], pd.Series(['hello', 'hello world hello'])),
                   np.array([[1, 0, 0], [1, 0, 1]]))

pandas.core.series.Series