## Feature Engineering

#### Import data

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load the merged dataset; the first CSV column is used as the row index.
# (Comma is pandas' default delimiter, so it does not need to be spelled out.)
df = pd.read_csv('../merged_df/merged_df.csv', index_col=0)

In [8]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40600 entries, 0 to 40599
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   url                    40600 non-null  object 
 1   softTitle              40600 non-null  object 
 2   title                  40600 non-null  object 
 3   date                   40600 non-null  object 
 4   author                 40600 non-null  object 
 5   description            40600 non-null  object 
 6   keywords               40600 non-null  object 
 7   text                   40600 non-null  object 
 8   Article Length         40600 non-null  float64
 9   Stock                  40600 non-null  object 
 10  Open                   40600 non-null  float64
 11  Low                    40600 non-null  float64
 12  Close                  40600 non-null  float64
 13  Adjusted_Close         40600 non-null  float64
 14  Volume                 40600 non-null  float64
 15  Month  

In [9]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,url,softTitle,title,date,author,description,keywords,text,Article Length,Stock,Open,Low,Close,Adjusted_Close,Volume,Month,Price_Change_Pct,Adjusted_Close_Change
0,https://www.cnbc.com/2015/02/24/nials-rent-but...,Renting your place? Skipping this could cost you,Renting your place? Skipping this could cost you,2015-02-24,Landon Dowdy,Millennials are more likely to rent than to ow...,"Personal finance,business news",Personal FinanceRenting your place? Skipping t...,741.0,MSFT,44.299999,43.919998,44.09,38.414326,25271700.0,2.0,-0.135903,-0.001359
1,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,AMZN,19.024,18.808001,18.929501,18.929501,38416000.0,2.0,-0.407741,-0.004077


#### Keep only articles that mention a tracked company/ticker or a stock-market term

In [10]:
# Keep only articles whose text mentions one of the tracked companies
# (ticker, company name, CEO, flagship products) or a general
# stock-market / corporate-event term.
#
# The terms live in a list and are joined with "|" so the resulting regex
# contains no line breaks. Writing the pattern as a multi-line triple-quoted
# string puts a "\n" in front of the first term of every line (all five
# tickers and "Merger") and after the last term ("Regulation"), so those
# terms would only match right next to a newline in the article text.
keyword_terms = [
    # Tesla
    'TSLA', 'Tesla', 'Elon Musk', 'Model S', 'Cybertruck',
    # Apple
    'AAPL', 'Apple', 'Tim Cook', 'iPhone', 'Mac',
    # Microsoft
    'MSFT', 'Microsoft', 'Satya Nadella', 'Windows', 'Azure',
    # Alphabet / Google
    'GOOG', 'Google', 'Sundar Pichai', 'Android', 'AdSense',
    # Amazon
    'AMZN', 'Amazon', 'Andy Jassy', 'Prime', 'AWS',
    # Stock-market vocabulary
    'Shares', 'Dividend', 'Earnings', 'IPO', 'Buyback', 'Financial Results',
    'Quarterly Report', 'SEC Filings', 'Stock Split', 'Market Cap',
    'Trading Volume',
    # Corporate events
    'Merger', 'Acquisition', 'Partnership', 'Innovation', 'Patent',
    'Lawsuit', 'Regulation',
]
keywords = '|'.join(keyword_terms)

# NOTE(review): matching is substring-based and case-insensitive, so short
# terms also hit inside longer words (e.g. "Mac" in "machine", "AWS" in
# "laws"), and the filter is not tied to each row's own `Stock` value.
# Confirm whether that is intended.
mentions_keyword = df['text'].str.contains(keywords, case=False, na=False)

# .copy() gives an independent frame, so columns added in later cells are not
# written to a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[mentions_keyword].copy()

df.head(), len(df)


(                                                 url  \
 1  https://www.cnbc.com/2015/02/24/apples-record-...   
 2  https://www.cnbc.com/2015/02/24/apples-record-...   
 3  https://www.cnbc.com/2015/02/24/apples-record-...   
 4  https://www.cnbc.com/2015/02/24/apples-record-...   
 5  https://www.cnbc.com/2015/02/24/apples-record-...   
 
                                 softTitle  \
 1  Apple’s record rally depends on Obama?   
 2  Apple’s record rally depends on Obama?   
 3  Apple’s record rally depends on Obama?   
 4  Apple’s record rally depends on Obama?   
 5  Apple’s record rally depends on Obama?   
 
                                     title        date       author  \
 1  Apple’s record rally depends on Obama?  2015-02-24  John Melloy   
 2  Apple’s record rally depends on Obama?  2015-02-24  John Melloy   
 3  Apple’s record rally depends on Obama?  2015-02-24  John Melloy   
 4  Apple’s record rally depends on Obama?  2015-02-24  John Melloy   
 5  Apple’s record rall

In [13]:
# Preview the first three rows that survived the keyword filter.
df.head(3)

Unnamed: 0,url,softTitle,title,date,author,description,keywords,text,Article Length,Stock,Open,Low,Close,Adjusted_Close,Volume,Month,Price_Change_Pct,Adjusted_Close_Change
1,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,AMZN,19.024,18.808001,18.929501,18.929501,38416000.0,2.0,-0.407741,-0.004077
2,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,AAPL,33.400002,32.7925,33.0425,29.694435,276912400.0,2.0,-0.624062,-0.006241
3,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,TSLA,13.819333,13.446667,13.607333,13.607333,99054000.0,2.0,-1.557832,-0.015578


#### 7-day moving average

In [6]:
# 7-day moving average of the closing price, computed separately per ticker.
#
# `df` holds one row per (article, ticker) pair, so a plain
# `df['Close'].rolling(7)` would average seven consecutive *article rows*:
# it would mix the prices of different tickers and count the same day's close
# once per article. Instead, reduce to one close per (Stock, date), roll over
# those in chronological order within each ticker, and map the result back
# onto every article row.
#
# NOTE(review): assumes `Close` is identical for all rows sharing the same
# (Stock, date) - confirm against the merge that produced the CSV. The window
# covers the 7 most recent dates *present in df* for that ticker, which may
# skip trading days that have no article.
daily_close = (
    df[['Stock', 'date', 'Close']]
    .drop_duplicates(subset=['Stock', 'date'])
    .assign(_day=lambda d: pd.to_datetime(d['date']))  # sort by real dates, not strings
    .sort_values(['Stock', '_day'])
)
daily_close['7_day_avg'] = (
    daily_close.groupby('Stock')['Close']
    .transform(lambda close: close.rolling(window=7).mean())
)

# Look the per-(Stock, date) average up for each article row; .to_numpy()
# assigns positionally so df keeps its existing index.
avg_by_stock_day = daily_close.set_index(['Stock', 'date'])['7_day_avg']
row_keys = pd.MultiIndex.from_frame(df[['Stock', 'date']])
df['7_day_avg'] = avg_by_stock_day.reindex(row_keys).to_numpy()

In [14]:
# Preview the first five rows (the default for DataFrame.head).
df.head()

Unnamed: 0,url,softTitle,title,date,author,description,keywords,text,Article Length,Stock,Open,Low,Close,Adjusted_Close,Volume,Month,Price_Change_Pct,Adjusted_Close_Change
1,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,AMZN,19.024,18.808001,18.929501,18.929501,38416000.0,2.0,-0.407741,-0.004077
2,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,AAPL,33.400002,32.7925,33.0425,29.694435,276912400.0,2.0,-0.624062,-0.006241
3,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,TSLA,13.819333,13.446667,13.607333,13.607333,99054000.0,2.0,-1.557832,-0.015578
4,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,GOOG,26.766014,26.340183,26.73111,26.73111,20101036.0,2.0,0.785846,0.007858
5,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,"Stock markets,business news",In order for Apple to extend its 21 percent ra...,499.0,MSFT,44.299999,43.919998,44.09,38.414326,25271700.0,2.0,-0.135903,-0.001359


In [8]:
from textblob import TextBlob


def _text_polarity(article_text):
    """Return TextBlob's sentiment polarity for ``article_text`` (coerced with ``str``)."""
    return TextBlob(str(article_text)).sentiment.polarity


# One polarity score per row, derived from the article body in `text`.
df['sentiment_polarity'] = df['text'].apply(_text_polarity)
