## Feature Engineering

#### import data

In [135]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [136]:
df = pd.read_csv('../merged_df/merged_df.csv', sep=",", index_col=0)

Only keep lines that mention the stock 

In [137]:
df['Stock'].value_counts()


Stock
MSFT    4233
GOOG    4233
TSLA    4233
AAPL    4233
AMZN    4233
Name: count, dtype: int64

Only keep rows that mention microsoft, since it mentioned the most

In [138]:
df = df[df['Stock'] == 'MSFT']

In [139]:
df.head()

Unnamed: 0,url,title,date,author,description,text,Article Length,sentiment,sentiment_label,Stock,Open,Low,Close,Adjusted_Close,Volume,Month,Price_Change_Pct,Adjusted_Close_Change
0,https://www.cnbc.com/2015/02/24/apples-record-...,Apple’s record rally depends on Obama?,2015-02-24,John Melloy,The size of Apple's capital return program wil...,In order for Apple to extend its 21 percent ra...,499.0,1.0,very negative,MSFT,44.299999,43.919998,44.09,38.414326,25271700.0,2.0,-0.135903,-0.001359
5,https://www.cnbc.com/2015/02/25/dockers-rise-f...,Docker's rise from sleeper to open source king,2015-02-25,Ari Levy,Docker spent 2014 partnering with the world's ...,"In December 2013, Google announced that its c...",777.0,1.0,very negative,MSFT,44.09,43.799999,43.990002,38.327202,29759800.0,2.0,-0.226805,-0.002268
13,https://www.cnbc.com/2015/03/02/apple-value-to...,Apple value to hit $1 trillion in 1 year,2015-03-02,John Melloy,Apple will rise to $1 trillion in market value...,Apple will rise to $1 trillion in market value...,592.0,2.0,negative,MSFT,44.189999,43.549999,43.880001,38.231373,31924000.0,3.0,0.068421,0.000684
17,https://www.cnbc.com/2015/03/05/is-apple-world...,"Is Apple, world’s largest stock, ‘underowned’?",2015-03-06,John Melloy,A top Wall Street strategist makes the case th...,"Jason Trennert, a top Wall Street strategist, ...",1112.0,4.0,positive,MSFT,43.110001,42.150002,42.360001,36.907036,36248800.0,3.0,-1.739736,-0.017397
24,https://www.cnbc.com/2015/03/09/apple-may-sell...,Apple may sell 1 billion 'life-saving' watches,2015-03-09,John Melloy,CNBC Pro Preview: Influential technology analy...,The following is a free preview of the content...,506.0,1.0,very negative,MSFT,43.130001,42.189999,42.849998,37.333958,32108000.0,3.0,1.156747,0.011567


In [140]:


# Mapping stock names to CEO and company-related keywords
stock_ceo_company_mapping = {

  #"AAPL": ["Tim Cook", "Apple", "iphone", "mac", "AAPL", "iPad", "iMac", "MacBook", "Apple Watch", "Apple TV", "AirPods", "iCloud", "Apple Music", "Apple Pay", "Apple Arcade", "App Store", "Steve Jobs", "Jony Ive"],
  #"AMZN": ["Jeff Bezos", "Amazon", "Jassy", "Prime", "aws", "AMZN", "Amazon Prime", "Kindle", "Echo", "Alexa", "Fire Tablet", "AmazonBasics", "Whole Foods", "Amazon Music", "Audible", "Andy Jassy"],
  #"GOOG": ["Sundar Pichai", "Google", "Alphabet", "Pichai", "Android", "AdSense", "GOOG", "Google Search", "Google Chrome", "Google Pixel", "Google Home", "Google Nest", "YouTube", "Google Drive", "Google Maps", "Google Play", "Google Ads", "Sergey Brin", "Larry Page"],
  "MSFT": ["Satya Nadella", "Nadella", "Azure", "Windows", "Microsoft", "MSFT", "Xbox", "Office", "Surface", "Microsoft Teams", "Microsoft Edge", "OneDrive", "Skype", "LinkedIn", "Outlook", "Bill Gates", "Steve Ballmer"],
  #"TSLA": ["Elon", "Musk", "Tesla", "Model S", "CyberTruck", "TSLA", "Model 3", "Model X", "Model Y", "Roadster", "Powerwall", "Supercharger Network", "Tesla Energy", "JB Straubel"]
}


# Function to check if the text contains mentions of the stock's related keywords
def mentions_stock(text, stock_name):
    """
    Returns True if the text mentions any keywords associated with the stock.
    """
    keywords = stock_ceo_company_mapping.get(stock_name, [])
    return any(keyword.lower() in text.lower() for keyword in keywords)

# Filter rows where the "text" column mentions the stock on the same line
df = df[df.apply(lambda row: mentions_stock(row["text"], row["Stock"]), axis=1)]

# Display the shape of the filtered dataframe and the first few rows
df.shape, df.head()


((1409, 18),
                                                   url  \
 5   https://www.cnbc.com/2015/02/25/dockers-rise-f...   
 13  https://www.cnbc.com/2015/03/02/apple-value-to...   
 17  https://www.cnbc.com/2015/03/05/is-apple-world...   
 24  https://www.cnbc.com/2015/03/09/apple-may-sell...   
 38  https://www.cnbc.com/2015/04/07/the-first-appl...   
 
                                                 title        date  \
 5      Docker's rise from sleeper to open source king  2015-02-25   
 13           Apple value to hit $1 trillion in 1 year  2015-03-02   
 17     Is Apple, world’s largest stock, ‘underowned’?  2015-03-06   
 24     Apple may sell 1 billion 'life-saving' watches  2015-03-09   
 38  Why the first Apple Watch is a yawner for deve...  2015-04-07   
 
          author                                        description  \
 5      Ari Levy  Docker spent 2014 partnering with the world's ...   
 13  John Melloy  Apple will rise to $1 trillion in market value...   
 1

#### 7 day moving average

In [141]:
#  Moving Averages for the Closing Prices of each stock
df['30_day_MA'] = df.groupby('Stock')['Close'].transform(lambda x: x.rolling(window=30).mean())
df['60_day_MA'] = df.groupby('Stock')['Close'].transform(lambda x: x.rolling(window=60).mean())
df['90_day_MA'] = df.groupby('Stock')['Close'].transform(lambda x: x.rolling(window=90).mean())

In [142]:
# Moving Averages
df['SMA_30'] = df.groupby('Stock')['Close'].transform(lambda x: x.rolling(window=30).mean())
df['SMA_60'] = df.groupby('Stock')['Close'].transform(lambda x: x.rolling(window=60).mean())
df['SMA_90'] = df.groupby('Stock')['Close'].transform(lambda x: x.rolling(window=90).mean())

df['EMA_30'] = df.groupby('Stock')['Close'].transform(lambda x: x.ewm(span=30, adjust=False).mean())
df['EMA_60'] = df.groupby('Stock')['Close'].transform(lambda x: x.ewm(span=60, adjust=False).mean())
df['EMA_90'] = df.groupby('Stock')['Close'].transform(lambda x: x.ewm(span=90, adjust=False).mean())

# RSI Calculation
def calculate_rsi(x, periods=14):
    delta = x.diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=periods).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=periods).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

df['RSI'] = df.groupby('Stock')['Close'].transform(calculate_rsi)

# MACD Calculation
df['EMA_12'] = df.groupby('Stock')['Close'].transform(lambda x: x.ewm(span=12, adjust=False).mean())
df['EMA_26'] = df.groupby('Stock')['Close'].transform(lambda x: x.ewm(span=26, adjust=False).mean())
df['MACD'] = df['EMA_12'] - df['EMA_26']
df['Signal_Line'] = df.groupby('Stock')['MACD'].transform(lambda x: x.ewm(span=9, adjust=False).mean())
df['MACD_Histogram'] = df['MACD'] - df['Signal_Line']

In [143]:
df.columns

Index(['url', 'title', 'date', 'author', 'description', 'text',
       'Article Length', 'sentiment', 'sentiment_label', 'Stock', 'Open',
       'Low', 'Close', 'Adjusted_Close', 'Volume', 'Month', 'Price_Change_Pct',
       'Adjusted_Close_Change', '30_day_MA', '60_day_MA', '90_day_MA',
       'SMA_30', 'SMA_60', 'SMA_90', 'EMA_30', 'EMA_60', 'EMA_90', 'RSI',
       'EMA_12', 'EMA_26', 'MACD', 'Signal_Line', 'MACD_Histogram'],
      dtype='object')

In [144]:
def nulls_summary_table(df):
    """
    Returns a summary table showing null value counts and percentage

    Parameters:
    df (DataFrame): Dataframe to check

    Returns:
    
    null_values (DataFrame)
    """
    null_values = pd.DataFrame(df.isnull().sum())
    null_values[1] = null_values[0]/len(df)
    null_values.columns = ['null_count','null_pct']
    return null_values

nulls_summary_table(df)

Unnamed: 0,null_count,null_pct
url,0,0.0
title,0,0.0
date,0,0.0
author,0,0.0
description,0,0.0
text,0,0.0
Article Length,0,0.0
sentiment,0,0.0
sentiment_label,0,0.0
Stock,0,0.0


In [145]:
df =df[[ 'date', 'Stock', 'Open', 'Low', 'Close', 'Volume',"30_day_MA","60_day_MA","90_day_MA", '30_day_MA', '60_day_MA', '90_day_MA',
       'SMA_30', 'SMA_60', 'SMA_90', 'EMA_30', 'EMA_60', 'EMA_90', 'RSI', 'EMA_12', 'EMA_26', 'MACD', 'Signal_Line', 'MACD_Histogram','Price_Change_Pct','Article Length', 'sentiment']]

df.rename(columns={'sentiment': 'article_sentiment'}, inplace=True)

df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [146]:
df.head()

Unnamed: 0,date,Stock,Open,Low,Close,Volume,30_day_MA,60_day_MA,90_day_MA,30_day_MA.1,...,EMA_90,RSI,EMA_12,EMA_26,MACD,Signal_Line,MACD_Histogram,Price_Change_Pct,Article Length,article_sentiment
0,2018-07-12,MSFT,104.410004,102.730003,104.190002,24335900.0,95.654,84.081833,72.937778,95.654,...,77.922815,68.726943,99.938967,95.779211,4.159756,4.282549,-0.122794,2.167091,463.0,5.0
1,2018-07-19,MSFT,105.309998,103.889999,104.400002,40171600.0,96.198,84.831333,73.609,96.198,...,78.504731,68.494425,100.62528,96.417788,4.207492,4.267538,-0.060046,-0.684933,668.0,2.0
2,2018-09-10,MSFT,109.639999,108.360001,109.379997,20727900.0,96.780667,85.65,74.336778,96.780667,...,79.183308,74.415092,101.972159,97.377952,4.594207,4.332872,0.261336,1.081229,1787.0,1.0
3,2018-09-11,MSFT,111.589996,108.889999,111.239998,24301800.0,97.428,86.454333,75.102111,97.428,...,79.887851,76.988944,103.397981,98.40477,4.993211,4.464939,0.528271,1.700494,1663.0,1.0
4,2018-09-11,MSFT,111.589996,108.889999,111.239998,24301800.0,98.044667,87.263333,75.862,98.044667,...,80.576909,86.182109,104.604445,99.355528,5.248917,4.621735,0.627182,1.700494,1666.0,4.0


In [147]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               1319 non-null   object 
 1   Stock              1319 non-null   object 
 2   Open               1319 non-null   float64
 3   Low                1319 non-null   float64
 4   Close              1319 non-null   float64
 5   Volume             1319 non-null   float64
 6   30_day_MA          1319 non-null   float64
 7   60_day_MA          1319 non-null   float64
 8   90_day_MA          1319 non-null   float64
 9   30_day_MA          1319 non-null   float64
 10  60_day_MA          1319 non-null   float64
 11  90_day_MA          1319 non-null   float64
 12  SMA_30             1319 non-null   float64
 13  SMA_60             1319 non-null   float64
 14  SMA_90             1319 non-null   float64
 15  EMA_30             1319 non-null   float64
 16  EMA_60             1319 

In [148]:
df.to_csv('data/df_fe.csv')