# Sentiment Analysis

In [1]:
# Import dependencies
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import json

In [2]:
# Input files (relative paths, resolved against the notebook's working directory)
nyse_price_csv = Path("Resources/nyse_prices-split-adjusted.csv")
nyse_sec_csv = Path("Resources/nyse_securities.csv")
spx_csv = Path("Resources/SPX.csv")

# Load each file into its own DataFrame, in the same order as the paths above
nyse_price_df, nyse_sec_df, spx_df = (
    pd.read_csv(csv_path)
    for csv_path in (nyse_price_csv, nyse_sec_csv, spx_csv)
)


In [3]:
# Preview the first rows of the NYSE price data (head() returns 5 rows by default)
nyse_price_df.head()

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05,WLTW,123.43,125.839996,122.309998,126.25,2163600.0
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0
3,2016-01-08,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0


In [4]:
# Preview the first rows of the NYSE securities data (head() returns 5 rows by default)
nyse_sec_df.head()

Unnamed: 0,Ticker symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,66740
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152
3,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373
4,ATVI,Activision Blizzard,reports,Information Technology,Home Entertainment Software,"Santa Monica, California",2015-08-31,718877


In [5]:
# Preview the first rows of the S&P 500 (SPX) data loaded from SPX.csv
spx_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1927-12-30,17.66,17.66,17.66,17.66,17.66,0
1,1928-01-03,17.76,17.76,17.76,17.76,17.76,0
2,1928-01-04,17.719999,17.719999,17.719999,17.719999,17.719999,0
3,1928-01-05,17.549999,17.549999,17.549999,17.549999,17.549999,0
4,1928-01-06,17.66,17.66,17.66,17.66,17.66,0


# Data Cleaning

In [17]:
# Count missing values per column in the NYSE price data. The counts are
# printed explicitly: a bare expression that is not the last line of a cell
# is evaluated and then discarded, so they were never displayed before.
print(nyse_price_df.isna().sum())

# Rename "symbol" to "Ticker symbol" so this frame shares a join key with the
# securities data. rename() ignores a key that is not present, so re-running
# this cell is a harmless no-op; the earlier copy-then-drop version raised
# KeyError on a second run because "symbol" had already been removed.
price_renamed = nyse_price_df.rename(columns={"symbol": "Ticker symbol"})

# Keep "Ticker symbol" as the last column, matching the column order the
# copy-then-drop version produced.
ticker_last = [col for col in price_renamed.columns if col != "Ticker symbol"]
ticker_last.append("Ticker symbol")
nyse_price_df = price_renamed[ticker_last]

# "Ticker symbol" is left as a regular column (not the index) because the
# merge cell further down joins on it by column name.
nyse_price_df.head()

Unnamed: 0,date,open,close,low,high,volume,Ticker symbol
0,2016-01-05,123.43,125.839996,122.309998,126.25,2163600.0,WLTW
1,2016-01-06,125.239998,119.980003,119.940002,125.540001,2386400.0,WLTW
2,2016-01-07,116.379997,114.949997,114.93,119.739998,2489500.0,WLTW
3,2016-01-08,115.480003,116.620003,113.5,117.440002,2006300.0,WLTW
4,2016-01-11,117.010002,114.970001,114.089996,117.330002,1408600.0,WLTW


In [18]:
# Count missing values per column in the NYSE securities data. The counts are
# printed explicitly: a bare expression that is not the last line of a cell
# is evaluated and then discarded, so they were never displayed before.
# (The original author recorded 198 missing values in "Date first added".)
print(nyse_sec_df.isna().sum())

# Drop "Date first added" and "Address of Headquarters" because they are not
# needed for the analysis. The result goes into a new variable, so the raw
# frame stays untouched and this cell can be re-run safely.
nyse_sec_clean = nyse_sec_df.drop(
    columns=["Date first added", "Address of Headquarters"]
)

# "Ticker symbol" is left as a regular column (not the index) because the
# merge cell further down joins on it by column name.
nyse_sec_clean.head()


Unnamed: 0,Ticker symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,CIK
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,66740
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,1800
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,1551152
3,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,1467373
4,ATVI,Activision Blizzard,reports,Information Technology,Home Entertainment Software,718877


In [19]:
# Count missing values per column in the S&P 500 data. The counts are printed
# explicitly: a bare expression that is not the last line of a cell is
# evaluated and then discarded, so the check was never actually shown.
# (The original author noted that this dataset has no null values.)
print(spx_df.isna().sum())

spx_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1927-12-30,17.66,17.66,17.66,17.66,17.66,0
1,1928-01-03,17.76,17.76,17.76,17.76,17.76,0
2,1928-01-04,17.719999,17.719999,17.719999,17.719999,17.719999,0
3,1928-01-05,17.549999,17.549999,17.549999,17.549999,17.549999,0
4,1928-01-06,17.66,17.66,17.66,17.66,17.66,0


In [25]:
# Join the price rows to their security metadata on the shared
# "Ticker symbol" column. An inner join keeps only tickers present in both
# frames. The ticker then becomes the index of the combined frame.
combined_df = pd.merge(
    nyse_price_df,
    nyse_sec_clean,
    how="inner",
    on="Ticker symbol",
).set_index("Ticker symbol")

combined_df.head()

Unnamed: 0_level_0,date,open,close,low,high,volume,Security,SEC filings,GICS Sector,GICS Sub Industry,CIK
Ticker symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
WLTW,2016-01-05,123.43,125.839996,122.309998,126.25,2163600.0,Willis Towers Watson,reports,Financials,Insurance Brokers,1140536
WLTW,2016-01-06,125.239998,119.980003,119.940002,125.540001,2386400.0,Willis Towers Watson,reports,Financials,Insurance Brokers,1140536
WLTW,2016-01-07,116.379997,114.949997,114.93,119.739998,2489500.0,Willis Towers Watson,reports,Financials,Insurance Brokers,1140536
WLTW,2016-01-08,115.480003,116.620003,113.5,117.440002,2006300.0,Willis Towers Watson,reports,Financials,Insurance Brokers,1140536
WLTW,2016-01-11,117.010002,114.970001,114.089996,117.330002,1408600.0,Willis Towers Watson,reports,Financials,Insurance Brokers,1140536
