In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 100)

In [2]:
# Read the market training set from its CSV file in the working directory.
MARKET_TRAIN_CSV = "market_train_df.csv"
market_train = pd.read_csv(MARKET_TRAIN_CSV)

In [3]:
# Read the first news training file from its CSV file in the working directory.
NEWS_TRAIN_CSV = "news_train_df_1.csv"
news_train = pd.read_csv(NEWS_TRAIN_CSV)

In [None]:
# Examine Apple only
#df_market_apple = df_market.loc[df_market['assetName'] == "Apple Inc"]
#df_news_apple = df_news.loc[df_news['assetName'] == "Apple Inc"]
#df_news_apple['subjects']
#df_news_apple['audiences']

In [4]:
## Convert non-trading date to the next trading date
# Reduce the 'time' column of both frames to plain calendar dates
# (datetime.date objects) so news and market rows can be matched by day.
# The vectorised .dt.date accessor replaces a per-row Python lambda.
news_train['time'] = pd.to_datetime(news_train['time']).dt.date
market_train['time'] = pd.to_datetime(market_train['time']).dt.date

In [5]:
# Collect the distinct dates present in the news data and in the market data,
# each as a one-column frame keyed on 'time'.
# The column label is passed as a list: a set is unordered and is rejected
# by the DataFrame constructor in recent pandas releases.
time_news = pd.DataFrame(news_train['time'].unique(), columns=['time'])
time_market = pd.DataFrame(market_train['time'].unique(), columns=['time'])

In [6]:
# Duplicate the market date under 'time_m' so it is still available as its
# own column after the frames are merged on 'time'. Only the market side
# needs a copy; the news side keeps its date in the 'time' key itself.
time_market = time_market.assign(time_m=time_market['time'])

In [7]:
# Right-join the market dates onto the news dates so that every news date is
# kept; news dates with no match in the market data get a missing time_m.
df = time_market.merge(time_news, how='right', on='time')
# Sanity check: the join must yield exactly one row per news date.
assert len(time_news) == len(df)

In [8]:
# Order the rows chronologically (ascending by 'time').
df = df.sort_values('time', ascending=True)

In [9]:
# The market data starts on 2007-02-01, which is a trading date, so every
# row dated earlier than that is dropped.
cutoff = pd.Timestamp("2007-02-01").date()
df = df.loc[~df['time'].lt(cutoff)]

In [10]:
# Preview the sorted, trimmed date table; time_m is missing for dates that
# have no match in the market data.
df.head()

Unnamed: 0,time,time_m
0,2007-02-01,2007-02-01
1,2007-02-02,2007-02-02
353,2007-02-03,
354,2007-02-04,
2,2007-02-05,2007-02-05


In [None]:
#df= df.set_index(df['date'])

In [11]:
# Backward-fill the NaNs in the time_m column so that each non-trading date
# is mapped to the next trading date. This relies on df already being sorted
# by 'time' in ascending order.
# DataFrame.bfill() is used instead of the deprecated fillna(method='bfill').
df_adjusted = df.bfill()

In [12]:
# Check the back-fill: rows whose time_m was missing should now carry the
# time_m of the next row that had one.
df_adjusted.head()

Unnamed: 0,time,time_m
0,2007-02-01,2007-02-01
1,2007-02-02,2007-02-02
353,2007-02-03,2007-02-05
354,2007-02-04,2007-02-05
2,2007-02-05,2007-02-05


In [13]:
# Attach the adjusted trading date (time_m) to every news row by joining
# df_adjusted onto the news data on 'time'.
# For the first few rows of news, time_m is NaN; this is fine because there
# is no corresponding date in the market data.
# TODO: double-check this after merging with the market data.
news_train_adjusted = news_train.merge(df_adjusted, how='left', on='time')
# The left join must not add or drop any news rows.
assert len(news_train) == len(news_train_adjusted)

In [18]:
# Inspect the merged news rows; the joined time_m appears as the last column.
news_train_adjusted.head()

Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,bodySize,companyCount,headlineTag,marketCommentary,sentenceCount,wordCount,assetCodes,assetName,firstMentionSentence,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,sentimentWordCount,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D,time_m
0,2007-01-01,2007-01-01 04:29:32+00:00,2007-01-01 04:29:32+00:00,e58c6279551b85cf,China's Daqing pumps 43.41 mln tonnes of oil i...,3,1,RTRS,"{'ENR', 'ASIA', 'CN', 'NGS', 'EMRG', 'RTRS', '...","{'Z', 'O', 'OIL'}",1438,1,,False,11,275,"{'0857.HK', '0857.F', '0857.DE', 'PTR.N'}",PetroChina Co Ltd,6,0.235702,-1,0.500739,0.419327,0.079934,73,0,0,0,0,0,0,0,3,6,7,
1,2007-01-01,2007-01-01 07:03:34+00:00,2007-01-01 07:03:34+00:00,5a31c4327427f63f,"FEATURE-In kidnapping, finesse works best",3,1,RTRS,"{'FEA', 'CA', 'LATAM', 'MX', 'INS', 'ASIA', 'I...","{'PGE', 'PCO', 'G', 'ESN', 'MD', 'PCU', 'DNP',...",4413,1,FEATURE,False,55,907,{'STA.N'},Travelers Companies Inc,8,0.447214,-1,0.600082,0.345853,0.054064,62,1,1,1,1,1,1,1,3,3,3,
2,2007-01-01,2007-01-01 11:29:56+00:00,2007-01-01 11:29:56+00:00,1cefd27a40fabdfe,PRESS DIGEST - Wall Street Journal - Jan 1,3,1,RTRS,"{'RET', 'ENR', 'ID', 'BG', 'US', 'PRESS', 'IQ'...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",2108,2,PRESS DIGEST,False,15,388,"{'WMT.DE', 'WMT.N'}",Wal-Mart Stores Inc,14,0.377964,-1,0.450049,0.295671,0.25428,67,0,0,0,0,0,0,0,5,11,17,
3,2007-01-01,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,PRESS DIGEST - New York Times - Jan 1,3,1,RTRS,"{'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",1776,6,PRESS DIGEST,False,14,325,"{'GOOG.O', 'GOOG.OQ', 'GOOGa.DE'}",Google Inc,13,0.149071,-1,0.752917,0.162715,0.084368,83,0,0,0,0,0,0,0,5,13,15,
4,2007-01-01,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,PRESS DIGEST - New York Times - Jan 1,3,1,RTRS,"{'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",1776,6,PRESS DIGEST,False,14,325,{'XMSR.O'},XM Satellite Radio Holdings Inc,11,0.149071,-1,0.699274,0.20936,0.091367,102,0,0,0,0,0,0,0,0,0,0,


In [17]:
# Use time_m as merging key for further manipulation