In [117]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [186]:
def extract_by_value(data, column, value):
    """Select the rows of ``data`` whose entry in ``column`` equals ``value``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter.
    column : label
        Name of the column to compare against ``value``.
    value : object
        Value a row must hold in ``column`` to be kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.

    Raises
    ------
    ValueError
        If ``column`` is not one of the columns of ``data``.
    """
    if column in data.columns:
        # Boolean mask: True for every row whose entry matches the requested value
        matches = data[column] == value
        return data[matches]
    raise ValueError('The column name is incorrect!')

In [216]:
def extract_asset_name_and_codes(data):
    """Build a lookup table from each assetName to the set of codes seen with it.

    The code column is "assetCodes" when the input has such a column,
    otherwise "assetCode".

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing an "assetName" column and a code column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct assetName; the code column holds a ``set`` of
        every value of that column found for the name.
    """
    code_column = "assetCodes" if "assetCodes" in list(data.columns) else "assetCode"
    name_code_pairs = data[['assetName', code_column]]
    # Collapse every group of rows sharing a name into the set of its codes
    codes_per_name = name_code_pairs.groupby('assetName')[code_column].apply(set)
    return codes_per_name.reset_index()

In [188]:
def prepare_data(market_train, news_train):
    """Merge market data with news features averaged per day and per asset.

    Both inputs are left unmodified: every transformation is applied to a
    derived frame, so the function can be re-run on the same inputs.

    Parameters
    ----------
    market_train : pandas.DataFrame
        Market records; must contain "time" and "assetName" columns.
    news_train : pandas.DataFrame
        News records; must contain "time" and "assetName" columns.

    Returns
    -------
    pandas.DataFrame
        A left join on ("time", "assetName"): one row per row of
        ``market_train`` with "time" reduced to a ``datetime.date``, plus the
        mean of every numeric news column for the same day and asset.
        Market rows without matching news get NaN in the news columns.
    """
    ## Pre-process of market_train
    # Convert "time" to a calendar date (Note: currently, only the date part is kept)
    market_daily = market_train.assign(time=pd.to_datetime(market_train['time']).dt.date)

    ## Pre-process of news_train
    # Get rid of some columns in news data (the list of dropped columns can be modified).
    # The result of drop() must be kept: drop() returns a new frame and leaves its input as is.
    # errors='ignore' tolerates inputs in which some of these columns are already absent.
    drop_list = ['sourceTimestamp', 'firstCreated', 'sourceId', 'headline']
    news_daily = (
        news_train
        .drop(columns=drop_list, errors='ignore')
        .assign(time=pd.to_datetime(news_train['time']).dt.date)
    )
    # Group news by "time" and "assetName" and average each group.
    # numeric_only=True skips the remaining text columns, which cannot be averaged.
    news_train_grouped = (
        news_daily
        .groupby(['time', 'assetName'], sort=False)
        .mean(numeric_only=True)
        .reset_index()
    )

    # Merge two DataFrames; the grouped news has unique keys, so the row count of
    # the market data is preserved by the left join
    return pd.merge(left=market_daily, right=news_train_grouped, how='left', on=['time', 'assetName'])

In [189]:
# Load the market and news training data from CSV files in the working directory.
# NOTE(review): the merged output further down shows an "Unnamed: 0" column, which
# suggests at least one of these CSVs carries a saved index column - consider
# passing index_col=0 once the file layout has been confirmed.
market_train = pd.read_csv("./market_train_df.csv")
news_train = pd.read_csv("./news_train_df_1.csv")

In [None]:
# Preview the assetName and assetCode correspondences in market_train.
# Only the first rows are displayed so the notebook does not render the whole table.
extract_asset_name_and_codes(market_train).head(10)

In [190]:
# Combine the market table with the per-day, per-asset news aggregates
merged_data = prepare_data(market_train=market_train, news_train=news_train)

In [217]:
# Extract the records for 'Apple Inc' from the merged table.
# Only the first rows are displayed so the notebook does not render the whole selection.
extract_by_value(merged_data, 'assetName', 'Apple Inc').head(10)

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,...,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D
3,2007-02-01,AAPL.O,Apple Inc,23747329.0,84.7400,86.230,-0.011548,0.016324,,,...,,,,,,,,,,
1421,2007-02-02,AAPL.O,Apple Inc,22212416.0,84.7500,84.120,0.000118,-0.024469,-0.001091,-0.026983,...,1.687500,1.687500,1.687500,1.687500,1.687500,9.187500,10.437500,29.000000,44.375000,45.437500
2844,2007-02-05,AAPL.O,Apple Inc,20737742.0,83.9400,84.300,-0.009558,0.002140,-0.008843,0.001477,...,1.333333,1.333333,1.333333,1.444444,1.444444,4.777778,4.777778,8.666667,26.777778,51.222222
4272,2007-02-06,AAPL.O,Apple Inc,30893975.0,84.1500,84.450,0.002502,0.001779,0.002006,0.002135,...,0.363636,0.363636,0.363636,0.363636,0.454545,5.909091,9.363636,18.181818,37.818182,51.636364
5702,2007-02-07,AAPL.O,Apple Inc,38180972.0,86.1500,84.495,0.023767,0.000533,0.022779,0.000392,...,0.538462,0.923077,1.076923,1.076923,1.076923,11.846154,20.076923,32.846154,44.692308,55.769231
7134,2007-02-08,AAPL.O,Apple Inc,24264750.0,86.1800,85.430,0.000348,0.011066,0.001591,0.010226,...,1.000000,1.000000,1.000000,1.000000,1.000000,5.750000,10.500000,40.875000,45.750000,65.750000
8562,2007-02-09,AAPL.O,Apple Inc,30756328.0,83.2700,85.880,-0.033767,0.005267,-0.024615,0.006735,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.000000,12.500000,41.500000,52.500000,71.500000
9994,2007-02-12,AAPL.O,Apple Inc,25873256.0,84.8800,84.440,0.019335,-0.016768,0.023276,-0.009432,...,1.272727,1.272727,1.272727,1.272727,1.363636,16.454545,25.000000,25.181818,43.424242,73.212121
11428,2007-02-13,AAPL.O,Apple Inc,21250545.0,84.6300,85.160,-0.002945,0.008527,-0.012278,0.012093,...,0.461538,0.461538,0.461538,0.461538,0.461538,8.846154,31.538462,53.307692,60.153846,94.153846
12860,2007-02-14,AAPL.O,Apple Inc,18150679.0,85.3000,84.630,0.007917,-0.006224,-0.001869,-0.012470,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.500000,61.500000,61.500000,76.500000
