In [3]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import time

In [113]:
def extract_by_value(data, column, value):
    """Select the rows of ``data`` whose ``column`` entry equals ``value``.

    data: the input DataFrame
    column: name of the column to filter on
    value: the value a row must hold in ``column`` to be kept

    Raises ValueError when ``column`` is not one of the DataFrame's columns.
    """
    if column in data.columns:
        row_mask = data[column] == value
        return data[row_mask]
    raise ValueError('The column name is incorrect!')

In [114]:
def extract_asset_name_and_codes(data):
    """Build a DataFrame mapping every assetName to the set of codes seen with it.

    The code column is "assetCodes" when the input has such a column and
    "assetCode" otherwise. The result has one row per assetName, with the
    code column holding a Python set of the values found for that name.
    """
    code_column = "assetCodes" if "assetCodes" in data.columns.tolist() else "assetCode"
    name_and_code = data[['assetName', code_column]]
    # Collect the distinct codes of each asset name into a set
    codes_by_name = name_and_code.groupby('assetName')[code_column]
    return codes_by_name.apply(set).reset_index()

In [125]:
def generate_split_dates(start_year, end_year):
    """ Build one (first day, last day) pair of date strings per calendar year.

    Both bounds are inclusive, e.g. (2016, 2017) gives
    [('2016-01-01','2016-12-31'), ('2017-01-01','2017-12-31')].
    An empty list is returned when start_year is greater than end_year.
    """
    years = range(start_year, end_year + 1)
    return [('{}-01-01'.format(year), '{}-12-31'.format(year)) for year in years]

In [116]:
def split_data_by_date(data, split_dates):
    """ Cut the data into one piece per (start, end) date pair.

    data: the input DataFrame; its 'time' column is compared against date objects
    split_dates: a list of start & end dates (e.g. [('2016-01-01','2016-12-31')])

    Returns a dict keyed by each start date string whose value holds the rows
    with start <= time <= end (both bounds inclusive).
    """
    pieces = {}
    for first_day, last_day in split_dates:
        lower_bound = pd.to_datetime(first_day).date()
        upper_bound = pd.to_datetime(last_day).date()
        in_range = (data['time'] >= lower_bound) & (data['time'] <= upper_bound)
        pieces[first_day] = data[in_range]
    return pieces

In [152]:
def _shift_news_to_trading_days(news, market):
    """Replace each news row's 'time' with the first market date on or after it."""
    # Unique dates on each side; keep a copy of the market date so it survives the merge
    market_days = pd.DataFrame({'time': market['time'].unique()})
    market_days['time_market'] = market_days['time']
    news_days = pd.DataFrame({'time': news['time'].unique()})
    # One row per news date, sorted by date; a news date without a market date
    # gets the next market date through the backward fill
    calendar = (pd.merge(left=market_days, right=news_days, how='right', on=['time'])
                .sort_values('time')
                .bfill())
    shifted = pd.merge(left=news, right=calendar, how='left', on=['time'])
    # 'time_market' becomes the new 'time' column
    return shifted.drop(columns=['time']).rename(columns={'time_market': 'time'})


def _explode_asset_codes(news):
    """Return news with one row per code listed in 'assetCodes', stored in 'assetCode'."""
    # Tokenise on anything that is not whitespace, a comma, a brace or a quote, so
    # "{A.N,B.O}" and "{'A.N', 'B.O'}" both give clean codes.
    # NOTE(review): assumes asset codes never contain those characters -- confirm.
    codes = news['assetCodes'].str.findall(r"[^\s,{}'\"]+")
    return (news.drop(columns=['assetCodes'])
                .assign(assetCode=codes)
                .explode('assetCode')
                .reset_index(drop=True))


def merge_data(market_train, news_train, start_year=2007, end_year=2016):
    """ Return the combined data by merging market_train and news_train on "time" and "assetCode"

    market_train: market DataFrame with at least the "time" and "assetCode" columns
    news_train: news DataFrame with at least the "time" and "assetCodes" columns
    start_year, end_year: first and last calendar year (inclusive) to process

    The data is processed one calendar year at a time. For each year the news
    dates are moved to the next market date of that year, "assetCodes" is
    expanded to one row per code, the numeric news columns are averaged per
    ("time", "assetCode") and the result is left-joined onto the market rows.
    The yearly results are concatenated into one DataFrame.

    Neither input DataFrame is modified, so the call can be repeated on the
    same inputs.

    NOTE: news dated after the last market date of its year slice has no later
    market date inside that slice; its adjusted time stays missing and those
    rows are left out by the groupby.

    Raises ValueError (from pd.concat) when start_year is greater than end_year.
    """
    ## Pre-process of market_train
    # Work on a copy so the caller's DataFrame is left untouched
    market = market_train.copy()
    # Convert "time" to datetime format (Note: Currently, we only keep the time to date)
    market['time'] = pd.to_datetime(market['time']).dt.date
    print('Convert time to datetime format is done for market data!')

    ## Pre-process of news_train
    # Convert "time" to datetime format (Note: Currently, we only keep the time to date)
    news_dates = pd.to_datetime(news_train['time']).dt.date
    print('Convert time to datetime format is done for news data!')
    # Get rid of some columns in news data (the list of dropped columns can be modified)
    drop_list = ['sourceTimestamp','firstCreated','sourceId','headline',
                 'takeSequence','provider','firstMentionSentence',
                 'sentenceCount','bodySize','headlineTag','marketCommentary',
                 'subjects','audiences','sentimentClass','urgency',
                 'wordCount','sentimentWordCount']
    # drop() returns a new frame; errors='ignore' tolerates columns that are already absent
    news = news_train.drop(columns=drop_list, errors='ignore')
    news['time'] = news_dates
    print('Drop columns is done for news data!')

    # Split market and news data by year
    split_dates = generate_split_dates(start_year, end_year)
    market_split = split_data_by_date(market, split_dates)
    del market
    print('Split market data is done!')
    news_split = split_data_by_date(news, split_dates)
    del news
    print('Split news data is done!')

    # Both dicts are keyed by the same start dates, so they can be walked together
    merged_data = []
    for start_date, end_date in split_dates:
        print('Merge data from %s to %s ...' % (start_date, end_date))
        market_slice = market_split[start_date]
        ## Adjust the time of news acquired on non-trading day to the next trading date
        news_adjusted = _shift_news_to_trading_days(news_split[start_date], market_slice)
        print('Adjust date is done for news data!')

        # One row per asset code, then the mean of the numeric columns per (time, assetCode);
        # numeric_only leaves text columns out of the mean instead of failing on them
        news_grouped = (_explode_asset_codes(news_adjusted)
                        .groupby(['time', 'assetCode'], sort=False)
                        .mean(numeric_only=True)
                        .reset_index())
        del news_adjusted
        print('Split assetCodes is done for news data!')
        # Merge two DataFrames
        merged_data.append(pd.merge(left=market_slice, right=news_grouped,
                                    how='left', on=['time', 'assetCode']))
    # Concatenate and return all DataFrames
    return pd.concat(merged_data, ignore_index=True)

In [155]:
# Load the market and news training data from CSV files in the working directory
# NOTE(review): the news file name ends in "_1", presumably one part of a larger set -- confirm
market_train = pd.read_csv("./market_train_df.csv")
news_train = pd.read_csv("./news_train_df_1.csv")

In [None]:
# Display, for every assetName in market_train, the set of asset codes that occur with it
extract_asset_name_and_codes(market_train)

Unnamed: 0,assetName,assetCode
0,21Vianet Group Inc,{VNET.O}
1,2U Inc,{TWOU.O}
2,3Com Corp,{COMS.O}
3,3D Systems Corp,"{TDSC.O, DDD.N}"
4,3M Co,{MMM.N}
5,500.Com Ltd,{WBAI.N}
6,58.com Inc,{WUBA.N}
7,7 Days Group Holdings Ltd,{SVN.N}
8,8x8 Inc,{EGHT.O}
9,99 Cents Only Stores,{NDN.N}


In [None]:
# Get merged DataFrame and report how long the merge took
# perf_counter() is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments made while the merge runs
start_time = time.perf_counter()
merged_data = merge_data(market_train, news_train)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

Convert time to datetime format is done for market data!


In [None]:
# Preview the first rows of the merged market/news DataFrame
merged_data.head()

In [None]:
# Extract every row of the merged table whose assetCode is 'AAPL.O'
# (presumably the code of 'Apple Inc' -- confirm against the assetName column)
extract_by_value(merged_data, 'assetCode', 'AAPL.O')

In [None]:
# Show the merged rows whose 'sentimentPositive' value is negative
# NOTE(review): presumably a sanity check that should return no rows -- confirm
merged_data[merged_data['sentimentPositive']<0]