In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from kaggle.competitions import twosigmanews
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost
from sklearn.metrics import explained_variance_score

In [None]:
# Build the competition environment and fetch the training data, which is
# unpacked into a market frame and a news frame. Both are stored under
# *_orig names; the next cell works on copies of them.
env = twosigmanews.make_env()
(market_data_orig, news_data_orig) = env.get_training_data()

In [None]:
# Work on copies so the *_orig frames stay exactly as the environment returned them.
market_data = market_data_orig.copy()
news_data = news_data_orig.copy()
# Lookup tables between assetCode and assetName, built straight from the two
# columns. This avoids re-indexing the whole market frame and converting every
# one of its columns to a dict just to keep a single entry. When a key appears
# on several rows the value from the last such row is kept.
asset_code_name_map = dict(zip(market_data['assetCode'], market_data['assetName']))
asset_name_code_map = dict(zip(market_data['assetName'], market_data['assetCode']))

In [None]:
#Preprocessing Level 1
def preprocess_news_data(news_data):
    """Aggregate raw news rows into one row of statistics per day and key group.

    Steps, in order:
      1. derive ``firstMentionLoc`` (1 - firstMentionSentence / sentenceCount)
         and ``sentimentWordFraction`` (sentimentWordCount / wordCount);
      2. drop the novelty/volume counters, the text and timestamp columns and
         the raw counts consumed by step 1;
      3. truncate ``time`` to a ``YYYY-MM-DD`` string;
      4. group by (time, urgency, provider, marketCommentary, assetCodes,
         assetName) and take mean, min and max of every remaining column;
      5. flatten the two-level column labels by concatenating their parts and
         parse the ``assetCodes`` text into a list of strings.

    Parameters
    ----------
    news_data : pandas.DataFrame
        News frame containing every column named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Aggregated frame with the group keys as ordinary columns; each
        ``assetCodes`` cell holds a Python list.
    """
    import ast  # local import: only needed for the assetCodes parsing below

    group_keys = ['time', 'urgency', 'provider',
                  'marketCommentary', 'assetCodes', 'assetName']
    unused_columns = ['noveltyCount12H', 'noveltyCount24H',
                      'noveltyCount3D', 'noveltyCount5D',
                      'noveltyCount7D', 'volumeCounts12H',
                      'volumeCounts24H', 'volumeCounts3D',
                      'volumeCounts5D', 'volumeCounts7D',
                      'bodySize', 'audiences', 'subjects',
                      'takeSequence', 'headline', 'sourceId',
                      'firstCreated', 'sourceTimestamp',
                      'firstMentionSentence', 'sentenceCount',
                      'sentimentWordCount', 'wordCount', 'companyCount',
                      'sentimentClass', 'headlineTag']

    def _parse_asset_codes(text):
        # The text is brace-delimited; swapping braces for brackets makes it a
        # list literal so the listed order is kept. literal_eval only accepts
        # literals, so the column content can never run as code.
        return ast.literal_eval(text.replace('{', '[').replace('}', ']'))

    # Compute the derived ratios first and attach them to the frame returned by
    # drop(), so the caller's frame never gains extra columns.
    first_mention_loc = 1 - news_data['firstMentionSentence'] / news_data['sentenceCount']
    sentiment_word_fraction = news_data['sentimentWordCount'] / news_data['wordCount']
    news_data = news_data.drop(columns=unused_columns)
    news_data['firstMentionLoc'] = first_mention_loc
    news_data['sentimentWordFraction'] = sentiment_word_fraction
    # TODO: think about keeping sentimentClass / headline as categorical variables.
    news_data['time'] = pd.to_datetime(news_data['time']).dt.date.astype(str)
    # The keys stay in the index during aggregation and reset_index() turns them
    # back into columns. Requesting as_index=False as well would add a stray
    # 'index' column on pandas versions that honour it for list aggregations.
    news_data = news_data.groupby(group_keys).agg([np.mean, np.min, np.max]).reset_index()
    news_data.columns = [''.join(c) for c in news_data.columns]
    news_data['assetCodes'] = news_data['assetCodes'].astype(str).map(_parse_asset_codes)
    return news_data

def group_providers(news_data):
    """Keep the most frequent providers and relabel all others as 'other'.

    The cut-off is the row count of the third most frequent provider.
    Providers with strictly fewer rows become 'other', so ties at the cut-off
    are kept. With fewer than three providers nothing is relabelled.

    Parameters
    ----------
    news_data : pandas.DataFrame
        Frame with a ``provider`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and the same columns in the same
        order. ``provider`` holds strings; a missing provider stays missing.
    """
    provider_counts = news_data['provider'].value_counts()
    if len(provider_counts) > 2:
        # value_counts() sorts by frequency, so position 2 is the third most common.
        cutoff = provider_counts.iloc[2]
    else:
        cutoff = 0  # fewer than three providers: none of them is rare enough
    relabel = {
        provider: (str(provider) if n_rows >= cutoff else 'other')
        for provider, n_rows in provider_counts.items()
    }
    grouped = news_data.reset_index(drop=True)
    # Cast to object first so a categorical column is mapped value by value;
    # anything absent from the lookup (a missing provider) maps to NaN.
    grouped['provider'] = grouped['provider'].astype(object).map(relabel)
    return grouped

def preprocess_asset_code_news_data(news_data,asset_code_name_map,asset_name_code_map):
    """Expand aggregated news rows into one row per known ``assetCode``.

    Each row's ``assetCodes`` list is first reduced to the codes that are keys
    of ``asset_code_name_map``. What happens next depends on how many remain:

    * one code   -> one output row carrying that code;
    * two codes  -> two output rows, one per code;
    * no code    -> the code is looked up from ``assetName`` through
      ``asset_name_code_map``; the row is dropped when the name is unknown;
    * three or more codes -> the row is not carried over.

    Parameters
    ----------
    news_data : pandas.DataFrame
        Frame with an ``assetCodes`` column of lists and an ``assetName``
        column. It is not modified.
    asset_code_name_map : dict
        Known asset codes (only key membership is used).
    asset_name_code_map : dict
        Maps an asset name to one asset code.

    Returns
    -------
    pandas.DataFrame
        The input columns minus ``assetCodes`` and ``assetName``, plus
        ``assetCode``, with exact duplicate rows removed.
    """
    # Column-wise maps replace row-wise apply(): they return an empty column
    # for an empty frame, so no branch needs an empty-input special case.
    known_codes = news_data['assetCodes'].map(
        lambda codes: [code for code in codes if code in asset_code_name_map])
    news_data = news_data.assign(
        assetCodes=known_codes,
        countassetCodes=known_codes.map(len).astype('int64'))

    def _with_asset_code(rows, asset_code):
        """Return a copy of ``rows`` with ``asset_code`` appended as 'assetCode'."""
        rows = rows.copy()  # explicit copy: never write onto a slice of news_data
        rows['assetCode'] = asset_code.astype(object).values
        return rows

    single = news_data[news_data['countassetCodes'] == 1]
    single_code_data = _with_asset_code(
        single, single['assetCodes'].map(lambda codes: codes[0]))

    double = news_data[news_data['countassetCodes'] == 2]
    double_code_data_i = _with_asset_code(
        double, double['assetCodes'].map(lambda codes: codes[0]))
    double_code_data_ii = _with_asset_code(
        double, double['assetCodes'].map(lambda codes: codes[1]))

    unmapped = news_data[news_data['countassetCodes'] == 0]
    unmapped_code_data = _with_asset_code(
        unmapped,
        unmapped['assetName'].astype(object).map(
            lambda name: asset_name_code_map.get(name, '')))
    # TODO: add all the tickers corresponding to one asset name.
    mapped_data = unmapped_code_data[unmapped_code_data['assetCode'] != '']

    # Concatenate the pieces to make the data complete.
    complete_news_data = pd.concat(
        [single_code_data, double_code_data_i, double_code_data_ii, mapped_data],
        ignore_index=True)
    complete_news_data = complete_news_data.drop(
        columns=['assetCodes', 'countassetCodes', 'assetName'])
    return complete_news_data.drop_duplicates()

# Run the news pipeline on the training news: aggregate per day and key group,
# collapse the less frequent providers, then expand to one row per asset code.
news_data = group_providers(preprocess_news_data(news_data))
complete_news_data = preprocess_asset_code_news_data(
    news_data, asset_code_name_map, asset_name_code_map)

In [None]:
#Market Data Preprocessing
def preprocess_market_data(market_data,is_test_data=False):
    """Add the open-to-close return and strip columns the models do not use.

    The frame is edited in place and the same object is returned.

    Parameters
    ----------
    market_data : pandas.DataFrame
        Market frame with ``time``, ``open``, ``close`` and ``assetName``
        columns (plus ``universe`` unless ``is_test_data`` is set).
    is_test_data : bool, default False
        When False the ``universe`` column is dropped as well.

    Returns
    -------
    pandas.DataFrame
        ``market_data`` itself, with ``returnsOpenClose`` added, the listed
        columns removed and ``time`` reduced to a ``YYYY-MM-DD`` string.
    """
    market_data['returnsOpenClose'] = market_data['close'] / market_data['open'] - 1
    training_only = ['universe'] if is_test_data == False else []
    market_data.drop(columns=['close', 'open', 'assetName'] + training_only, inplace=True)
    calendar_day = pd.to_datetime(market_data['time']).dt.date
    market_data['time'] = calendar_day.astype(str)
    return market_data
# Clean the training market frame. preprocess_market_data() drops columns from
# the frame it receives, so re-running this cell on the already-processed frame
# fails: the 'close'/'open' columns it reads are gone.
market_data = preprocess_market_data(market_data)

In [None]:
# Look at the target column: keep rows whose returnsOpenNextMktres10 lies in
# [-1, 1] (rows with a missing target fail both comparisons) and display the
# deciles of what is left.
clean = market_data.loc[market_data['returnsOpenNextMktres10'] <= 1]
clean_data = clean.loc[clean['returnsOpenNextMktres10'] >= -1]
positive_data = clean_data.loc[clean_data['returnsOpenNextMktres10'] >= 0]
clean_y = clean_data['returnsOpenNextMktres10']
np.quantile(clean_y, [decile / 10 for decile in range(11)])
#plt.hist(clean_data.returnsOpenNextMktres10,bins=300)

In [None]:
#This is where modeling and calibration starts
#Merging Market and News Data
def merge_market_news_data(market_data,news_data):
    """Join market rows with news rows and one-hot encode the categorical columns.

    Parameters
    ----------
    market_data : pandas.DataFrame
        Market frame with ``time`` and ``assetCode`` columns.
    news_data : pandas.DataFrame
        News frame with ``time``, ``assetCode``, ``urgency``, ``provider`` and
        ``marketCommentary`` columns.

    Returns
    -------
    pandas.DataFrame
        Inner join on (time, assetCode) with ``marketCommentary`` cast to int,
        ``urgency`` and ``provider`` replaced by indicator columns, and
        duplicate rows and rows containing missing values removed.
    """
    join_keys = ['time', 'assetCode']
    # Two categorical variables: urgency, provider
    categorical = ['urgency', 'provider']
    merged = market_data.merge(news_data, how='inner', on=join_keys)
    merged['marketCommentary'] = merged['marketCommentary'].astype('int')
    encoded = pd.get_dummies(merged, prefix=categorical, columns=categorical)
    return encoded.drop_duplicates().dropna()
# Training table for the news-aware model: processed market rows joined with
# the per-asset-code news rows.
combined_data = merge_market_news_data(market_data,complete_news_data)

In [None]:
# Predictors for the news-aware model: every column after the first three,
# minus the target. The frame's own column order is kept so the list is the
# same in every kernel session; a set difference would order the names by
# string hash, which varies between sessions.
# NOTE(review): assumes the first three columns are identifiers / non-features; confirm.
combined_features = [
    column for column in combined_data.columns[3:].drop_duplicates()
    if column != 'returnsOpenNextMktres10'
]
combined_features

In [None]:
# Predictors for the market-only model: every column after the first three,
# minus the target. The frame's own column order is kept so the list is the
# same in every kernel session; a set difference would order the names by
# string hash, which varies between sessions.
# NOTE(review): assumes the first three columns are identifiers / non-features; confirm.
market_features = [
    column for column in market_data.columns[3:].drop_duplicates()
    if column != 'returnsOpenNextMktres10'
]
market_features

In [None]:
#build a XGBoost regression just using market data
def market_data_XGB(market_data,market_features,test_size=0.3,random_state=42):
    """Fit an XGBoost regressor on market features only.

    Rows with any missing value are discarded, as are rows where a feature or
    the target ``returnsOpenNextMktres10`` falls outside the open interval
    (-1, 1). The remaining rows are split at random into a fitting part and a
    hold-out part; the hold-out RMSE drives early stopping.

    Parameters
    ----------
    market_data : pandas.DataFrame
        Processed market frame containing ``market_features`` and the target.
        It is not modified.
    market_features : list of str
        Feature column names, in the order the model will expect them.
    test_size : float, default 0.3
        Fraction of rows held out for early stopping.
    random_state : int or None, default 42
        Seed for the random split, so repeated runs fit on the same rows.
        Pass None for a different split on every call.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    target = 'returnsOpenNextMktres10'
    complete_rows = market_data.dropna()
    # Build one boolean mask instead of re-filtering (and copying) the frame
    # twice per column.
    in_range = np.ones(len(complete_rows), dtype=bool)
    for column in list(market_features) + [target]:
        values = complete_rows[column].values
        in_range &= (values < 1.0) & (values > -1.0)
    training_rows = complete_rows[in_range]

    X_data = training_rows[market_features].values
    y_data = training_rows[[target]].values
    # NOTE(review): the split is random across rows, not chronological; confirm
    # that is intended for this data.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=test_size, random_state=random_state)
    xgb = xgboost.XGBRegressor(n_estimators=300, learning_rate=0.1, gamma=0, subsample=0.75,
                               colsample_bytree=1, max_depth=7)
    # NOTE(review): newer xgboost releases take eval_metric / early_stopping_rounds
    # on the estimator rather than fit(); confirm against the installed version.
    xgb.fit(X_train,y_train,eval_metric='rmse',eval_set=[(X_test,y_test)],early_stopping_rounds=10)
    return xgb
# Fit the market-only model on the processed training market frame.
market_XGB = market_data_XGB(market_data,market_features)

In [None]:
#build a XGBoost regression for combined data
def combined_data_XGB(combined_data,combined_features,test_size=0.3,random_state=42):
    """Fit an XGBoost regressor on the merged market + news features.

    Rows with any missing value are discarded. Features must lie in the closed
    interval [-1, 1] (inclusive, so indicator columns equal to 1 survive); the
    target ``returnsOpenNextMktres10`` must lie strictly inside (-1, 1). The
    remaining rows are split at random into a fitting part and a hold-out
    part; the hold-out RMSE drives early stopping.

    Parameters
    ----------
    combined_data : pandas.DataFrame
        Merged frame containing ``combined_features`` and the target. It is
        not modified.
    combined_features : list of str
        Feature column names, in the order the model will expect them.
    test_size : float, default 0.3
        Fraction of rows held out for early stopping.
    random_state : int or None, default 42
        Seed for the random split, so repeated runs fit on the same rows.
        Pass None for a different split on every call.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    target = 'returnsOpenNextMktres10'
    complete_rows = combined_data.dropna()
    # Build one boolean mask instead of re-filtering (and copying) the frame
    # twice per column.
    in_range = np.ones(len(complete_rows), dtype=bool)
    for column in combined_features:
        values = complete_rows[column].values
        in_range &= (values <= 1.0) & (values >= -1.0)
    target_values = complete_rows[target].values
    in_range &= (target_values < 1.0) & (target_values > -1.0)
    training_rows = complete_rows[in_range]

    X_data = training_rows[combined_features].values
    y_data = training_rows[[target]].values
    # NOTE(review): the split is random across rows, not chronological; confirm
    # that is intended for this data.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=test_size, random_state=random_state)
    xgb = xgboost.XGBRegressor(n_estimators=200, learning_rate=0.1, gamma=0, subsample=0.75,
                               colsample_bytree=1, max_depth=7)
    # NOTE(review): newer xgboost releases take eval_metric / early_stopping_rounds
    # on the estimator rather than fit(); confirm against the installed version.
    xgb.fit(X_train,y_train,eval_metric='rmse',eval_set=[(X_test,y_test)],early_stopping_rounds=10)
    return xgb

# Fit the news-aware model on the merged market + news training table.
combined_XGB = combined_data_XGB(combined_data,combined_features)

In [None]:
# Iterable of prediction days; the submission loop further down unpacks each
# item into (market frame, news frame, predictions template).
days = env.get_prediction_days()

In [None]:
def process_test_data(market_test,news_test,asset_code_name_map,asset_name_code_map,combined_features):
    """Run one day's market and news frames through the training-time pipeline.

    Parameters
    ----------
    market_test, news_test : pandas.DataFrame
        Raw market and news frames for the day.
    asset_code_name_map, asset_name_code_map : dict
        Lookup tables passed on to ``preprocess_asset_code_news_data``.
    combined_features : list of str
        Feature columns the combined model expects.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The merged market + news frame, then the processed market frame. Any
        column named in ``combined_features`` that the merged frame lacks is
        added and filled with 0.
    """
    market_test_data = preprocess_market_data(market_test, is_test_data=True)
    complete_news_test_data = preprocess_asset_code_news_data(
        group_providers(preprocess_news_data(news_test)),
        asset_code_name_map,
        asset_name_code_map)
    combined_test_data = merge_market_news_data(market_test_data, complete_news_test_data)
    # Columns seen in training but absent today still have to exist for the model.
    for absent in set(combined_features).difference(combined_test_data.columns):
        combined_test_data[absent] = 0
    return combined_test_data, market_test_data

def calculate_weight(pred_return):
    """Convert one predicted return into a signed weight.

    The magnitude is 1 while ``|pred_return|`` is below 0.1 and
    ``0.1 / |pred_return|`` otherwise; the sign follows ``pred_return``, so a
    prediction of exactly 0 gives 0.
    """
    magnitude = -pred_return if pred_return < 0 else pred_return
    scale = 1 if magnitude < 0.1 else 0.1 / magnitude
    return np.sign(pred_return) * scale

def assign_appropriate_weight(array):
    """Apply ``calculate_weight`` to each entry of ``array``.

    Returns a new float array with the same shape as ``array``.
    """
    weights = np.zeros(array.shape)
    for position, predicted in enumerate(array):
        weights[position] = calculate_weight(predicted)
    return weights

def make_news_based_predictions(combined_test_data,combined_features,xgb):
    """Score a frame with a fitted model and return one weight per asset.

    Despite the name, this is also used with the market-only model and
    market-only features.

    Parameters
    ----------
    combined_test_data : pandas.DataFrame
        Frame with an ``assetCode`` column and every column named in
        ``combined_features``. It is not modified.
    combined_features : list of str
        Feature columns, in the order the model was fitted on.
    xgb : fitted regressor
        Object exposing ``predict``.

    Returns
    -------
    pandas.DataFrame
        Columns ``assetCode`` and ``confidenceValue``, one row per asset. The
        value is ``assign_appropriate_weight`` applied to the mean prediction
        for that asset. An empty input yields an empty frame with these two
        columns.
    """
    if len(combined_test_data) == 0:
        # Nothing to score: skip the model call and hand back a typed empty
        # frame so concatenation with the other predictions keeps its dtypes.
        return pd.DataFrame({
            'assetCode': pd.Series([], dtype=object),
            'confidenceValue': pd.Series([], dtype='float64'),
        })

    predictions = xgb.predict(combined_test_data[combined_features].values)
    # Attach predictions to a fresh two-column frame. The caller's frame (which
    # may be a slice of a larger frame) is left untouched.
    scored = combined_test_data[['assetCode']].assign(pred_return=predictions)
    combined_submit_df = scored.groupby('assetCode')['pred_return'].mean().reset_index()
    combined_submit_df['confidenceValue'] = assign_appropriate_weight(
        combined_submit_df['pred_return'].values)
    return combined_submit_df[['assetCode', 'confidenceValue']]

#combined_test_data,market_test_data = process_test_data(market_test,news_test,asset_code_name_map,asset_name_code_map,combined_features)
#news_submission_df = make_news_based_predictions(combined_test_data,combined_features,combined_XGB)
#assets_predicted = list(news_submission_df.assetCode)
#relevant_market_test_data = market_test_data[~market_test_data['assetCode'].isin(assets_predicted)]
#market_submission_df = make_news_based_predictions(relevant_market_test_data,market_features,market_XGB)
#submit_df = pd.concat([news_submission_df,market_submission_df],ignore_index=True)

#print( 'size matches as expected : ', len(submit_df) == len(predictions_template_df))

In [None]:
# Score every prediction day and hand the result to the environment. Assets
# that have merged news rows are scored by the combined model; every other
# asset in the day's market frame is scored by the market-only model.
count = 1
for day_bundle in days:
    market_test, news_test, predictions_template_df = day_bundle
    combined_test_data, market_test_data = process_test_data(
        market_test, news_test, asset_code_name_map, asset_name_code_map, combined_features)
    news_submission_df = make_news_based_predictions(
        combined_test_data, combined_features, combined_XGB)
    assets_predicted = list(news_submission_df.assetCode)
    # Rows whose asset did not get a news-based prediction.
    without_news = ~market_test_data['assetCode'].isin(assets_predicted)
    relevant_market_test_data = market_test_data[without_news]
    market_submission_df = make_news_based_predictions(
        relevant_market_test_data, market_features, market_XGB)
    submit_df = pd.concat([news_submission_df, market_submission_df], ignore_index=True)
    print( 'size matches as expected : ', len(submit_df) == len(predictions_template_df))
    env.predict(submit_df)
    print( "prediction complete for day : ", count)
    count += 1
env.write_submission_file()