# Outcomes


This is an add on for Classification/Ensemble Methods since we ran into an issue with continuous data type for the label - which expected a binary outcome instead. In order to circumvent this problem, we decided to create a new column called "Outcomes" which compared "curr_open" and "curr_close". If "curr_close" > Open/Current Opening Stock Price, the "Outcomes" column was assigned with a 1 and vice versa. 

In [1]:
import pandas as pd
import pickle

In [2]:
msft_official_df = pd.read_pickle('final_features.pkl')
msft_labels_updated = pd.read_pickle('final_labels.pkl')

In [3]:
msft_official_df.head()

Unnamed: 0,past_open,past_high,past_low,past_close,past_volume,past_stock_splits,curr_open,SMA_10,SMA_50,SMA_100,sentiment,interaction_num
0,-1.281909,-1.281896,-1.283973,-1.27923,-0.318316,0.0,-1.275513,-1.266864,-1.308844,-1.246069,0.134796,0.000644
1,-1.274124,-1.275123,-1.272102,-1.271966,-0.639944,0.0,-1.274475,-1.265836,-1.308391,-1.245196,0.0,0.000318
2,-1.273086,-1.273687,-1.270211,-1.271863,-0.635337,0.0,-1.273645,-1.265391,-1.306719,-1.244642,0.151904,0.001376
3,-1.272256,-1.270198,-1.26979,-1.268854,-0.018263,0.0,-1.271673,-1.265116,-1.304247,-1.244015,0.184977,0.000586
4,-1.270284,-1.263323,-1.26874,-1.260449,-0.4086,0.0,-1.259842,-1.261588,-1.301087,-1.243183,0.294865,0.001276


In [4]:
mstf_not_scale = pd.read_pickle('msft_features_updated.pkl')

In [5]:
mstf_not_scale.head()

Unnamed: 0,past_open,past_high,past_low,past_close,past_volume,past_stock_splits,yesterday,curr_open,today
0,48.972503,49.341801,48.423059,49.224709,24944300.0,0.0,2016-04-12,49.64805,2016-04-13
1,49.64805,49.936283,49.440884,49.855217,20818000.0,0.0,2016-04-13,49.738134,2016-04-14
2,49.738134,50.062396,49.603023,49.864235,20877100.0,0.0,2016-04-14,49.810182,2016-04-15
3,49.810182,50.368632,49.639045,50.125439,28793800.0,0.0,2016-04-15,49.98132,2016-04-18
4,49.98132,50.972118,49.729114,50.855022,23786000.0,0.0,2016-04-18,51.008153,2016-04-19


In [6]:
mstf_not_scale = mstf_not_scale.drop(columns=['today'])

We subsetted the data from April 12, 2016 to December 31, 2019 based on the availability of the Kaggle datasets and the data collected from yfinance. 

In [7]:
mstf_not_scale = mstf_not_scale[mstf_not_scale['yesterday'] <= '2019-12-31']

In [8]:
mstf_not_scale = mstf_not_scale.drop(mstf_not_scale.index[-1])

In [9]:
mstf_not_scale.head()

Unnamed: 0,past_open,past_high,past_low,past_close,past_volume,past_stock_splits,yesterday,curr_open
0,48.972503,49.341801,48.423059,49.224709,24944300.0,0.0,2016-04-12,49.64805
1,49.64805,49.936283,49.440884,49.855217,20818000.0,0.0,2016-04-13,49.738134
2,49.738134,50.062396,49.603023,49.864235,20877100.0,0.0,2016-04-14,49.810182
3,49.810182,50.368632,49.639045,50.125439,28793800.0,0.0,2016-04-15,49.98132
4,49.98132,50.972118,49.729114,50.855022,23786000.0,0.0,2016-04-18,51.008153


In [10]:
msft_labels_updated.head()

Unnamed: 0,Close
0,49.855217
1,49.864235
2,50.125439
3,50.855022
4,50.791977


In [11]:
merged_df = pd.concat([mstf_not_scale, msft_labels_updated], axis=1)

In [12]:
merged_df['outcome'] = (merged_df['Close'] > merged_df['curr_open']).astype(int)

In [13]:
merged_df.head()

Unnamed: 0,past_open,past_high,past_low,past_close,past_volume,past_stock_splits,yesterday,curr_open,Close,outcome
0,48.972503,49.341801,48.423059,49.224709,24944300.0,0.0,2016-04-12,49.64805,49.855217,1
1,49.64805,49.936283,49.440884,49.855217,20818000.0,0.0,2016-04-13,49.738134,49.864235,1
2,49.738134,50.062396,49.603023,49.864235,20877100.0,0.0,2016-04-14,49.810182,50.125439,1
3,49.810182,50.368632,49.639045,50.125439,28793800.0,0.0,2016-04-15,49.98132,50.855022,1
4,49.98132,50.972118,49.729114,50.855022,23786000.0,0.0,2016-04-18,51.008153,50.791977,0


From this completed column of outcomes, we were able to implement this into our notebook, **Classification/Ensemble Methods** and use the "Outcomes" for our labels. *This worked and we were able to create a classification report as well.

In [21]:
merged_df['outcome'].to_pickle('outcome.pkl')

In [14]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [15]:
msft_official_df.columns

Index(['past_open', 'past_high', 'past_low', 'past_close', 'past_volume',
       'past_stock_splits', 'curr_open', 'SMA_10', 'SMA_50', 'SMA_100',
       'sentiment', 'interaction_num'],
      dtype='object')

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# First extract our test data and store it in x_test, y_test
features = msft_official_df.to_numpy()  
labels = merged_df['outcome'].to_numpy()  
_x, x_test, _y, y_test = train_test_split(features, labels, test_size=0.20, random_state=42)

train_features = _x
train_labels = _y

test_features = x_test
test_labels = y_test


In [17]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svm_linear = SVC(kernel="linear")

svm_linear.fit(train_features,train_labels)

# now predict and compute acuracy
test_actual = test_labels
test_predicted = svm_linear.predict(test_features)

print (classification_report(test_actual,test_predicted))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        88
           1       0.53      1.00      0.69       100

    accuracy                           0.53       188
   macro avg       0.27      0.50      0.35       188
weighted avg       0.28      0.53      0.37       188



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Generate some random continuous data
X = msft_official_df # 1000 samples with 5 features each
y = msft_labels_updated # Continuous target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and test different classifiers
models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Gradient Boosted Trees', GradientBoostingRegressor()),
    ('Support Vector Machine', SVR()),
    ('Neural Network', MLPRegressor())
]

for name, model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Test the model
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print the results
    print(name)
    print(f'Mean squared error: {mse:.2f}')
    print(f'R^2 score: {r2:.2f}')
    print('\n')

Linear Regression
Mean squared error: 1.04
R^2 score: 1.00




  model.fit(X_train, y_train)


Decision Tree
Mean squared error: 2.25
R^2 score: 1.00




Random Forest
Mean squared error: 1.28
R^2 score: 1.00




  y = column_or_1d(y, warn=True)


Gradient Boosted Trees
Mean squared error: 1.51
R^2 score: 1.00




  y = column_or_1d(y, warn=True)


Support Vector Machine
Mean squared error: 22.13
R^2 score: 0.97




  y = column_or_1d(y, warn=True)


Neural Network
Mean squared error: 26.93
R^2 score: 0.97




In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier


#lr_vanilla = LogisticRegression(penalty="none") # OR penalty="none" depending on your sklearn version
svm_linear = SVC(kernel="linear")
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=23) # some random seed for reproducibility
grad_boost = GradientBoostingClassifier()
#voting = VotingClassifier(estimators=[("1",lr_vanilla),("2",svm_linear),("3",dt)])

#"lr":lr_vanilla, #"voting":voting}
all_models = {"svm":svm_linear,
              "decision_tree":dt,
              "random_forest":rf,
              "grad_boost":grad_boost}
            

print (f"We are working with classifiers {all_models.keys()}")

We are working with classifiers dict_keys(['svm', 'decision_tree', 'random_forest', 'grad_boost'])


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# First extract our test data and store it in x_test, y_test
features = msft_official_df.to_numpy()  
labels = msft_labels_updated.to_numpy()  
_x, x_test, _y, y_test = train_test_split(features, labels, test_size=0.10, random_state=42)

k = 5 # 5-fold

# We can use sklearn's cross validation score directly
# We can speed up training using n_jobs parameter which specifies how many cpu_cores to use

best_model_name = ""
best_model_valid_accuracy = 0
best_model = None

for model_name in all_models.keys():
    model = all_models[model_name]
    cv_scores = cross_val_score(model,_x,_y.flatten(), cv=k, n_jobs=4, error_score='raise')
    average_cv_score = cv_scores.mean()
    print (f"Mean cross validation accuracy for model {model_name} = {average_cv_score}")

    if average_cv_score > best_model_valid_accuracy :
        best_model_name = model_name
        best_model_valid_accuracy  = average_cv_score
        best_model = model

print (f"Best model is {best_model_name} with {k}-fold accuracy of {best_model_valid_accuracy}")

ValueError: Unknown label type: 'continuous'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6f501a8e-f6e4-43a9-b4a2-6516b2e3d333' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>