In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime, timedelta

In [27]:
data = pd.read_csv('data/data_2023-01-11_16-56-10.csv')
data = data.drop_duplicates()
data = data[data['Ticker'].notna()]
data = data.drop(['X', '1d', '1w', '1m', '6m'], axis=1)
data = data.sort_values(by=['Ticker', 'Trade Date'])
data.head()

Unnamed: 0,Filing Date,Trade Date,Ticker,Company Name,Insider Name,Title,Trade Type,Price,Qty,Owned,ΔOwn,Value
15271,2022-09-06 16:11:29,2022-09-01,.AX,"Life360, Inc.",Rice David William,COO,S - Sale,10.0,-12962,379870,-3%,-129589
14319,2022-09-12 17:06:38,2022-09-09,.AX,"Life360, Inc.",Kapoor Samir,CTO,S - Sale,11.46,-18037,59971,-23%,-206762
6116,2022-11-18 18:33:09,2022-11-16,.AX,"Life360, Inc.",Kapoor Samir,CTO,S - Sale,14.0,-11175,51457,-18%,-156440
38807,2022-03-08 16:40:25,2022-03-04,A,"Affinity Bancshares, Inc.",Nelson Clark,"EVP, CCO",P - Purchase,15.5,500,12778,+4%,7750
37789,2022-03-14 13:40:17,2022-03-11,A,Ames National Corp,Hagan Patrick G,Dir,P - Purchase,23.75,264,5000,+6%,6270


In [28]:
def preprocess_data():
    global data
    data = data.drop(data[data['Price'] == 0.0].index)
    data = data.drop(data[data['Ticker'].str.contains('\.')].index)
    data = data.drop(data[data['Trade Type'].str.contains('Sale')].index)
    data['ΔOwn'] = data['ΔOwn'].replace('New', '+100%')
    data['ΔOwn'] = data['ΔOwn'].replace('>999%', '+999%')
    data['ΔOwn'] = data['ΔOwn'].str[:-1].astype(int)


preprocess_data()
data.head()

Unnamed: 0,Filing Date,Trade Date,Ticker,Company Name,Insider Name,Title,Trade Type,Price,Qty,Owned,ΔOwn,Value
38807,2022-03-08 16:40:25,2022-03-04,A,"Affinity Bancshares, Inc.",Nelson Clark,"EVP, CCO",P - Purchase,15.5,500,12778,4,7750
37789,2022-03-14 13:40:17,2022-03-11,A,Ames National Corp,Hagan Patrick G,Dir,P - Purchase,23.75,264,5000,6,6270
46481,2022-01-06 16:30:12,2022-01-06,AAIC,Arlington Asset Investment Corp.,Tonkel J Rock Jr,"Pres, CEO",P - Purchase,3.61,10000,774848,1,36070
34906,2022-04-01 16:15:28,2022-03-30,AAIC,Arlington Asset Investment Corp.,Tonkel J Rock Jr,"Pres, CEO",P - Purchase,3.52,20000,794848,3,70370
34328,2022-04-06 10:06:06,2022-04-04,AAIC,Arlington Asset Investment Corp.,Tonkel J Rock Jr,"Pres, CEO",P - Purchase,3.59,10000,804848,1,35870


In [29]:
data['Change'] = pd.Series(dtype='float64')


def get_change_info():
    global data
    for ticker in data['Ticker'].unique():
        only_ticker_data = data[data['Ticker'] == ticker]
        ticker_trade_date_range = only_ticker_data['Trade Date']
        first_date, last_date = ticker_trade_date_range.iloc[0], ticker_trade_date_range.iloc[-1]
        last_day_30 = (datetime.strptime(last_date, "%Y-%m-%d") + timedelta(days=30)).strftime("%Y-%m-%d")
        ticker_data = yf.download(ticker, start=first_date, end=last_day_30)
        ticker_data.reset_index(inplace=True)
        for idx, row in only_ticker_data.iterrows():
            try:
                end_date = (datetime.strptime(row['Trade Date'], "%Y-%m-%d") + timedelta(days=30))
                ticker_data_subset = ticker_data.loc[
                    (ticker_data['Date'] >= row['Trade Date']) & (ticker_data['Date'] <= end_date)]
                change = round(((ticker_data_subset['Adj Close'].max() - ticker_data_subset.iloc[0]['Adj Close']) /
                                ticker_data_subset.iloc[0]['Adj Close']) * 100, 2)
                # print(change)
                data.at[idx, 'Change'] = change
            except Exception:
                data.at[idx, 'Change'] = -1000


get_change_info()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [32]:
data.to_csv('DONT_REMOVE.csv', sep=';', index=False)

# Tworzenie modelu

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

In [2]:
data = pd.read_csv('data/DONT_REMOVE.csv', delimiter=';')
data = data.drop(data[data['Change'] == -1000].index)
data = data.drop(columns=['Filing Date', 'Trade Date', 'Ticker', 'Company Name', 'Insider Name', 'Title', 'Trade Type'])
data.head()

Unnamed: 0,Price,Qty,Owned,ΔOwn,Value,Change
0,15.5,500,12778,4,7750,4.03
1,23.75,264,5000,6,6270,6.89
2,3.61,10000,774848,1,36070,0.84
3,3.52,20000,794848,3,70370,1.72
4,3.59,10000,804848,1,35870,0.0


In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
X = data.drop(columns=['Change'])
y = data['Change']
X.shape

(14445, 5)

In [5]:
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_y = scaler.fit_transform(y.values.reshape(-1, 1))

In [6]:
train_x, test_x = X.iloc[:11_000], X.iloc[11_000:]
train_y, test_y = y.iloc[:11_000], y.iloc[11_000:]

In [7]:
model = XGBRegressor()

In [8]:
model.fit(train_x, train_y)

In [9]:
preds = model.predict(test_x)

In [10]:
test_y[:10]

11236     16.00
11237     16.00
11238      0.00
11239     38.10
11240     82.35
11241     60.00
11242      0.00
11243     20.00
11244    200.00
11245     12.01
Name: Change, dtype: float64

In [11]:
preds

array([29.921925 ,  6.5776834,  5.9599133, ..., 13.535617 ,  9.8570795,
       22.831522 ], dtype=float32)

In [12]:
import matplotlib.pyplot as plt

In [13]:
%matplotlib qt
plt.plot(preds)
plt.plot(test_y.values)

[<matplotlib.lines.Line2D at 0x1be71905190>]

In [86]:
data.iloc[2320]

Price        23.75
Qty         200.00
Owned     43751.00
ΔOwn          0.00
Value      4750.00
Change        3.78
Name: 2345, dtype: float64

In [87]:
import pickle
pickle.dump(model, open('models/xgmodel.h5', 'wb'))

In [20]:
dataset = pd.read_csv('DONT_REMOVE.csv', sep=';')

In [21]:
dataset.index.max

<bound method RangeIndex.max of RangeIndex(start=0, stop=14777, step=1)>

In [28]:
sliced_data = dataset.iloc[test_x.index]
sliced_data['preds'] = preds
sliced_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced_data['preds'] = preds


Unnamed: 0,Filing Date,Trade Date,Ticker,Company Name,Insider Name,Title,Trade Type,Price,Qty,Owned,ΔOwn,Value,Change,preds
11236,2022-01-24 20:33:45,2022-01-24,RKFL,"Rocketfuel Blockchain, Inc.",Jensen Peter Michael,CEO,P - Purchase,0.22,34041,1228673,3,7489,16.0,29.921925
11237,2022-01-24 20:31:15,2022-01-24,RKFL,"Rocketfuel Blockchain, Inc.",Yankowitz Bennett J.,CFO,P - Purchase,0.22,4000,1434000,0,892,16.0,6.577683
11238,2022-02-16 13:35:15,2022-02-16,RKFL,"Rocketfuel Blockchain, Inc.",Yankowitz Bennett J.,CFO,P - Purchase,0.25,4000,1448417,0,1000,0.0,5.959913
11239,2022-02-24 16:39:54,2022-02-24,RKFL,"Rocketfuel Blockchain, Inc.",Yankowitz Bennett J.,CFO,P - Purchase,0.22,5000,1453417,0,1100,38.1,5.672043
11240,2022-07-18 06:15:27,2022-07-15,RKFL,"Rocketfuel Blockchain, Inc.",Jensen Peter Michael,CEO,P - Purchase,0.18,10000,1587776,1,1800,82.35,24.533602


In [29]:
sliced_data.to_csv('data/do_analizy.csv', index=False, sep=';')