In [34]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import *

In [6]:
df_trans_buy = pd.read_csv("trans_buy.csv", encoding = "Big5")
df_trans_stock = pd.read_csv("trans_stock.csv", encoding = "Big5")
df_trans_stock.columns = ['yyyymm', 'id_number', 'certificate', 'fund_id', 'principal', 'present_value', 'interest', 'surplus', 'buy_date', 'invest_configure']
df_trans_buy.columns = ['id_number', 'certificate', 'fund_id', 'buy_date','deduction_num', 'deduction_local_amount']
df_trans_stock['merge_key'] = df_trans_stock['yyyymm'].astype(str) + df_trans_stock['id_number'] + df_trans_stock['fund_id']


In [22]:
df_trans_stock['certificate'].value_counts()[400000: 6400000]

118236C1700002    14
034UW431500032    14
10161R41800037    14
183TV3C1800066    14
15883N51900006    14
                  ..
146J8701600030     1
028J1D41600168     1
139FW1C2000027     1
162UN831700020     1
023J0240400230     1
Name: certificate, Length: 432542, dtype: int64

In [3]:
df_trans_stock['merge_key'].value_counts().head(100)

202005Y2729879280J84    132
202004Y2729879280J84    132
202006Y2729879280J84    132
202007Y2729879280J84    121
202004N2715496490770    110
                       ... 
202006J2726915360T34     52
201910J2726915360T34     51
201605A1926728450CJ1     51
201909J2726915360T34     51
201812J2726915360T38     51
Name: merge_key, Length: 100, dtype: int64

In [26]:
df_trans_buy['certificate'].value_counts()
df_trans_buy[df_trans_buy['certificate']=='16055R51600008']

Unnamed: 0,id_number,certificate,fund_id,buy_date,deduction_num,deduction_local_amount
128615,A2718617610,16055R51600008,55R,20160215,2,21961.8
224023,N1779543590,16055R51600008,55R,20160215,55,212872.68


In [24]:
df_trans_stock[df_trans_stock['certificate']=='118236C1700002']

Unnamed: 0,yyyymm,id_number,certificate,fund_id,principal,present_value,interest,surplus,buy_date,invest_configure,merge_key
850979,201702,C1795731150,118236C1700002,236,442362.0,440564.0,0.0,-1798.0,2017-02-14 00:00:00.000,b.單筆申購,201702C1795731150236
4014973,201709,C1795731150,118236C1700002,236,451060.0,439172.0,3395.73,-8492.27,2017-02-14 00:00:00.000,b.單筆申購,201709C1795731150236
4102225,201703,C1795731150,118236C1700002,236,436239.0,435769.0,823.5,353.5,2017-02-14 00:00:00.000,b.單筆申購,201703C1795731150236
4275644,201704,C1795731150,118236C1700002,236,433323.0,433323.0,1235.25,1235.25,2017-02-14 00:00:00.000,b.單筆申購,201704C1795731150236
4679469,201706,C1795731150,118236C1700002,236,442579.0,436444.0,2092.66,-4042.34,2017-02-14 00:00:00.000,b.單筆申購,201706C1795731150236
4714602,201705,C1795731150,118236C1700002,236,440078.0,437254.0,1661.53,-1162.47,2017-02-14 00:00:00.000,b.單筆申購,201705C1795731150236
4787396,201708,C1795731150,118236C1700002,236,452448.0,439216.0,2964.6,-10267.4,2017-02-14 00:00:00.000,b.單筆申購,201708C1795731150236
5535806,201707,C1795731150,118236C1700002,236,444248.0,435021.0,2518.94,-6708.06,2017-02-14 00:00:00.000,b.單筆申購,201707C1795731150236
5964215,201711,C1795731150,118236C1700002,236,448691.0,431086.0,4248.29,-13356.71,2017-02-14 00:00:00.000,b.單筆申購,201711C1795731150236
6033726,201802,C1795731150,118236C1700002,236,458559.0,427381.0,5517.45,-25660.55,2017-02-14 00:00:00.000,b.單筆申購,201802C1795731150236


In [4]:
gain = len(df_trans_stock[df_trans_stock['surplus'] > 0].index)
total = len(df_trans_stock)
print(gain/total)

0.46403621221862434


In [31]:
date = pd.to_datetime(df_trans_stock['yyyymm'].astype(str), format="%Y%m")


NameError: name 'datetime' is not defined

In [40]:
date = date + pd.DateOffset(months=1)
df_trans_stock['n_yyyymm'] = date.dt.strftime('%Y%m')
df_trans_stock

Unnamed: 0,yyyymm,id_number,certificate,fund_id,principal,present_value,interest,surplus,buy_date,invest_configure,merge_key,n_yyyymm
0,201602,O1227254710,018MU710800562,MU7,85000.0,39910.0,0.00,-45090.00,2008-03-14 00:00:00.000,a.定時定額,201602O1227254710MU7,201606
1,201602,O1227509270,103DA131500019,DA1,26000.0,26064.0,0.00,64.00,2015-07-22 00:00:00.000,a.定時定額,201602O1227509270DA1,201606
2,201602,O1227604200,018Q1321000113,Q13,30000.0,27629.0,0.00,-2371.00,2010-02-02 00:00:00.000,b.單筆申購,201602O1227604200Q13,201606
3,201602,O1227604200,018S4800500059,S48,80000.0,169690.0,0.00,89690.00,2005-01-21 00:00:00.000,b.單筆申購,201602O1227604200S48,201606
4,201602,O1227719050,193J0T41500295,J0T,66985.0,53104.0,0.00,-13881.00,2015-06-04 00:00:00.000,b.單筆申購,201602O1227719050J0T,201606
...,...,...,...,...,...,...,...,...,...,...,...,...
16869881,202011,Y2222586790,110Z1020700203,Z10,3000.0,5718.0,0.00,2718.00,2007-07-20 00:00:00.000,b.單筆申購,202011Y2222586790Z10,202103
16869882,202011,Y2223174260,107L1701600111,L17,320000.0,250278.0,84440.00,14718.00,2016-05-24 00:00:00.000,b.單筆申購,202011Y2223174260L17,202103
16869883,202011,Y2223819530,11088U42000540,88U,288241.0,297330.0,0.00,9089.00,2020-11-19 00:00:00.000,b.單筆申購,202011Y222381953088U,202103
16869884,202011,Y2223906060,138Y3842000223,Y38,288241.0,153576.0,0.00,-134665.00,2020-08-05 00:00:00.000,b.單筆申購,202011Y2223906060Y38,202103
