# Main function

In [4]:
import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup as bs4
import re
import pandas as pd
import numpy as np
import json

class MOPS_ALL_2018(object):
    """Scraper for one company's quarterly report page on mops.twse.com.tw.

    Creating an instance builds the report URL from the given identifiers and
    immediately downloads the page (see ``get_tbl``).  The ``fetch_*`` methods
    then convert individual HTML tables from that page into plain lists and
    dicts that can be serialised with ``json``.
    """

    def __init__(self, sid, year, season, rid):
        """Store the query parameters, build the URL and download the tables.

        Args:
            sid: Value substituted for ``CO_ID`` in the request URL.
            year: Value substituted for ``SYEAR``.
            season: Value substituted for ``SSEASON``.
            rid: Value substituted for ``REPORT_ID``.

        Raises:
            requests.exceptions.RequestException: propagated from ``get_tbl``
                when the HTTP request itself fails.
        """
        self.sid = sid
        self.year = year
        self.season = season
        self.rid = rid
        keyword = {'sid': self.sid, 'y': self.year, 's': self.season, 'rid': self.rid}
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
        self.url = 'https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID={sid}&SYEAR={y}&SSEASON={s}&REPORT_ID={rid}'.format(**keyword)
        self.get_tbl()

    def fetchall(self):
        """Extract tables 1-3 and bundle them into one JSON-ready dict.

        ``fetch_SES`` is deliberately not part of this bundle; call it
        separately when the fourth statement is needed.

        Returns:
            ``{'data': [Balance, ProfitLoss, CashFlows]}`` where each element
            is the dict returned by the matching ``fetch_*`` method.

        Raises:
            IndexError: if the page holds fewer than four tables.
        """
        self.fetch_BS()
        self.fetch_PLS()
        self.fetch_CFS()
        self.jsondata = {'data': [self.Balance, self.ProfitLoss, self.CashFlows]}

        return self.jsondata

    def get_tbl(self):
        """Download the report page and collect every ``<table>`` element.

        Returns:
            The list stored in ``self.tables``.  It is empty when the response
            status is not 200 or when parsing the page fails.

        Raises:
            requests.exceptions.RequestException: network-level failures are
                not caught here so that callers can decide how to retry.
        """
        # Define the attribute up front: previously a non-200 response or a
        # parse failure left it unset and ``return self.tables`` raised
        # AttributeError from inside __init__.
        self.tables = []

        # The context manager closes the session (and its pooled connections)
        # once the response body has been downloaded.
        with requests.session() as web_ss:
            web_ss.mount("https://", HTTPAdapter(max_retries=3))
            res = web_ss.get(url=self.url, headers=self.header, timeout=5)
        # Must be set before ``res.text`` is read so the body is decoded as Big5.
        res.encoding = 'big5'

        if res.status_code != 200:
            return self.tables

        try:
            soup = bs4(res.text, "lxml")
            self.tables = soup.select('table')
        except Exception as e:
            # Best effort: report the parse problem and fall back to "no tables".
            print(e)

        return self.tables

    def _first_two_cells(self, Ntable):
        """Return the first two cleaned cell texts of each ``<td>`` row of a table.

        Rows without any ``<td>`` (for example header-only rows) are skipped.
        Cell text is stripped and thousands separators (``,``) are removed.
        """
        rows = []
        for tr in self.tables[Ntable].select('tr'):
            cells = [td.text.strip().replace(',', '') for td in tr.select("td")]
            if cells:
                rows.append(cells[:2])
        return rows

    def fetch_BS(self, Ntable=1):
        """Extract table ``Ntable`` into ``self.Balance``.

        Returns:
            ``{'Balance': rows}`` where each row is a list of at most two strings.

        Raises:
            IndexError: if ``self.tables`` has no entry at ``Ntable``.
        """
        self.Balance = {'Balance': self._first_two_cells(Ntable)}
        return self.Balance

    def fetch_PLS(self, Ntable=2):
        """Extract table ``Ntable`` into ``self.ProfitLoss``.

        Returns:
            ``{'ProfitLoss': rows}`` where each row is a list of at most two strings.

        Raises:
            IndexError: if ``self.tables`` has no entry at ``Ntable``.
        """
        self.ProfitLoss = {'ProfitLoss': self._first_two_cells(Ntable)}
        return self.ProfitLoss

    def fetch_CFS(self, Ntable=3):
        """Extract table ``Ntable`` into ``self.CashFlows``.

        Returns:
            ``{'CashFlows': rows}`` where each row is a list of at most two strings.

        Raises:
            IndexError: if ``self.tables`` has no entry at ``Ntable``.
        """
        self.CashFlows = {'CashFlows': self._first_two_cells(Ntable)}
        return self.CashFlows

    def fetch_SES(self, Ntable=4):
        """Extract table ``Ntable`` with all of its columns into ``self.equity_dict``.

        Unlike the other ``fetch_*`` methods every ``<td>`` of a row is kept,
        and the text of every ``<th>`` in the table is collected as column labels.
        Newlines are removed from all texts; commas are removed from cell values.

        Returns:
            ``{'col_index': [th texts], 'row_value': [[td texts], ...]}``.

        Raises:
            IndexError: if ``self.tables`` has no entry at ``Ntable``.
        """
        column = []
        tbl = []

        for tr in self.tables[Ntable].select("tr"):
            row = [
                td.text.strip().replace("\n", "").replace(",", "")
                for td in tr.select("td")
            ]
            if row:
                tbl.append(row)

            column.extend(
                th.text.strip().replace("\n", "") for th in tr.select("th")
            )

        self.equity_dict = {'col_index': column, 'row_value': tbl}
        return self.equity_dict
        
#         self.equity = pd.DataFrame(tbl,columns = column)


#         self.equity = self.equity.replace("", np.nan)
#         self.equity = self.equity.fillna(0)
#         self.equity['items'] = self.equity.iloc[:,0]
#         self.equity = self.equity.iloc[:,1:]
        
#         self.equity.set_index('items', inplace=True) 
#         return self.equity
#         return print(tbl)

# Fetch DATA

In [None]:
import os
import time
import random
from requests.exceptions import ConnectionError
from codes import codes


def dlcheck(path):
    """List the four-character prefix of every entry name found in *path*.

    The download loop names its output files ``<code>-<year>-Q<season>...``,
    so the prefix is used there as the code of an already-saved file.

    Args:
        path: Directory to scan with ``os.listdir``.

    Returns:
        A list with one prefix per directory entry, in ``os.listdir`` order.
        Names shorter than four characters are returned whole.
    """
    return [entry[:4] for entry in os.listdir(path)]

# Reporting period to download.
year='2017'
season='3'
# Output folders: `path` receives the BS/PLS/CFS JSON files, `csvpath` the
# SES JSON files.
path=year+'Q'+season
csvpath=year+'Q'+season+'SES'

# exist_ok makes this safe to re-run without a separate existence check.
os.makedirs(path, exist_ok=True)
os.makedirs(csvpath, exist_ok=True)

# Codes present in BOTH folders are treated as already downloaded; a code
# found in only one folder is fetched again.
code_cap=set(dlcheck(path)) & set(dlcheck(csvpath))
print(code_cap)


for code,v in codes.items():
    # Only listed ("上市") stocks ("股票") are processed.
    if not (v.type=="股票" and v.market=="上市"):
        continue
    if code in code_cap:
        continue

    try:
        # Try report id 'C' first; when the page has fewer than three tables,
        # wait a little and retry with report id 'A'.
        stock=MOPS_ALL_2018(code,year,season,'C')
        if len(stock.tables) < 3:
            time.sleep(random.uniform(2,5))
            stock=MOPS_ALL_2018(code,year,season,'A')
            if len(stock.tables) < 3:
                with open('nofinreport.txt', 'a') as f:
                    f.write('nodata:'+code+':'+stock.url+'\n')
                print(stock.url)
                print(code +' no finance data!')
                continue

        # The checks above only guarantee three tables, while the fetch
        # methods index tables 1-4 by default.  Extract everything before
        # writing so a short page is logged and skipped instead of aborting
        # the whole run with IndexError or leaving a half-written pair of files.
        try:
            print('fetch BS, PLS and CFS from:' ,stock.url)
            data = stock.fetchall()
            print('fetch SES from:' ,stock.url)
            ses_data = stock.fetch_SES()
        except IndexError:
            print('error-nodata:', code)
            with open('error.log', 'a+') as f:
                f.write('nodata:'+ code+':' + stock.url+'\n\n')
        else:
            filename=path+'/'+code+'-'+year+'-'+'Q'+season+'.json'
            with open(filename, 'w', encoding='utf8') as f:
                json.dump(data, f)

            jsonname=csvpath+'/'+code+'-'+year+'-'+'Q'+season+'_ses.json'
            with open(jsonname, 'w', encoding='utf8') as f:
                json.dump(ses_data, f)

        # Random pause between stocks to avoid hammering the server.
        time.sleep(random.uniform(2,5))

    except ConnectionError:
        # Refresh the list of finished codes, back off for two minutes and
        # move on.  The stock that triggered the error is not retried in this
        # run; it is picked up on the next run because it is absent from
        # the output folders.
        code_cap=set(dlcheck(path)) & set(dlcheck(csvpath))
        time.sleep(120)
        continue
print('download finished!')

{'3380', '2038', '2354', '6449', '4915', '8213', '2820', '2897', '9917', '1467', '2102', '4562', '3021', '5880', '2404', '2231', '1236', '4148', '1760', '6442', '1507', '6131', '2344', '8110', '2409', '2880', '1465', '1512', '1464', '1234', '2911', '6117', '6573', '1219', '1587', '1319', '2901', '2236', '6581', '6431', '2484', '2032', '1783', '1712', '4545', '6230', '1313', '1310', '1707', '8046', '1437', '1256', '3051', '9935', '1232', '6464', '1806', '5607', '1710', '9930', '2449', '6541', '1457', '5285', '2331', '4912', '4532', '3356', '2316', '2530', '1316', '4952', '3040', '2906', '2723', '2239', '2637', '8341', '5871', '1504', '2881', '2642', '2612', '2012', '1702', '2402', '2841', '3028', '1452', '2450', '3665', '8443', '2103', '2480', '3653', '1528', '2832', '9910', '2312', '2013', '1604', '2707', '2431', '2002', '3013', '5469', '3266', '2939', '1220', '6176', '9933', '4722', '1539', '1538', '1795', '2501', '1476', '6177', '1720', '6152', '2468', '2712', '2408', '2882', '6582',