In [None]:
! pip install demjson
! pip install xlsxwriter
import json
import urllib.request
import pandas as pd
import numpy as np
import demjson
import xlsxwriter

In [None]:
def scrapping_eastmoney_kcb_single_year(statement, year, page_numbers):
  """Fetch one report year of a financial statement from the Eastmoney data-centre API.

  Pages are requested in the order given (50 records per page, per the ``ps=50``
  query parameter) until ``page_numbers`` is exhausted or a response carries no
  ``result`` payload.

  Args:
    statement: "BS" (balance sheet), "PL" (income statement) or
      "RP" (achievement report).
    year: report year; the request filters on the ``<year>-12-31`` report date.
    page_numbers: iterable of page indices to request.

  Returns:
    A DataFrame with the columns kept for ``statement`` from every fetched page,
    or an empty DataFrame when no page returned data.

  Raises:
    ValueError: if ``statement`` is not one of the supported codes.
  """
  # Request template and retained columns per statement type. The templates are
  # kept verbatim because they differ in report type, sort key, JSONP variable
  # name and even the spelling of the date filter (REPORT_DATE vs REPORTDATE).
  statement_specs = {
    "BS": (
      "http://datacenter.eastmoney.com/api/data/get?type=RPT_DMSK_FN_BALANCE&sty=ALL&p={}"
      "&ps=50&st=NOTICE_DATE,SECURITY_CODE&sr=-1,-1&var=VqQfOphg&filter=(SECURITY_TYPE_CODE=%22058001001%22)"
      "(TRADE_MARKET_CODE=%22069001001006%22)(REPORT_DATE=%27{}-12-31%27)&rt=53280464",
      ["SECURITY_CODE","SECURITY_NAME_ABBR","INDUSTRY_NAME","ACCOUNTS_RECE","INVENTORY","TOTAL_EQUITY","CURRENT_RATIO","DEBT_ASSET_RATIO"],
    ),
    "PL": (
      "http://datacenter.eastmoney.com/api/data/get?type=RPT_DMSK_FN_INCOME&sty=ALL&p={}"
      "&ps=50&st=NOTICE_DATE,SECURITY_CODE&sr=-1,-1&var=moDdrKXO&filter=(SECURITY_TYPE_CODE=%22058001001%22)"
      "(TRADE_MARKET_CODE=%22069001001006%22)(REPORT_DATE=%27{}-12-31%27)&rt=53280464",
      ["SECURITY_CODE","PARENT_NETPROFIT","TOTAL_OPERATE_INCOME","OPERATE_PROFIT_RATIO"],
    ),
    "RP": (
      "http://datacenter.eastmoney.com/api/data/get?type=RPT_LICO_FN_CPD&sty=ALL&p={}"
      "&ps=50&st=UPDATE_DATE,SECURITY_CODE&sr=-1,-1&var=lRYCxRdi&filter=(SECURITY_TYPE_CODE=%22058001001%22)"
      "(TRADE_MARKET_CODE=%22069001001006%22)(REPORTDATE=%27{}-12-31%27)&rt=53280464",
      ["SECURITY_CODE","WEIGHTAVG_ROE","XSMLL"],
    ),
  }

  # Validate up front so a bad code fails before any network traffic.
  if statement not in statement_specs:
    raise ValueError("Please use 'BS' for balance sheet or 'PL' for income statement or 'RP' for achievement report")

  url_template, kept_columns = statement_specs[statement]

  page_frames = []

  for page_number in page_numbers:

    print("scrapping data in page {}".format(page_number))

    url = url_template.format(str(page_number), str(year))

    # The context manager closes the connection even if reading fails; the
    # timeout keeps a stalled server from hanging the notebook forever.
    with urllib.request.urlopen(url, timeout=30) as response:
      raw_text = response.read().decode("utf-8", "ignore")

    # The body is expected to look like `var <name>={...};` - keep only the
    # JSON object between the first "=" and the trailing semicolon.
    payload = raw_text[raw_text.index("=") + 1:].strip().rstrip(";")
    parsed = json.loads(payload)

    # A missing/null result means we ran past the last page.
    if parsed.get("result") is None:
      break

    data_df_page = pd.DataFrame(parsed["result"]["data"])[kept_columns]

    print("sample page {} data\n{}".format(page_number, data_df_page.head()))

    page_frames.append(data_df_page)

  # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
  if page_frames:
    data_df_year = pd.concat(page_frames, ignore_index=True)
  else:
    data_df_year = pd.DataFrame()

  print("sample year {} data\n{}".format(year, data_df_year.head()))

  return data_df_year

In [None]:
def scrapping_eastmoney_kcb_multiple_years(statement, years, page_numbers):
  """Scrape one statement type for several report years and stack the results.

  Args:
    statement: statement code forwarded to
      ``scrapping_eastmoney_kcb_single_year`` ("BS", "PL" or "RP").
    years: iterable of report years.
    page_numbers: page indices requested for every year.

  Returns:
    A DataFrame with the rows of every year that returned data, plus a ``year``
    column; an empty DataFrame when ``years`` is empty or nothing was found.
  """
  # Materialise once so generators work and the first/last year can be reported.
  years = list(years)

  year_frames = []

  for year in years:

    print("scrapping year {} data".format(year))

    data_df_year = scrapping_eastmoney_kcb_single_year(statement, year, page_numbers)

    # Years without data come back as column-less frames; leaving them out keeps
    # them from influencing the dtypes of the concatenated result.
    if data_df_year.empty:
      continue

    # assign() returns a new frame instead of mutating the scraper's result.
    year_frames.append(data_df_year.assign(year=year))

  # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
  if year_frames:
    data_df_statement = pd.concat(year_frames, ignore_index=True)
  else:
    data_df_statement = pd.DataFrame()

  if years:
    print("sample {}-{} data\n{}".format(years[0], years[-1], data_df_statement.head()))

  return data_df_statement

In [None]:
def scrapping_eastmoney_kcb_multiple_statements(years, page_numbers):
  """Scrape all three statement types and join them into one table.

  The balance sheet, income statement and achievement report are fetched in that
  order for the given years/pages, inner-joined on ``SECURITY_CODE`` and
  ``year``, and returned sorted by year (newest first) then security code.
  """
  join_keys = ["SECURITY_CODE", "year"]

  # (statement code, progress message) in the order they are scraped.
  scrape_plan = (
    ("BS", "scrapping balance sheet data"),
    ("PL", "scrapping income statement data"),
    ("RP", "scrapping achievement report data"),
  )

  statement_frames = []
  for statement_code, progress_message in scrape_plan:
    print(progress_message)
    statement_frames.append(
      scrapping_eastmoney_kcb_multiple_years(statement_code, years, page_numbers)
    )

  # Fold the frames together left to right: (BS + PL) + RP.
  combined = statement_frames[0]
  for next_frame in statement_frames[1:]:
    combined = pd.merge(combined, next_frame, on=join_keys)

  print("sample combined statements data\n{}".format(combined.head()))

  ordered = combined.sort_values(by=["year", "SECURITY_CODE"], ascending=[False, True])
  return ordered

In [None]:
def calculate_indicators(df_data, output_path="/content/sample_data/indicators.xlsx"):
  """Derive per-year turnover indicators and optionally export them to Excel.

  Rows with any missing value are dropped first. For every year present, the
  current-year rows are joined with the previous year's inventory and accounts
  receivable (the "beginning" balances), so a company only appears in a year if
  it also has data for the year before.

  Args:
    df_data: combined statements with at least ``SECURITY_CODE``,
      ``SECURITY_NAME_ABBR``, ``INDUSTRY_NAME``, ``year``, ``INVENTORY``,
      ``ACCOUNTS_RECE``, ``TOTAL_OPERATE_INCOME``, ``XSMLL``,
      ``OPERATE_PROFIT_RATIO``, ``WEIGHTAVG_ROE``, ``CURRENT_RATIO`` and
      ``DEBT_ASSET_RATIO``. The input frame is not modified.
    output_path: workbook to write (one sheet per year, xlsxwriter engine).
      Pass ``None`` to skip the export.

  Returns:
    A DataFrame with the identifying columns, the four ratios taken from the
    statements, and ``accounts_rece_turnover`` / ``inventory_turnover``.
  """
  indicator_columns = ["SECURITY_CODE","SECURITY_NAME_ABBR","INDUSTRY_NAME","year","OPERATE_PROFIT_RATIO","WEIGHTAVG_ROE","CURRENT_RATIO","DEBT_ASSET_RATIO","accounts_rece_turnover","inventory_turnover"]

  df_data = df_data.dropna(axis=0, how="any")

  # XSMLL is treated as a percentage margin: cost = revenue * (1 - margin/100).
  df_data = df_data.assign(COGS=df_data["TOTAL_OPERATE_INCOME"] * (1 - df_data["XSMLL"] / 100))

  frames_by_year = []

  for year in sorted(set(df_data["year"])):

    print("calculating year {} indicators".format(year))

    # Previous year's closing balances are this year's opening balances.
    df_data_beginning = df_data.loc[
      df_data["year"] == year - 1, ["SECURITY_CODE", "INVENTORY", "ACCOUNTS_RECE"]
    ].rename(columns={"INVENTORY": "beginning_inventory", "ACCOUNTS_RECE": "beginning_accounts_rece"})

    # Join the CURRENT year's figures with those opening balances. (Selecting
    # year-1 here as well would make opening == closing and shift every sheet
    # back by one year.)
    df_data_year = pd.merge(df_data[df_data["year"] == year], df_data_beginning, on="SECURITY_CODE")

    # Turnover = flow / average balance, written as 2 * flow / (opening + closing).
    df_data_year["inventory_turnover"] = df_data_year["COGS"] * 2 / (df_data_year["beginning_inventory"] + df_data_year["INVENTORY"])
    df_data_year["accounts_rece_turnover"] = df_data_year["TOTAL_OPERATE_INCOME"] * 2 / (df_data_year["ACCOUNTS_RECE"] + df_data_year["beginning_accounts_rece"])

    frames_by_year.append((year, df_data_year[indicator_columns]))

  if output_path is not None:
    # The context manager saves and closes the workbook even on error;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path, engine="xlsxwriter") as indicator_writer:
      for year, df_data_year in frames_by_year:
        df_data_year.to_excel(indicator_writer, sheet_name=str(year), index=False)

  # Years without a predecessor yield empty frames; leave them out of the result.
  populated_frames = [frame for _, frame in frames_by_year if not frame.empty]
  if not populated_frames:
    return pd.DataFrame(columns=indicator_columns)

  # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
  return pd.concat(populated_frames, ignore_index=True)




In [None]:
def normalization(df_data, output_path="/content/sample_data/indicators_normalized.xlsx"):
  """Min-max normalise the six indicators within each year.

  Five indicators are scaled as ``(x - min + eps) / (max - min)``;
  ``DEBT_ASSET_RATIO`` is reversed to ``(max - x + eps) / (max - min)`` so that
  a lower ratio scores higher. ``eps = exp(-10)`` keeps every value strictly
  positive.

  Note: when all values of an indicator are equal within a year (for example a
  year with a single company) ``max - min`` is zero and the result is ``inf``.

  Args:
    df_data: indicator table with ``SECURITY_CODE``, ``year`` and the six
      indicator columns. The input frame is not modified.
    output_path: workbook to write (one sheet per year, xlsxwriter engine).
      Pass ``None`` to skip the export.

  Returns:
    ``df_data`` joined on ``SECURITY_CODE``/``year`` with one ``*_normalized``
    column per indicator.
  """
  # (indicator, higher_is_better); False reverses the scale.
  indicator_directions = (
    ("OPERATE_PROFIT_RATIO", True),
    ("WEIGHTAVG_ROE", True),
    ("CURRENT_RATIO", True),
    ("DEBT_ASSET_RATIO", False),
    ("accounts_rece_turnover", True),
    ("inventory_turnover", True),
  )
  epsilon = np.exp(-10)
  normalized_columns = ["SECURITY_CODE", "year"] + [name + "_normalized" for name, _ in indicator_directions]

  frames_by_year = []

  for year in sorted(set(df_data["year"])):
    # Work on a copy so the new columns never land on a view of the input.
    df_data_year = df_data[df_data["year"] == year].copy()

    for name, higher_is_better in indicator_directions:
      values = df_data_year[name]
      # Series.min()/max() ignore NaN, unlike the order-dependent builtins.
      lowest, highest = values.min(), values.max()
      distance = values - lowest if higher_is_better else highest - values
      df_data_year[name + "_normalized"] = (distance + epsilon) / (highest - lowest)

    frames_by_year.append((year, df_data_year[normalized_columns]))

  if output_path is not None:
    # The context manager saves and closes the workbook even on error;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path, engine="xlsxwriter") as indicator_normalized_writer:
      for year, df_data_year in frames_by_year:
        df_data_year.to_excel(indicator_normalized_writer, sheet_name=str(year), index=False)

  # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
  if frames_by_year:
    df_data_normalized_complete = pd.concat([frame for _, frame in frames_by_year], ignore_index=True)
  else:
    df_data_normalized_complete = pd.DataFrame(columns=normalized_columns)

  return pd.merge(df_data, df_data_normalized_complete, on=["SECURITY_CODE", "year"])

In [None]:
def calculate_weights(df_data, output_path="/content/sample_data/indicators_weight.xlsx"):
  """Turn each normalised indicator into a within-year proportion.

  For every year and indicator, ``<indicator>_weight`` is the company's
  normalised value divided by the sum of that indicator over all companies in
  the same year, so each ``*_weight`` column sums to 1 per year.

  Args:
    df_data: table with ``SECURITY_CODE``, ``year`` and the six
      ``*_normalized`` columns. The input frame is not modified.
    output_path: workbook to write (one sheet per year, xlsxwriter engine).
      Pass ``None`` to skip the export.

  Returns:
    ``df_data`` joined on ``SECURITY_CODE``/``year`` with the six ``*_weight``
    columns.
  """
  indicator_names = (
    "OPERATE_PROFIT_RATIO",
    "WEIGHTAVG_ROE",
    "CURRENT_RATIO",
    "DEBT_ASSET_RATIO",
    "accounts_rece_turnover",
    "inventory_turnover",
  )
  weight_columns = ["SECURITY_CODE", "year"] + [name + "_weight" for name in indicator_names]

  frames_by_year = []

  for year in sorted(set(df_data["year"])):
    # Work on a copy so the new columns never land on a view of the input.
    df_data_year = df_data[df_data["year"] == year].copy()

    for name in indicator_names:
      normalized = df_data_year[name + "_normalized"]
      df_data_year[name + "_weight"] = normalized / normalized.sum()

    frames_by_year.append((year, df_data_year[weight_columns]))

  if output_path is not None:
    # The context manager saves and closes the workbook even on error;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path, engine="xlsxwriter") as indicator_weights_writer:
      for year, df_data_year in frames_by_year:
        df_data_year.to_excel(indicator_weights_writer, sheet_name=str(year), index=False)

  # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
  if frames_by_year:
    df_data_weights_complete = pd.concat([frame for _, frame in frames_by_year], ignore_index=True)
  else:
    df_data_weights_complete = pd.DataFrame(columns=weight_columns)

  return pd.merge(df_data, df_data_weights_complete, on=["SECURITY_CODE", "year"])

In [None]:
def calculate_shang(df_data, output_path="/content/sample_data/shang.xlsx"):
  """Score every company per year with the entropy ("shang") weight method.

  For each year with ``n`` companies and each indicator ``j``:

  * ``p_ij`` is the company's proportion (the ``<indicator>_weight`` column);
  * redundancy ``d_j = 1 + (1 / ln n) * sum_i p_ij * ln p_ij``
    (one minus the normalised entropy of the proportions);
  * indicator weight ``w_j = d_j / sum_j d_j``;
  * score ``shang_i = sum_j w_j * p_ij``.

  The entropy is taken over the proportions, which sum to 1 per year, rather
  than over the min-max normalised values. A year with fewer than two companies
  has no defined entropy (``ln 1 == 0``) and gets a NaN score.

  Args:
    df_data: table with ``SECURITY_CODE``, ``SECURITY_NAME_ABBR``,
      ``INDUSTRY_NAME``, ``year`` and the six ``*_weight`` columns. The input
      frame is not modified.
    output_path: workbook to write (single sheet, xlsxwriter engine).
      Pass ``None`` to skip the export.

  Returns:
    A DataFrame with the identifying columns and ``shang``, sorted by
    ``SECURITY_CODE`` then ``year``.
  """
  indicator_names = (
    "OPERATE_PROFIT_RATIO",
    "WEIGHTAVG_ROE",
    "CURRENT_RATIO",
    "DEBT_ASSET_RATIO",
    "accounts_rece_turnover",
    "inventory_turnover",
  )
  result_columns = ["SECURITY_CODE","SECURITY_NAME_ABBR","INDUSTRY_NAME","year","shang"]

  scored_frames = []

  for year in sorted(set(df_data["year"])):
    # Work on a copy so the score column never lands on a view of the input.
    df_data_year = df_data[df_data["year"] == year].copy()
    n = len(df_data_year.index)

    if n < 2:
      # 1 / ln(1) is undefined, so no entropy weights exist for this year.
      df_data_year["shang"] = np.nan
    else:
      # Entropy redundancy d_j of each indicator's proportions.
      redundancy = {}
      for name in indicator_names:
        proportions = df_data_year[name + "_weight"]
        redundancy[name] = 1 + (proportions * np.log(proportions)).sum() / np.log(n)

      total_redundancy = sum(redundancy.values())

      # Weighted sum of proportions, with w_j = d_j / total.
      df_data_year["shang"] = sum(
        redundancy[name] / total_redundancy * df_data_year[name + "_weight"]
        for name in indicator_names
      )

    scored_frames.append(df_data_year[result_columns])

  # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
  if scored_frames:
    df_data_shang_complete = pd.concat(scored_frames, ignore_index=True)
  else:
    df_data_shang_complete = pd.DataFrame(columns=result_columns)

  df_data_shang_complete = df_data_shang_complete.sort_values(by=["SECURITY_CODE", "year"])

  if output_path is not None:
    # The context manager saves and closes the workbook even on error;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path, engine="xlsxwriter") as shang_writer:
      df_data_shang_complete.to_excel(shang_writer, index=False)

  return df_data_shang_complete

In [None]:
def main():
  """Run the whole pipeline: scrape, derive indicators, normalise, weight, score.

  Report years 2010-2019 and result pages 1-4 are requested; each stage feeds
  its output to the next and the final stage writes the score workbook.
  """
  report_years = range(2010, 2020)
  result_pages = range(1, 5)

  statements = scrapping_eastmoney_kcb_multiple_statements(report_years, result_pages)
  indicators = calculate_indicators(statements)
  normalized = normalization(indicators)
  weighted = calculate_weights(normalized)
  calculate_shang(weighted)

In [None]:
# Run the pipeline defined in main(); this performs the network requests and
# writes the Excel workbooks produced by the individual stages.
main()