# Refinitiv Raw Data Processing Script

In [1]:
import pandas as pd
import numpy as np
import os

### Constants

In [2]:
YEARS = (2020, 2021)

RAW_DATA_DIR = '../02_RAW/TimeSeries_EU17_20-21/00 Raw/'
#RAW_DATA_DIR = '../02_RAW/TimeSeries_EU17_20-21/02 ScriptTest/'
OUTPUT_DIR = (f'../99_Final Dataset/{YEARS[0]}/', f'../99_Final Dataset/{YEARS[1]}/')

NA_VALUES = ['$$ER: 4540,NO DATA VALUES FOUND', '$$ER: E100,NO WORLDSCOPE DATA FOR THIS CODE', 
             '$$ER: 0904,NO DATA AVAILABLE', '$$ER: 0554,INPUT QUOTE IS DEAD QUOTE',
             '$$ER: E100,INVALID CODE OR EXPRESSION ENTERED', '$$ER: 0555,NO VALID QUOTES WERE FOUND',
             '$$ER: 0711,NO INPUT PARAMETERS SUPPLIED', '$$ER: 9898,NO DATA AVAILABLE FROM AIBESFO',
             '$$ER: 1929,NO DATA FOUND', '$$ER: 1909,NO DATA FOUND', '$$ER: 2361,NO DATA AVAILABLE',
             '$$ER: 2311,NO DATA AVAILABLE', '.', '$$ER: E100,ACCESS DENIED ',
             '$$ER: 0595,APRIMQTE - IDMS ERROR1601X FOR RCSECUR', '$$ER: 2365,INCOMPLETE HISTORY',
             '$$ER: 2308,NO DATA TO RETURN', '$$ER: 0380,        RECORD NOT FOUND',
             '$$ER: 0593,APRIMQTE - IDMS ERROR0307X FOR RCSECUR', '$$ER: 0629,908380         NO DATA AVAILABLE',
             '$$ER: 0629,197025         NO DATA AVAILABLE', '$$ER: 0629,957454         NO DATA AVAILABLE',
             '$$ER: 0629,845794         NO DATA AVAILABLE', '$$ER: 0629,902182         NO DATA AVAILABLE',
             '$$ER: 9098,NO DATA AVAILABLE',
             'DSABEND ********************************************************************************DATASTREAM ONLINE SERVICE********************************************************************************Your current enquiry has failed and the system has invoked necessary actionto terminate it. This may be due to data errors, application software faults,system problems or communications problems.Please contact your Datastream Customer Services Helpline on the followingtelephone numbers :-HELPLINE  FRANKFURT     069  71 40 5333HELPLINE  FRANKFURT  (49 69) 71 40 5333Please quote the following details :-Error code       :  S0C1Task  code       :  900E                Program name     :  DS4900Userid           :  XPUV001             Terminal number  :  52576Reference number :  P2/0289             System Jobname   :  DLIVE3P2           Ã°', 'DSABEND ********************************************************************************DATASTREAM ONLINE SERVICE********************************************************************************Your current enquiry has failed and the system has invoked necessary actionto terminate it. This may be due to data errors, application software faults,system problems or communications problems.Please contact your Datastream Customer Services Helpline on the followingtelephone numbers :-HELPLINE  FRANKFURT     069  71 40 5333HELPLINE  FRANKFURT  (49 69) 71 40 5333Please quote the following details :-Error code       :  S0C1Task  code       :  900E                Program name     :  DS4900Userid           :  XPUV001             Terminal number  :  52576Reference number :  P2/0289             System Jobname   :  DLIVE3P2           ð']

CHUNK_SIZE = len(YEARS) + 2         

In [3]:
WS_CODES = [
        "DATATYPE",
        "WC07531",
        "WC07536",
        "X( WC03040)~E",
        "WC08131",
        "X( WC18224)~E",
        "X( WC01150)~E",
        "X( WC04050)~E",
        "X( WC01149)~E",
        "X( WC05491)~E",
        "X( WC05476)~E",
        "X( WC04601)~E",
        "X( WC02003)~E",
        "X( WC02005)~E",
        "X( WC02001)~E",
        "X( WC04551)~E",
        "X( WC05501)~E",
        "X( WC05502)~E",
        "WC08311",
        "WC08021",
        "X( WC05376)~E",
        "X( WC03501)~E",
        "WC05301",
        "WC05192",
        "WC05191",
        "WC05194",
        "X( WC03480)~E",
        "X( WC04751)~E",
        "X( WC01051)~E",
        "WC06099",
        "X( WC02201)~E",
        "X( WC03101)~E",
        "WC08106",
        "WC07034",
        "WC05350",
        "X( WC03263)~E",
        "X( WC04049)~E",
        "X( WC01151)~E",
        "WC09504",
        "X( WC05101)~E",
        "X( WC05110)~E",
        "X( WC18192)~E",
        "X( WC05240)~E",
        "X( WC18191)~E",
        "X( WC18198)~E",
        "X( WC05201)~E",
        "X( WC18193)~E",
        "X( WC10010)~E",
        "X( WC05210)~E",
        "X( WC05202)~E",
        "X( WC10030)~E",
        "X( WC18209)~E",
        "WC07011",
        "X( WC18100)~E",
        "X( WC01601)~E",
        "X( WC01254)~E",
        "X( WC01253)~E",
        "X( WC05290)~E",
        "X( WC04201)~E",
        "X( WC01100)~E",
        "X( WC18225)~E",
        "X( WC18226)~E",
        "X( WC01451)~E",
        "X( WC04851)~E",
        "WC11556",
        "X( WC01255)~E",
        "X( WC01251)~E",
        "WC08126",
        "X( WC02101)~E",
        "WC05351",
        "X( WC03251)~E",
        "X( WC08001)~E",
        "X( WC03426)~E",
        "X( WC01501)~E",
        "X( WC04890)~E",
        "X( WC04870)~E",
        "X( WC04860)~E",
        "X( WC18199)~E",
        "X( WC01706)~E",
        "X( WC01751)~E",
        "X( WC01551)~E",
        "X( WC01651)~E",
        "X( WC01705)~E",
        "WC08366",
        "X( WC04251)~E",
        "X( WC01001)~E",
        "X( WC01266)~E",
        "X( WC01249)~E",
        "X( WC01250)~E",
        "WC08316",
        "X( WC02250)~E",
        "X( WC05309)~E",
        "X( WC01701)~E",
        "X( WC03451)~E",
        "X( WC01401)~E",
        "WC08321",
        "X( WC02501)~E",
        "WC08101",
        "X( WC02051)~E",
        "X( WC01201)~E",
        "X( WC03495)~E",
        "WC08301",
        "WC08376",
        "X( WC01084)~E",
        "X( WC05508)~E",
        "X( WC01101)~E",
        "X( WC03051)~E",
        "WC05576",
        "WC05575",
        "X( WC02999)~E",
        "X( WC03998)~E",
        "X( WC03255)~E",
        "WC08231",
        "WC08221",
        "X( WC02649)~E",
        "X( WC03351)~E",
        "X( WC03999)~E",
        "X( WC03995)~E",
        "X( WC03151)~E",
        "WC07015",
        "X(MV)~E",
        "X(MVC)~E",
        "WC07210",
        "WC07240",
        "WC07230",
        "WC08416",
        "WC05475",
        "X(UDDE)~E",
        "DY",
        "DT",
        "DTAX",
        "X(UDD)~E",
        "X(DPS)~E",
        "XDD",
        "PYD",
        "354E",
        "WC08246",
        "WC08250",
        "WC08735",
        "WC08731",
        "X(WC07101)~E",
        "X(WC07151)~E",
        "X(WC07126)~E",
        "WC08736",
        "WC08741",
        "WC04355",
        "X(WC04355)~E"
    ]

STATA_VARS = [
        "year",
        "ACCOUNTINGMETHODFORLONGTER",
        "ACCOUNTINGSTANDARDSFOLLOWED",
        "ACCOUNTSPAYABLE",
        "ACCOUNTSRECEIVABLESDAYS",
        "AMORTIMPAIROFGOODWILL",
        "AMORTIZATIONOFDEFERREDCHARG",
        "AMORTIZATIONINTANGIBLEASSETS",
        "AMORTIZATIONOFINTANGIBLES",
        "BOOKVALUEOUTSHARESFISCAL",
        "BOOKVALUEPERSHARE",
        "CAPITALEXPENDITURES",
        "CASH",
        "CASHGENERIC",
        "CASHSHORTTERMINVESTMENTS",
        "CASHDIVIDENDSPAIDTOTAL",
        "CASHFLOWPERSHARE",
        "CASHFLOWPERSHAREFISYR",
        "CASHFLOWSALES",
        "CLOSELYHELDSHARES",
        "COMMONDIVIDENDSCASH",
        "COMMONSHAREHOLDERSEQUITY",
        "COMMONSHARESOUTSTANDING",
        "SHARESUSEDFORBASICEPS",
        "COMMONSHARESTOCALCEPS",
        "AVGFULLYDILUTEDSHARESOUTS",
        "COMMONSTOCK",
        "COMPFDPURCHASEDRETIREDCO",
        "COSTOFGOODSSOLDEXCLDEP",
        "CURRENCYOFDOCUMENTISO",
        "CURRENTASSETSTOTAL",
        "CURRENTLIABILITIESTOTAL",
        "CURRENTRATIO",
        "DATAUPDATEINDICATOR",
        "FISCALPERIODENDDATE",
        "DEFERREDTAXES",
        "DEPRECIATIONANDDEPLETION",
        "DEPRECIATIONDEPLETIONAMORT",
        "DIVIDENDPAYOUTPERSHARE",
        "DIVIDENDSPERSHARE",
        "DIVIDENDSPERSHAREFISCAL",
        "DIVIDENDSPROVIDEDPAIDCOMMON",
        "DVFAEARNINGSPERSHARE",
        "EARNINGSBEFINTERESTTAXES",
        "EBITDEPRECIATION",
        "EARNINGSPERSHARE",
        "EARNINGSPERSHAREASREPORTED",
        "FISCALEPSBASICYRE",
        "EPSBASICYEAR",
        "EARNINGSPERSHAREFISCALEND",
        "FISCALEPSFULLYDILUTEDYR",
        "EPSINCLUDINGEXTRAITEMS",
        "EMPLOYEES",
        "ENTERPRISEVALUE",
        "EXTRAITEMSGAINLOSSSALEO",
        "EXTRAORDINARYCHARGEPRETAX",
        "EXTRAORDINARYCREDITPRETAX",
        "EPSFULLYDILUTEDSHARESYR",
        "FUNDSFROMOPERATIONS",
        "GROSSINCOME",
        "IMPAIRMENTOFGOODWILL",
        "IMPAIRMENTOTHERINTANGIBLES",
        "INCOMETAXES",
        "INCREASEDECREASEINCASHSHOR",
        "INDRESTATEDDATAEXISTS",
        "INTERESTCAPITALIZED",
        "INTERESTEXPENSEONDEBT",
        "INVENTORIESDAYSHELD",
        "TOTALINVENTORIES",
        "LENGTHOFFISCALYEAR",
        "LONGTERMDEBT",
        "MARKETCAPITALIZATION",
        "MINORITYINTERESTBALANCESHEET",
        "MINORITYINTERESTINCOMESTATEME",
        "NETCASHFLOWFINANCING",
        "NETCASHFLOWINVESTING",
        "NETCASHFLOWOPERATINGACTIVS",
        "NETDEBT",
        "NETINCOMEBASIC",
        "NETINCOMEAVAILABLETOCOMMON",
        "NETINCBEFOREEXTRAPFDDIVS",
        "NETINCOMEBEFOREPREFERREDDI",
        "NETINCOMEDILUTED",
        "NETMARGIN",
        "NETPROCEEDSFROMSALEISSUEC",
        "NETSALESORREVENUES",
        "NONOPERATINGINTERESTINCOME",
        "OPERATINGEXPENSESTOTAL",
        "OPERATINGINCOME",
        "OPERATINGPROFITMARGIN",
        "OTHERINVESTMENTS",
        "PARVALUE",
        "PREFERREDDIVIDENDREQUIREMENT",
        "PREFERREDSTOCK",
        "PRETAXINCOME",
        "PRETAXMARGIN",
        "PROPERTYPLANTEQUIPNET",
        "QUICKRATIO",
        "RECEIVABLESNET",
        "RESEARCHDEVELOPMENT",
        "RETAINEDEARNINGS",
        "RETURNONEQUITYTOTAL",
        "RETURNONINVESTEDCAPITAL",
        "SALARIESBENEFITSEXPENSES",
        "SALESPERSHARE",
        "SELLINGGENERALADMINISTRAT",
        "SHORTTERMDEBTCURRENTPORT",
        "STOCKSPLITDIVIDENDRATIO",
        "STOCKSPLITDIVRATIOFISCAL",
        "TOTALASSETS",
        "TOTALCAPITAL",
        "TOTALDEBT",
        "TOTALDEBTCOMMONEQUITY",
        "TOTALDEBTTOTALCAPITALSTD",
        "TOTALINTANGIBLEOTASSETSNET",
        "TOTALLIABILITIES",
        "TOTALLIABILITIESSHAREHOLDE",
        "TOTALSHAREHOLDERSEQUITY",
        "WORKINGCAPITAL",
        "INACTIVEDATE",
        "MARKETVALUE",
        "MARKETVALUEBYCOMPANY",
        "MARKETCAPITALIZATIONUS",
        "NETSALESORREVENUESUS",
        "TOTALASSETSUS",
        "CAPITALEXPENDTTOTALASSETS",
        "CLOSELYHELDSHARES_ABSOLUT",
        "DIVRATEUNADJUSTEDExDate",
        "DIVIDENDYIELD",
        "DIVIDENDTYPE",
        "DIVIDENDTAXINDICATOR",
        "DIVRATEUNADJUSTED",
        "DIVPERSHR",
        "EXDIVIDDATE",
        "DIVPAYDATE",
        "MFWDDIVIDENDYIELD",
        "CASHDIVIDENDCOVERAGERATIO",
        "CASHDIVCOVERRATIO5YR",
        "FOREIGNSALESTOTSALES5YR",
        "FOREIGNSALESTOTALSALES",
        "INTERNATIONALSALES",
        "INTERNATIONALASSETS",
        "INTERNATIONALOPERATINGINCOME",
        "FOREIGNASSETSTOTALASSETS",
        "FOREIGNINCOMETOTALINCOME",
        "WC04355",
        "X(WC04355)__E"
    ]

STATA_LABELS = [
        "Year",
        "Accounting method for investment greater than 50% (WC07531)",
        "Accounting standards followed (WC07536)",
        "Accounts Payable (WC03040E)",
        "Accounts receivable ? days (WC08131)",
        "Amortization & Impairment Of Goodwill (WC18224E)",
        "Amortization Of Deferred Charges (WC01150E)",
        "Amortization Of Intangible Assets (WC04050E)",
        "Amortization Of Intangibles (WC01149E)",
        "Book Value Outstanding Shares Fiscal (WC05491E)",
        "Book Value Per Share (WC05476E)",
        "Capital Expenditures (Additions To Fixed Assets) (WC04601E)",
        "Cash (WC02003E)",
        "Cash & Equivalents Generic (WC02005E)",
        "Cash & Short Term Investments (WC02001E)",
        "Cash Dividends Paid Total (WC04551E)",
        "Cash Flow Per Share (Security) (WC05501E)",
        "Cash Flow Per Share Fiscal (WC05502E)",
        "Cash Flow/Sales (WC08311)",
        "Closely-Held Shares (%) (WC08021)",
        "Common Dividends (Cash) (WC05376E)",
        "Common Equity (WC03501E)",
        "Common Shares Outstanding (WC05301)",
        "Common Shares Used To Calculate Basic EPS (WC05192)",
        "Common Shares Used To Calculate EPS (WC05191)",
        "Common Shares Used To Calculate Fully Diluted EPS (WC05194)",
        "Common Stock (WC03480E)",
        "Common/Preferred Purchased",
        "Cost Of Goods Sold (Excl Depreciation)  (WC01051E)",
        "Currency Of Document (WC06099)",
        "Current Assets Total (WC02201E)",
        "Current Liabilities Total (WC03101E)",
        "Current Ratio (WC08106)",
        "Data Update Indicator (WC07034)",
        "Date Of Fiscal Year End (WC05350)",
        "Deferred Taxes (WC03263E)",
        "Depreciation And Depletion (WC04049E)",
        "Depreciation",
        "Dividend Payout Per Share (WC09504)",
        "Dividends Per Share (WC05101E)",
        "Dividends Per Share Fiscal (WC05110E)",
        "Dividends Provided For Or Paid Common (WC18192E)",
        "DVFA Earnings Per Share (WC05240E)",
        "Earnings Before Interest And Taxes (EBIT) (WC18191E)",
        "Earnings Before Interest",
        "Earnings Per Share (WC05201E)",
        "Earnings Per Share As Reported (WC18193E)",
        "Earnings Per Share Basic Fiscal (WC10010E)",
        "Earnings Per Share Basic Year (WC05210E)",
        "Earnings Per Share Fiscal Year End (WC05202E)",
        "Earnings Per Share Fully Diluted Fiscal (WC10030E)",
        "Earnings Per Share Including Extraordinary Items Fiscal (WC18209E)",
        "Employees (WC07011)",
        "Enterprise Value (WC18100E)",
        "Extra Items & Gain/Loss Sale Of Assets (WC01601E)",
        "Extraordinary Charge Pretax (WC01254E)",
        "Extraordinary Credit Pretax (WC01253E)",
        "Fully Diluted Earnings Per Share Year (WC05290E)",
        "Funds From Operations (WC04201E)",
        "Gross Income (WC01100E)",
        "Impairment Of Goodwill (WC18225E)",
        "Impairment Of Other Intangibles (WC18226E)",
        "Income Taxes (WC01451E)",
        "Increase/Decrease In Cash & Short Term Investments (WC04851E)",
        "Indicator Restated Data Exists (WC11556)",
        "Interest Capitalized (WC01255E)",
        "Interest Expense On Debt (WC01251E)",
        "Inventories Days Held (WC08126)",
        "Inventories Total (WC02101E)",
        "Length Of Fiscal Year (WC05351)",
        "Long Term Debt (WC03251E)",
        "Market Capitalization (WC08001E)",
        "Minority Interest Balance Sheet (WC03426E)",
        "Minority Interest Income Statement (WC01501E)",
        "Net Cash Flow Financing (WC04890E)",
        "Net Cash Flow Investing (WC04870E)",
        "Net Cash Flow Operating Activities (WC04860E)",
        "Net Debt (WC18199E)",
        "Net Income After Preferred Dividends (Basic EPS) (WC01706E)",
        "Net Income Available To Common (WC01751E)",
        "Net Income Before Extra Items/Preferred Dividends (WC01551E)",
        "Net Income Before Preferred Dividends (WC01651E)",
        "Net Income Used To Calculate Fully Diluted EPS (WC01705E)",
        "Net Margin (WC08366)",
        "Net Proceeds From Sale/Issue Of Common & Preferred (WC04251E)",
        "Net Sales Or Revenues (WC01001E)",
        "Non-Operating Interest Income (WC01266E)",
        "Operating Expenses Total (WC01249E)",
        "Operating Income (WC01250E)",
        "Operating Profit Margin (WC08316)",
        "Other Investments (WC02250E)",
        "Par Value (WC05309E)",
        "Preferred Dividend Requirements (WC01701E)",
        "Preferred Stock (WC03451E)",
        "Pretax Income (WC01401E)",
        "Pretax Margin (WC08321)",
        "Property",
        "Quick Ratio (WC08101)",
        "Receivables (Net) (WC02051E)",
        "Research & Development (WC01201E)",
        "Retained Earnings (WC03495E)",
        "Return On Equity Total % (WC08301)",
        "Return On Invested Capital (WC08376)",
        "Salaries And Benefits Expenses (WC01084E)",
        "Sales Per Share (WC05508E)",
        "Selling",
        "Short Term Debt & Current Portion Of Long Term Debt (WC03051E)",
        "Stock Split/Dividend Ratio (WC05576)",
        "Stock Split/Dividend Ratio Fiscal (WC05575)",
        "Total Assets (WC02999E)",
        "Total Capital (WC03998E)",
        "Total Debt (WC03255E)",
        "Total Debt % Common Equity (WC08231)",
        "Total Debt % Total Capital (WC08221)",
        "Total Intangible Other Assets Net (WC02649E)",
        "Total Liabilities (WC03351E)",
        "Total Liabilities & Shareholders' Equity (WC03999E)",
        "Total Shareholders Equity (WC03995E)",
        "Working Capital (WC03151E)",
        "Inactive Date (Security) (WC07015)",
        "Market Value (Capital) (MVE)",
        "Market Value For Company (MVCE)",
        "Market Capitalization (U.S.$) (WC07210)",
        "Net Sales Or Revenues (U.S.$) (WC07240)",
        "Total Assets (U.S.$) (WC07230)",
        "Capital Expenditure % Total Assets (WC08416)",
        "Closely Held Shares (WC05475)",
        "Dividend :  Unadjusted Rate (Ex Date) (UDDEE)",
        "Dividend Yield (DY)",
        "Dividend Type (DT)",
        "Dividend Tax Marker (DTAX)",
        "Dividend Rate - Unadjusted (UDDE)",
        "Dividend Per Share (DPSE)",
        "Date - Ex Dividend (XDD)",
        "Date - Dividend Payment (PYD)",
        "12M Forward Dividend Yield  (354E)",
        "Cash Dividend Coverage Ratio (WC08246)",
        "Cash Dividend Coverage Ratio 5 Year Average (WC08250)",
        "Foreign Sales % Total Sales 5 Year Average (WC08735)",
        "Foreign Sales % Total Sales  (WC08731)",
        "International Sales (WC07101E)",
        "International Assets (WC07151E)",
        "International Operating Income (WC07126E)",
        "Foreign Assets % Total Assets (WC08736)",
        "Foreign Income % Total Income (WC08741)",
        "Net Assets from Aquisitions USD (WC04355)",
        "Net Assets from Aquisitions (WC04355)" 
    ]

NON_FLOAT_VARS = ['year', 'ACCOUNTINGMETHODFORLONGTER', 'ACCOUNTINGSTANDARDSFOLLOWED',
                  'CURRENCYOFDOCUMENTISO', 'DATAUPDATEINDICATOR', 'FISCALPERIODENDDATE',
                  'INDRESTATEDDATAEXISTS', 'INACTIVEDATE', 'DIVIDENDTYPE', 'DIVIDENDTAXINDICATOR',
                  'EXDIVIDDATE', 'DIVPAYDATE']

VARIABLE_DICT = {}
LABEL_DICT = {}

for i in range(len(WS_CODES)):
    VARIABLE_DICT[WS_CODES[i]] = STATA_VARS[i]
    LABEL_DICT[STATA_VARS[i]] = STATA_LABELS[i]

### Script Start

In [4]:
files = os.listdir(RAW_DATA_DIR)

print(files)

chunks = []

for f in files:
    if f == 'DFOIndex.xlsm' or f == 'DFOIndexReversed.xlsm' or f == 'temp.csv' or f == '~$DFOIndexReversed.xlsm':
        pass

    else:
        print(f)
        df = pd.read_excel(f'{RAW_DATA_DIR}{f}', sheet_name = 'Tabelle1', na_values = NA_VALUES, header = None)
        df.to_csv(f'{RAW_DATA_DIR}temp.csv', header = None, index = False)
            
        for chunk in pd.read_csv(f'{RAW_DATA_DIR}temp.csv', header = None, chunksize = CHUNK_SIZE):
            dscd = chunk.iloc[0, 1]
            chunk.columns = chunk.iloc[1]
            chunk = chunk[2:]
            chunk['DSCD'] = dscd
            
            ### Remove Duplicate Column Names
            chunk = chunk.T
            chunk['cols'] = chunk.index
            chunk = chunk.drop_duplicates(subset = 'cols').drop(labels = 'cols', axis = 1).T
            chunks.append(chunk)

df = pd.concat(chunks)
df = df.rename(columns = VARIABLE_DICT)
df = df.dropna(subset = ['year', 'DSCD'])
df.head()

['DFOIndex.xlsm', 'DFOIndexReversed.xlsm', 'temp.csv', 'TR_1.xlsm', 'TR_10.xlsm', 'TR_11.xlsm', 'TR_12.xlsm', 'TR_13.xlsm', 'TR_14.xlsm', 'TR_15.xlsm', 'TR_16.xlsm', 'TR_17.xlsm', 'TR_18.xlsm', 'TR_19.xlsm', 'TR_2.xlsm', 'TR_20.xlsm', 'TR_21.xlsm', 'TR_22.xlsm', 'TR_23.xlsm', 'TR_24.xlsm', 'TR_25.xlsm', 'TR_26.xlsm', 'TR_27.xlsm', 'TR_28.xlsm', 'TR_29.xlsm', 'TR_3.xlsm', 'TR_30.xlsm', 'TR_31.xlsm', 'TR_32.xlsm', 'TR_33.xlsm', 'TR_34.xlsm', 'TR_35.xlsm', 'TR_36.xlsm', 'TR_37.xlsm', 'TR_38.xlsm', 'TR_39.xlsm', 'TR_4.xlsm', 'TR_5.xlsm', 'TR_6.xlsm', 'TR_7.xlsm', 'TR_8.xlsm', 'TR_9.xlsm']
TR_1.xlsm
TR_10.xlsm
TR_11.xlsm
TR_12.xlsm
TR_13.xlsm
TR_14.xlsm
TR_15.xlsm
TR_16.xlsm
TR_17.xlsm
TR_18.xlsm
TR_19.xlsm
TR_2.xlsm
TR_20.xlsm
TR_21.xlsm
TR_22.xlsm
TR_23.xlsm
TR_24.xlsm
TR_25.xlsm
TR_26.xlsm
TR_27.xlsm
TR_28.xlsm
TR_29.xlsm
TR_3.xlsm
TR_30.xlsm
TR_31.xlsm
TR_32.xlsm
TR_33.xlsm
TR_34.xlsm
TR_35.xlsm
TR_36.xlsm
TR_37.xlsm
TR_38.xlsm
TR_39.xlsm
TR_4.xlsm
TR_5.xlsm
TR_6.xlsm
TR_7.xlsm
TR_8.xls

Unnamed: 0,year,ACCOUNTINGMETHODFORLONGTER,ACCOUNTINGSTANDARDSFOLLOWED,ACCOUNTSPAYABLE,ACCOUNTSRECEIVABLESDAYS,AMORTIMPAIROFGOODWILL,AMORTIZATIONOFDEFERREDCHARG,AMORTIZATIONINTANGIBLEASSETS,AMORTIZATIONOFINTANGIBLES,BOOKVALUEOUTSHARESFISCAL,...,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,2020.0,All subsidiaries are consolidated,NaN.7
2,2021,,,,,,,,,,...,,,,,,,,,,
3,2020,,,,,,,,,,...,,,,,,,,,,
6,2021,,,,,,,,,,...,,,,,,,,,,
7,2020,,,,,,,,,,...,,,,,,,,,,
10,2021,All subsidiaries are consolidated,US standards (GAAP),4327305.0,97.0,0.0,0.0,279636.0,279636.0,6.995,...,,,,,,,,,,


In [5]:
df = df.T
df['cols'] = df.index
df = df.drop_duplicates(subset = 'cols').drop(labels = 'cols', axis = 1).T
df = df.dropna(subset = ['DSCD'])
df = df.drop(labels = [2020, 'All subsidiaries are consolidated'],axis = 1)
df.head()

Unnamed: 0,year,ACCOUNTINGMETHODFORLONGTER,ACCOUNTINGSTANDARDSFOLLOWED,ACCOUNTSPAYABLE,ACCOUNTSRECEIVABLESDAYS,AMORTIMPAIROFGOODWILL,AMORTIZATIONOFDEFERREDCHARG,AMORTIZATIONINTANGIBLEASSETS,AMORTIZATIONOFINTANGIBLES,BOOKVALUEOUTSHARESFISCAL,...,FOREIGNSALESTOTALSALES,INTERNATIONALSALES,INTERNATIONALASSETS,INTERNATIONALOPERATINGINCOME,FOREIGNASSETSTOTALASSETS,FOREIGNINCOMETOTALINCOME,WC04355,X(WC04355)~E,DSCD,NaN
2,2021,,,,,,,,,,...,,,,,,,,,41436T,
3,2020,,,,,,,,,,...,,,,,,,,,41436T,
6,2021,,,,,,,,,,...,,,,,,,,,276693,
7,2020,,,,,,,,,,...,,,,,,,,,276693,
10,2021,All subsidiaries are consolidated,US standards (GAAP),4327305.0,97.0,0.0,0.0,279636.0,279636.0,6.995,...,63.62,16194287.0,1995256.0,0.0,5.81,0.0,241001.0,211926.0,51363H,


In [6]:
### Fix DataTypes
df = df.astype({'year': 'int'})

for var in STATA_VARS:
    print(var)
    if var not in NON_FLOAT_VARS:
        df = df.astype({var: 'float'})

df.head()

year
ACCOUNTINGMETHODFORLONGTER
ACCOUNTINGSTANDARDSFOLLOWED
ACCOUNTSPAYABLE
ACCOUNTSRECEIVABLESDAYS
AMORTIMPAIROFGOODWILL
AMORTIZATIONOFDEFERREDCHARG
AMORTIZATIONINTANGIBLEASSETS
AMORTIZATIONOFINTANGIBLES
BOOKVALUEOUTSHARESFISCAL
BOOKVALUEPERSHARE
CAPITALEXPENDITURES
CASH
CASHGENERIC
CASHSHORTTERMINVESTMENTS
CASHDIVIDENDSPAIDTOTAL
CASHFLOWPERSHARE
CASHFLOWPERSHAREFISYR
CASHFLOWSALES
CLOSELYHELDSHARES
COMMONDIVIDENDSCASH
COMMONSHAREHOLDERSEQUITY
COMMONSHARESOUTSTANDING
SHARESUSEDFORBASICEPS
COMMONSHARESTOCALCEPS
AVGFULLYDILUTEDSHARESOUTS
COMMONSTOCK
COMPFDPURCHASEDRETIREDCO
COSTOFGOODSSOLDEXCLDEP
CURRENCYOFDOCUMENTISO
CURRENTASSETSTOTAL
CURRENTLIABILITIESTOTAL
CURRENTRATIO
DATAUPDATEINDICATOR
FISCALPERIODENDDATE
DEFERREDTAXES
DEPRECIATIONANDDEPLETION
DEPRECIATIONDEPLETIONAMORT
DIVIDENDPAYOUTPERSHARE
DIVIDENDSPERSHARE
DIVIDENDSPERSHAREFISCAL
DIVIDENDSPROVIDEDPAIDCOMMON
DVFAEARNINGSPERSHARE
EARNINGSBEFINTERESTTAXES
EBITDEPRECIATION
EARNINGSPERSHARE
EARNINGSPERSHAREASREPORTED
FISCALEPSBASI

Unnamed: 0,year,ACCOUNTINGMETHODFORLONGTER,ACCOUNTINGSTANDARDSFOLLOWED,ACCOUNTSPAYABLE,ACCOUNTSRECEIVABLESDAYS,AMORTIMPAIROFGOODWILL,AMORTIZATIONOFDEFERREDCHARG,AMORTIZATIONINTANGIBLEASSETS,AMORTIZATIONOFINTANGIBLES,BOOKVALUEOUTSHARESFISCAL,...,FOREIGNSALESTOTALSALES,INTERNATIONALSALES,INTERNATIONALASSETS,INTERNATIONALOPERATINGINCOME,FOREIGNASSETSTOTALASSETS,FOREIGNINCOMETOTALINCOME,WC04355,X(WC04355)~E,DSCD,NaN
2,2021,,,,,,,,,,...,,,,,,,,,41436T,
3,2020,,,,,,,,,,...,,,,,,,,,41436T,
6,2021,,,,,,,,,,...,,,,,,,,,276693,
7,2020,,,,,,,,,,...,,,,,,,,,276693,
10,2021,All subsidiaries are consolidated,US standards (GAAP),4327305.0,97.0,0.0,0.0,279636.0,279636.0,6.995,...,63.62,16194287.0,1995256.0,0.0,5.81,0.0,241001.0,211926.0,51363H,


In [9]:
df = df.iloc[: , :-1]
df

Unnamed: 0,year,ACCOUNTINGMETHODFORLONGTER,ACCOUNTINGSTANDARDSFOLLOWED,ACCOUNTSPAYABLE,ACCOUNTSRECEIVABLESDAYS,AMORTIMPAIROFGOODWILL,AMORTIZATIONOFDEFERREDCHARG,AMORTIZATIONINTANGIBLEASSETS,AMORTIZATIONOFINTANGIBLES,BOOKVALUEOUTSHARESFISCAL,...,FOREIGNSALESTOTSALES5YR,FOREIGNSALESTOTALSALES,INTERNATIONALSALES,INTERNATIONALASSETS,INTERNATIONALOPERATINGINCOME,FOREIGNASSETSTOTALASSETS,FOREIGNINCOMETOTALINCOME,WC04355,X(WC04355)~E,DSCD
2,2021,,,,,,,,,,...,,,,,,,,,,41436T
3,2020,,,,,,,,,,...,,,,,,,,,,41436T
6,2021,,,,,,,,,,...,,,,,,,,,,276693
7,2020,,,,,,,,,,...,,,,,,,,,,276693
10,2021,All subsidiaries are consolidated,US standards (GAAP),4327305.0,97.0,0.0,0.0,279636.0,279636.0,6.995,...,63.65,63.62,16194287.0,1995256.0,0.0,5.81,0.0,241001.0,211926.0,51363H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7991,2020,,,,,,,,,,...,,,,,,,,,,775700
7994,2021,,,,,,,,,,...,,,,,,,,,,295909
7995,2020,,,,,,,,,,...,,,,,,,,,,295909
7998,2021,,,,,,,,,,...,,,,,,,,,,936870


In [11]:
### Export to Stat Docs
for i, year in enumerate(YEARS):
    print(i, year)
    df_output = df.loc[df['year'] == year]
    df_output.to_stata(f'{OUTPUT_DIR[i]}TR_{year}_Accounting.dta', write_index = False, variable_labels = LABEL_DICT, version = 118)

try:
    os.remove(f'{RAW_DATA_DIR}temp.csv')
    
except:
    pass

0 2020
1 2021
