# Main Preprocessor
This is the main data preprocessor. The program merges all the preprocessed datasets into a single table. It assumes that all the other data preprocessors have already been executed.

### 1. Imports

In [2]:
import pandas as pd

### 2. Main function

In [4]:
def merge_all_data(
    shareprices_filename,
    roe_filename,
    eps_filename,
    ownrshp_filename,
    companies_filename,
    industries_filename,
    sectors_perf_filename,
):
    """Combine the preprocessed CSV files into one table keyed by ticker and month.

    The share price, ROE, ownership growth and EPS growth files are inner-joined
    on ('Ticker', 'Year', 'Month'). Companies are mapped to a sector through the
    industries file and then joined twice with the sector performance file: once
    on 'Sector' (the company's own sector) and once with the rows whose 'Sector'
    is 'SP500' (used as the market figure). Row counts are printed along the way.

    Args:
        shareprices_filename: CSV path with 'MA Ratio' and 'Return on Investment'.
        roe_filename: CSV path with 'Return on Equity'.
        eps_filename: CSV path with the EPS growth columns.
        ownrshp_filename: CSV path with 'Insider_growth' and 'Institutional_growth'.
        companies_filename: ';'-separated CSV path with 'Ticker' and 'IndustryId'.
        industries_filename: ';'-separated CSV path with 'IndustryId' and 'Sector'.
        sectors_perf_filename: CSV path with 'Sector', 'Year', 'Month', 'Performance'.

    Returns:
        A DataFrame holding the selected, renamed columns, with every row that
        contains a missing value removed.
    """
    period_keys = ['Ticker', 'Year', 'Month']

    prices = pd.read_csv(shareprices_filename)
    print(f"Number of rows for shareprices data: {len(prices)}")

    roe_frame = pd.read_csv(roe_filename)
    print(f"Number of rows for ROE data: {len(roe_frame)}")
    merged = pd.merge(prices, roe_frame, on=period_keys, how='inner')

    ownership = pd.read_csv(ownrshp_filename)
    print(f"Number of rows for ownership growth data: {len(ownership)}")
    merged = pd.merge(merged, ownership, on=period_keys, how='inner')

    eps = pd.read_csv(eps_filename)
    print(f"Number of rows for EPS growth data: {len(eps)}")
    merged = pd.merge(merged, eps, on=period_keys, how='inner')

    # Company -> industry -> sector, then one row per company and period with
    # that sector's performance.
    companies = pd.read_csv(companies_filename, sep=';', encoding='utf-8-sig')
    industry_table = pd.read_csv(industries_filename, sep=';', encoding='utf-8-sig')
    companies = pd.merge(companies, industry_table, on='IndustryId')
    performance = pd.read_csv(sectors_perf_filename)
    companies = pd.merge(companies, performance, on='Sector', how='inner')

    # The 'SP500' rows are joined per period; the overlapping 'Performance'
    # column then becomes 'Performance_x' (sector) and 'Performance_y' (market).
    market = performance.loc[performance['Sector'].eq('SP500')]
    print(f"Number of rows for Sectors Performance data: {len(market)}")
    companies = pd.merge(companies, market, on=['Year', 'Month'], how='inner')

    merged = pd.merge(merged, companies, on=period_keys, how='inner')

    # Columns kept under their original names, followed by the ones that are
    # renamed; dict order defines the output column order.
    kept_as_is = ['Ticker', 'Year', 'Month', 'MA Ratio']
    final_names = {
        'Return on Investment': 'Result',
        'Return on Equity': 'ROE',
        'Insider_growth': 'Insider Ownership Growth',
        'Institutional_growth': 'Institutional Ownership Growth',
        'Forecast Growth': 'Forecast EPS Growth',
        'Avg 2Q Growth': 'Avg 2Q EPS Growth',
        'Avg 2Q Surprise': 'Avg 2Q EPS Surprise',
        'YoY Growth': 'YoY EPS Growth',
        'Performance_x': 'Sector Performance',
        'Performance_y': 'Market Performance',
    }
    merged = (
        merged[kept_as_is + list(final_names)]
        .rename(columns=final_names)
        .dropna()
    )
    print(f"Result number of rows: {len(merged)}")
    return merged

### 3. Program execution

In [5]:
# Input files produced by the other preprocessors.
shareprices_filename = 'datasets/shareprices_data.csv'
roe_filename = 'datasets/ROE.csv'
eps_filename = 'datasets/EPS_growth_data.csv'
ownrshp_filename = 'datasets/ownership_growth.csv'
companies_filename = 'datasets/us-companies.csv'
industries_filename = 'datasets/industries.csv'
sectors_perf_filename = 'datasets/Sectors_2Months_performance.csv'

# Destination of the merged table.
output_filename = 'datasets/clean_stocks_data.csv'

result = merge_all_data(
    shareprices_filename=shareprices_filename,
    roe_filename=roe_filename,
    eps_filename=eps_filename,
    ownrshp_filename=ownrshp_filename,
    companies_filename=companies_filename,
    industries_filename=industries_filename,
    sectors_perf_filename=sectors_perf_filename,
)
# to_csv is called with its defaults, so the DataFrame index is written as the
# first (unnamed) column of the output file.
result.to_csv(output_filename)

Number of rows for shareprices data: 234306
Number of rows for ROE data: 229519
Number of rows for ownership growth data: 94044
Number of rows for EPS growth data: 44208
Number of rows for Sectors Performance data: 132
Result number of rows: 16012
