# Main Preprocessor
This is main data preprocessor for classification. The programm merges all the data. The assumption is that all the other data preprocessors were already executed.
Criteria for stocks to be classified as Buy are:
- beating SP500 index
- at least 2% Return on Investment

### 1. Imports

In [1]:
import pandas as pd

### 2. Helper function
This function calculates sp500 performance over 3 months to use it as a benchmark for classification

In [2]:
def process_sp500_data(sp500_data_filename):
    df = pd.read_csv(sp500_data_filename)

    df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
    df.set_index('Date', inplace=True)
    df.sort_index(inplace=True)
    
    df['Open'] = df['Open'].str.replace(',', '').astype(float)
    df['3M_market_performance'] = (df['Open'].shift(-3) - df['Open']) / df['Open']
    df['Month'] = df.index.month
    df['Year'] = df.index.year

    df = df.dropna(subset=['3M_market_performance'])
    return df

### 2. Main function

In [3]:
def merge_all_data(shareprices_filename, roe_filename, eps_filename, ownrshp_filename, companies_filename, industries_filename, sectors_perf_filename, market_perf_filename):
    shareprices_data = pd.read_csv(shareprices_filename)
    print(f"Number of rows for shareprices data: {len(shareprices_data)}")

    # Merge shareprices with ROE
    roe = pd.read_csv(roe_filename)
    print(f"Number of rows for ROE data: {len(roe)}")
    result = shareprices_data.merge(roe, on=['Ticker', 'Year', 'Month'], how='inner')

    # Merge with ownership growth data
    ownership_growth = pd.read_csv(ownrshp_filename)
    print(f"Number of rows for ownership growth data: {len(ownership_growth)}")
    result = result.merge(ownership_growth, on=['Ticker', 'Year', 'Month'], how='inner')

    # Merge with eps growth data
    eps_growth = pd.read_csv(eps_filename)
    print(f"Number of rows for EPS growth data: {len(eps_growth)}")
    result = result.merge(eps_growth, on=['Ticker', 'Year', 'Month'], how='inner')

    # Get sector for every ticker
    us_companies = pd.read_csv(companies_filename, sep=';', encoding='utf-8-sig')
    industries = pd.read_csv(industries_filename, sep=';', encoding='utf-8-sig')
    us_companies = us_companies.merge(industries, on='IndustryId')
    sectors_perf = pd.read_csv(sectors_perf_filename)
    us_companies = us_companies.merge(sectors_perf, on='Sector', how='inner')

    # Get market (SP500) performance
    sectors_perf = sectors_perf[sectors_perf['Sector'] == 'SP500']
    print(f"Number of rows for Sectors Performance data: {len(sectors_perf)}")
    us_companies = us_companies.merge(sectors_perf, on=['Year', 'Month'], how='inner')

    # Merge with sector and market performance
    result = result.merge(us_companies, on=['Ticker', 'Year', 'Month'], how='inner')

    # Get market performance over the next 3 months (as benchmark)
    sp500_perf = process_sp500_data(market_perf_filename)
    result = result.merge(sp500_perf, on=['Year', 'Month'], how='inner')
    # Buy only when ROI beats benchmark and ROI is higher than 2%
    result['Buy'] = ((result['Return on Investment'] > result['3M_market_performance']) & (result['Return on Investment'] >= 1.02)).astype(int)
    
    result = result[[
        'Ticker', 'Year', 'Month', 'MA Ratio', "Buy", 
        'Return on Equity', 'Insider_growth', 'Institutional_growth', 
        'Forecast Growth', 'Avg 2Q Growth', 'Avg 2Q Surprise', 
        'YoY Growth', 'Performance_x', 'Performance_y']]

    result = result.rename(columns={
        'Return on Equity': 'ROE',
        'Insider_growth': 'Insider Ownership Growth',
        'Institutional_growth': 'Institutional Ownership Growth',
        'Forecast Growth': 'Forecast EPS Growth',
        'Avg 2Q Growth': 'Avg 2Q EPS Growth',
        'Avg 2Q Surprise': 'Avg 2Q EPS Surprise',
        'YoY Growth': 'YoY EPS Growth',
        'Performance_x': 'Sector Performance',
        'Performance_y': 'Market Performance'
    })

    result = result.dropna()
    print(f"Result number of rows: {len(result)}")
    return result

### 3. Programm execution

In [4]:
shareprices_filename = '../datasets/shareprices_data.csv'
roe_filename = '../datasets/ROE.csv'
eps_filename = '../datasets/EPS_growth_data.csv'
ownrshp_filename = '../datasets/ownership_growth.csv'
companies_filename = '../datasets/us-companies.csv'
industries_filename = '../datasets/industries.csv'
sectors_perf_filename = '../datasets/Sectors_2Months_performance.csv'
market_perf_filename = '../datasets/S&P 500 Historical Data.csv'

output_filename = 'stocks_data.csv'

result = merge_all_data(shareprices_filename, roe_filename, eps_filename, ownrshp_filename, companies_filename, industries_filename, sectors_perf_filename, market_perf_filename)
print(result.head())
result.to_csv(output_filename)

Number of rows for shareprices data: 234306
Number of rows for ROE data: 229519
Number of rows for ownership growth data: 94044
Number of rows for EPS growth data: 41122
Number of rows for Sectors Performance data: 132
Result number of rows: 14854
  Ticker  Year  Month  MA Ratio  Buy        ROE  Insider Ownership Growth  \
0      A  2019      1  0.961995    1   6.913148                  0.100796   
1      A  2019      2  1.066554    1   6.913148                  0.095491   
2      A  2019      3  1.046990    0   6.913148                 -0.012048   
3      A  2019      4  1.020680    0  22.637014                  0.000000   
4      A  2019      5  0.983734    0  22.637014                  0.007317   

   Institutional Ownership Growth  Forecast EPS Growth  Avg 2Q EPS Growth  \
0                       -0.004951            -0.098765           0.119862   
1                        0.023857            -0.098765           0.119862   
2                        0.025465            -0.052632    