# Ownership growth preprocessor
This program preprocesses US stock ownership data. It calculates both insider and institutional ownership growth over a specified time period, expressed as a fractional change (e.g. 0.10 means +10%).

Data from gurufocus.com

### 1. Imports

In [16]:
import pandas as pd

### 2. Main function

In [14]:
def calculate_ownership_growth(inst_ownrshp_filename, insdr_ownrshp_filename, period_length):
    """Compute lagged insider and institutional ownership growth per ticker.

    Both CSV files are loaded, inner-joined on ticker and date, and sorted
    chronologically within each ticker. Growth is the fractional change of the
    ownership level versus the value ``period_length`` rows earlier for the
    same ticker, and is then lagged by one further row.

    Args:
        inst_ownrshp_filename: Path to the institutional ownership CSV. Must
            provide the columns ``symbol``, ``inst_ownrshp_date`` and
            ``inst_ownrshp``.
        insdr_ownrshp_filename: Path to the insider ownership CSV. Must
            provide the columns ``symbol``, ``insdr_ownrshp_date`` and
            ``insdr_ownrshp``.
        period_length: Look-back window, counted in rows per ticker. This
            equals a number of months only if each ticker has exactly one row
            per month with no gaps (assumed, not checked here).

    Returns:
        A DataFrame with the columns ``Ticker``, ``Year``, ``Month``,
        ``Insider_growth`` and ``Institutional_growth``. Rows where either
        growth value is missing or non-finite are dropped.

    Raises:
        ValueError: If ``period_length`` is smaller than 1.
    """
    # A zero window yields all-zero growth and a negative one looks into the
    # future; neither is a meaningful request, so fail loudly.
    if period_length < 1:
        raise ValueError(f"period_length must be a positive number of rows, got {period_length}")

    inst_ownrshp_data = pd.read_csv(inst_ownrshp_filename)
    insdr_ownrshp_data = pd.read_csv(insdr_ownrshp_filename)

    print(f"Number of rows for institutional ownership: {len(inst_ownrshp_data)}")
    print(f"Number of rows for insider ownership: {len(insdr_ownrshp_data)}")

    # Merge the data by ticker and date (inner join: only dates present in both files survive)
    merged_data = pd.merge(inst_ownrshp_data, insdr_ownrshp_data,
                           left_on=['symbol', 'inst_ownrshp_date'],
                           right_on=['symbol', 'insdr_ownrshp_date'])
    print(f"Number of rows after merge: {len(merged_data)}")

    merged_data['date'] = pd.to_datetime(merged_data['inst_ownrshp_date'])
    merged_data['Year'] = merged_data['date'].dt.year
    merged_data['Month'] = merged_data['date'].dt.month
    merged_data = merged_data.sort_values(['symbol', 'date'])

    infinities = [float('inf'), float('-inf')]
    growth_sources = (
        ('Insider_growth', 'insdr_ownrshp'),
        ('Institutional_growth', 'inst_ownrshp'),
    )
    for growth_col, level_col in growth_sources:
        # Calculate ownership growth against the value period_length rows back.
        # The ratio is computed explicitly (no implicit forward-fill of missing
        # values), so a gap in the data gives NaN instead of a stale comparison.
        previous_level = merged_data.groupby('symbol')[level_col].shift(period_length)
        growth = merged_data[level_col] / previous_level - 1
        # A zero base produces +/-inf, which dropna() would not remove.
        merged_data[growth_col] = growth.replace(infinities, float('nan'))
        # Shift the data by one row to reflect when the ownership data was actually publicly available
        merged_data[growth_col] = merged_data.groupby('symbol')[growth_col].shift(1)

    result_data = merged_data[['symbol', 'Year', 'Month', 'Insider_growth', 'Institutional_growth']].rename(columns={'symbol': 'Ticker'})
    result_data = result_data.dropna()
    print(f"Final number of rows: {len(result_data)}")

    return result_data

### 3. Data and program execution

In [15]:
# Source CSVs and the destination file for the processed growth table.
inst_ownrshp_filename = 'datasets/inst_ownership.csv'
insdr_ownrshp_filename = 'datasets/insdr_ownership.csv'
output_filename = 'datasets/ownership_growth.csv'

# Look-back window passed to calculate_ownership_growth (intended as 2 months).
period_length = 2

# Build the growth table, then persist it without the DataFrame index.
ownership_growth_data = calculate_ownership_growth(
    inst_ownrshp_filename=inst_ownrshp_filename,
    insdr_ownrshp_filename=insdr_ownrshp_filename,
    period_length=period_length,
)
ownership_growth_data.to_csv(output_filename, index=False)

Number of rows for institutional ownership: 233
Number of rows for insider ownership: 239
Number of rows after merge: 227
    symbol inst_ownrshp_date  inst_ownrshp  inst_ownrshp_pct  \
0     AAPL        2024-05-31       5530.18             36.06   
1     AAPL        2024-04-30       5482.57             35.75   
2     AAPL        2024-03-31       5471.89             35.68   
3     AAPL        2024-02-29       5490.11             35.55   
4     AAPL        2024-01-31       5489.88             35.55   
..     ...               ...           ...               ...   
222   AAPL        2005-05-31        234.38              4.06   
223   AAPL        2005-04-30        227.94              3.95   
224   AAPL        2005-03-31        216.34              3.75   
225   AAPL        2005-02-28        103.35              1.81   
226   AAPL        2005-01-31         47.56              0.83   

    insdr_ownrshp_date  insdr_ownrshp  insdr_ownrshp_pct  
0           2024-05-31          16.88             