In [21]:
import os
import pandas as pd
import jalali_pandas
import math
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")

# Locate the dataset folder: ~/Desktop/sector_slc/dataset/dataset/
desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
data_path = os.path.join(desktop, "sector_slc", "dataset")
dataset_path = os.path.join(data_path, "dataset")

# Build the file path with os.path.join instead of a hard-coded "\\" so the
# notebook resolves the same file on Windows, macOS and Linux.
dataset_df = pd.read_csv(os.path.join(dataset_path, "dataset.csv"))

# "Unnamed: 0" is presumably the index column written by an earlier to_csv
# call (TODO confirm); errors="ignore" keeps the cell working if the CSV is
# ever re-exported without it.
dataset_df = dataset_df.drop(columns="Unnamed: 0", errors="ignore")

# Distinct sector names, in order of first appearance in the dataset.
list_of_sectors = dataset_df.sector.unique().tolist()

In [36]:
# --- Backtest window -------------------------------------------------------
# First (year, month) for which predictions are produced.
# NOTE(review): the years look like Jalali calendar years -- confirm.
year_to_predict = 1396
month_to_predict = 1

# Last (year, month) covered by the walk-forward loop.
last_year = 1401
last_year_month = 3

# --- Model inputs ----------------------------------------------------------
# Column the regressor is trained to predict.
target = "excess_next_6m_return"

# Feature columns fed to the regressor, one per line.
list_of_features = [
    'P/E-ttm',
    'market_pe',
    'diff_nima_usd',
    'inflation',
    'change_in_money_supply',
    'last_6m_return',
    'last_3m_return',
    'last_1m_return',
    'last_6m_usd_return',
    'last_3m_usd_return',
    'last_1m_usd_return',
    'last_6m_index_return',
    'last_3m_index_return',
    'last_1m_index_return',
    'relative_trade_value',
    'market_relative_trade_value',
]

In [37]:
# Single-period check: fit on every row that precedes the target month, then
# rank the sectors of that month by predicted excess return.

is_target_month = (dataset_df.month == month_to_predict) & (dataset_df.year == year_to_predict)
if not is_target_month.any():
    raise ValueError(f"no rows for {year_to_predict}/{month_to_predict} in dataset_df")

# The row label is used as a positional slice bound, so this assumes
# dataset_df keeps its default RangeIndex and is sorted chronologically.
first_occurence = dataset_df[is_target_month].iloc[0].name
train_set = dataset_df[:first_occurence]

# NOTE(review): the target is named "excess_next_6m_return". If it is a
# forward-looking 6-month return, the most recent training rows carry labels
# that were not yet observable at prediction time (look-ahead) -- confirm and
# consider dropping those rows from train_set.
regressor = DecisionTreeRegressor(random_state=0, max_depth=5, criterion="absolute_error",
                                  min_samples_split=300, min_samples_leaf=150)
X = train_set[list_of_features]
y = train_set[target]
regressor.fit(X, y)

# One row per sector for the target month (first match wins), in
# list_of_sectors order. Sectors with no row in this month are skipped instead
# of raising IndexError.
month_rows = (
    dataset_df[is_target_month]
    .drop_duplicates(subset="sector", keep="first")
    .set_index("sector")
)
month_rows = month_rows.loc[[s for s in list_of_sectors if s in month_rows.index]]

# Predict all sectors in one call and build the table once rather than
# appending row by row. Returns are scaled to percent.
result_df = pd.DataFrame({
    "sector": month_rows.index.to_numpy(),
    "predicted_excess_return": regressor.predict(month_rows[list_of_features]) * 100,
    "realized_excess_return": month_rows[target].to_numpy() * 100,
})

# A shallow tree with large leaves yields few distinct predictions, so ties
# are common. A stable sort keeps the tie order deterministic (tied sectors
# stay in list_of_sectors order). The long/short legs below still depend on
# that tie order.
result_df = (
    result_df
    .sort_values(by="predicted_excess_return", ascending=False, kind="mergesort")
    .reset_index(drop=True)
)

# Long leg = mean realized return of the 3 highest-ranked sectors,
# short leg = mean realized return of the 3 lowest-ranked sectors.
long_leg_return = result_df.realized_excess_return.iloc[:3].mean()
short_leg_return = result_df.realized_excess_return.iloc[-3:].mean()
print(long_leg_return,short_leg_return)
result_df

22.326481290367457 -4.919989583427199


Unnamed: 0,sector,predicted_excess_return,realized_excess_return
0,محصولات لبنی,24.185816,-1.868183
1,پیمانکاری صنعتی,24.185816,33.803615
2,نرم افزار و خدمات,24.185816,35.044012
3,کاشی و سرامیک,24.185816,4.502787
4,محصولات پاک کننده,24.185816,-0.907015
5,سایر مواد معدنی,24.185816,-35.713495
6,لاستیک و پلاستیک,24.185816,9.399609
7,حمل و نقل بار زمینی,24.185816,-26.547081
8,مواد شیمیایی-متنوع,24.185816,0.561912
9,محصولات کشاورزی,5.918097,-9.095401


In [38]:
def _backtest_months(year, first_year, first_month, final_year, final_month):
    """Return the months of `year` that fall inside the backtest window.

    The first year starts at `first_month`, the final year stops at
    `final_month` (inclusive), and every other year covers months 1-12. Both
    bounds apply when the window starts and ends in the same year.
    """
    start = first_month if year == first_year else 1
    stop = final_month if year == final_year else 12
    return range(start, stop + 1)


def _long_short_returns(data, year, month, sectors, features, label, leg_size=3):
    """Fit on the rows preceding (year, month) and score that month's sectors.

    Parameters
    ----------
    data : DataFrame with `year`, `month`, `sector`, the feature columns and
        the label column. Assumed to have the default RangeIndex and to be
        sorted chronologically, because the training set is "every row before
        the first row of the target month".
    year, month : period to predict.
    sectors : sector names to evaluate; sectors with no row in the period are
        skipped.
    features : feature column names.
    label : target column name.
    leg_size : number of sectors in each of the long and short legs.

    Returns
    -------
    (long_leg_return, short_leg_return) : mean realized label, in percent, of
        the `leg_size` highest- and lowest-ranked sectors.

    Raises
    ------
    ValueError : if `data` has no rows for the requested period.
    """
    in_month = (data.month == month) & (data.year == year)
    if not in_month.any():
        raise ValueError(f"no rows for {year}/{month} in the dataset")

    first_row = data[in_month].iloc[0].name
    history = data[:first_row]

    # NOTE(review): if `label` is a forward-looking return, the latest rows of
    # `history` carry labels not yet observable at prediction time -- confirm.
    model = DecisionTreeRegressor(random_state=0, max_depth=5, criterion="absolute_error",
                                  min_samples_split=300, min_samples_leaf=150)
    model.fit(history[features], history[label])

    # One row per sector (first match wins), kept in the order of `sectors`.
    month_rows = (
        data[in_month]
        .drop_duplicates(subset="sector", keep="first")
        .set_index("sector")
    )
    month_rows = month_rows.loc[[s for s in sectors if s in month_rows.index]]

    ranked = pd.DataFrame({
        "sector": month_rows.index.to_numpy(),
        "predicted_excess_return": model.predict(month_rows[features]) * 100,
        "realized_excess_return": month_rows[label].to_numpy() * 100,
    })
    # Stable sort: tied predictions keep the `sectors` order, so the legs are
    # deterministic even when many sectors share a prediction.
    ranked = (
        ranked
        .sort_values(by="predicted_excess_return", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )

    realized = ranked.realized_excess_return
    return realized.iloc[:leg_size].mean(), realized.iloc[-leg_size:].mean()


# Walk-forward backtest: refit and re-rank for every month from
# (year_to_predict, month_to_predict) through (last_year, last_year_month).
performance_records = []
for year in range(year_to_predict, last_year + 1):
    for month in _backtest_months(year, year_to_predict, month_to_predict,
                                  last_year, last_year_month):
        long_leg_return, short_leg_return = _long_short_returns(
            dataset_df, year, month, list_of_sectors, list_of_features, target
        )
        performance_records.append({
            "year": year,
            "month": month,
            "long_leg_return": long_leg_return,
            "short_leg_return": short_leg_return,
        })

# Built once from records so year/month stay integers instead of being upcast
# to float by row-by-row appends.
performance_df = pd.DataFrame(
    performance_records,
    columns=["year", "month", "long_leg_return", "short_leg_return"],
)

In [39]:
# Display the per-month long-leg / short-leg returns collected by the backtest loop.
performance_df

Unnamed: 0,year,month,long_leg_return,short_leg_return
0,1396.0,1.0,22.326481,-4.919990
1,1396.0,2.0,6.427201,-12.377740
2,1396.0,3.0,-9.859288,-20.485358
3,1396.0,4.0,-15.571565,-21.162911
4,1396.0,5.0,-13.705781,-2.094443
...,...,...,...,...
58,1400.0,11.0,-7.922339,-11.988924
59,1400.0,12.0,-14.846050,-2.662880
60,1401.0,1.0,16.969145,-5.589762
61,1401.0,2.0,5.277657,22.777937


In [40]:
# Count the backtest months in which the long leg had a positive return.
(performance_df["long_leg_return"] > 0).sum()

36