In [50]:
import os
import pandas as pd
import jalali_pandas
import math
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

# Locate and load the sector dataset.
# NOTE(review): the location is tied to the current user's Desktop folder;
# consider making the data directory configurable if this notebook is shared.
desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
data_path = os.path.join(desktop, "sector_slc", "dataset")
dataset_path = os.path.join(data_path, "dataset")

# Build the file path with os.path.join instead of a hard-coded "\\" so the
# notebook also runs on non-Windows systems (on Windows the result is the same).
dataset_file = os.path.join(dataset_path, "dataset.csv")
dataset_df = pd.read_csv(dataset_file)

# Drop the index column that was written into the CSV when it was saved.
dataset_df = dataset_df.drop("Unnamed: 0", axis=1)

# Sectors in order of first appearance in the dataset.
list_of_sectors = dataset_df.sector.unique().tolist()

In [56]:
# First (year, month) to predict. The single-month cell predicts exactly this
# period and the walk-forward loop starts from it.
# NOTE(review): values look like Jalali (Solar Hijri) calendar years — confirm.
year_to_predict = 1396
month_to_predict = 1

# Last (year, month) covered by the walk-forward loop (inclusive).
last_year = 1401
last_year_month = 3

# Name of the dataset column used as the regression target.
target = "excess_next_1m_return"

# Alternative models that were tried; only the active `regressor` below is used.
# NOTE(review): in the commented-out VotingRegressor the labels 'gb'/'rf'/'lr'
# do not match the estimators they name (decision tree, k-NN, kernel ridge).
#regressor1 = DecisionTreeRegressor(random_state = 0, max_depth=5, criterion="absolute_error", min_samples_split=300, min_samples_leaf= 150)
#regressor = RandomForestRegressor(n_estimators = 100, random_state = 0,criterion="absolute_error",max_depth=4,min_samples_split=300,min_samples_leaf= 150)
#regressor2 = KNeighborsRegressor(n_neighbors=10)
#regressor3 = KernelRidge(alpha=1.0, kernel="rbf")
#regressor = VotingRegressor(estimators=[('gb', regressor1), ('rf', regressor2), ('lr', regressor3)])
# Active model. It is fitted on the raw feature columns; StandardScaler and
# make_pipeline are imported by this notebook but not applied here.
regressor = SVR(kernel="rbf")

# Dataset columns fed to the model as input features.
list_of_features = [
        'P/E-ttm','market_pe', 'diff_nima_usd', 'inflation',
        'change_in_money_supply', 'last_6m_return','last_3m_return', 'last_1m_return',
        'last_6m_usd_return', 'last_3m_usd_return', 'last_1m_usd_return',
        'last_6m_index_return', 'last_3m_index_return', 'last_1m_index_return',
        'relative_trade_value','market_relative_trade_value']

In [57]:
# Single-month check: fit the model on all rows dated before the prediction
# month, then rank the sectors of that month by predicted excess return.

# Encode (year, month) as one increasing integer so the train/predict split is
# a date comparison. The previous positional slice up to the first matching row
# silently depended on the rows being sorted chronologically with a default
# integer index.
period_index = dataset_df.year * 12 + dataset_df.month
prediction_period = year_to_predict * 12 + month_to_predict

train_set = dataset_df[period_index < prediction_period]
# One row per sector for the prediction month (the first occurrence is kept,
# matching the previous `.iloc[0]` lookup). Sectors without a row in this month
# are left out of the ranking instead of raising an IndexError.
prediction_rows = dataset_df[period_index == prediction_period].drop_duplicates(subset="sector")
if train_set.empty or prediction_rows.empty:
    raise ValueError(
        f"No training rows before, or no rows in, {year_to_predict}/{month_to_predict}"
    )

# NOTE(review): the most recent training rows carry a target named
# "excess_next_1m_return"; confirm that this value is already known at the time
# the prediction month's features are observed (possible look-ahead).
X = train_set[list_of_features]
y = train_set[target]
regressor.fit(X, y)

# Predict all sectors in one call on a DataFrame (keeps the feature names seen
# during fit) and build the result table in one go rather than row by row.
# Both return columns are multiplied by 100, as before.
result_df = (
    pd.DataFrame({
        "sector": prediction_rows["sector"].to_numpy(),
        "predicted_excess_return": regressor.predict(prediction_rows[list_of_features]) * 100,
        "realized_excess_return": prediction_rows[target].to_numpy() * 100,
    })
    .sort_values(by="predicted_excess_return", ascending=False)
    .reset_index(drop=True)
)

# Long leg = mean realized return of the top-ranked sectors,
# short leg = mean realized return of the bottom-ranked sectors.
leg_size = 3
long_leg_return = result_df.realized_excess_return.iloc[:leg_size].mean()
short_leg_return = result_df.realized_excess_return.iloc[-leg_size:].mean()
print(long_leg_return,short_leg_return)
result_df

21.352619335280846 -2.9538500736939834


Unnamed: 0,sector,predicted_excess_return,realized_excess_return
0,پیمانکاری صنعتی,10.455254,60.306543
1,تجهیزات مخابراتی,10.002033,-2.092235
2,فعالیت مهندسی,5.804041,5.843551
3,کاشی و سرامیک,5.671585,15.776192
4,سایر مواد معدنی,5.512602,-2.092235
5,سخت افزار و تجهیزات,4.072461,38.121947
6,محصولات پاک کننده,3.501352,2.380685
7,تجهیزات صنعتی,3.354799,31.426739
8,نرم افزار و خدمات,2.808779,17.362181
9,محصولات لبنی,2.263885,19.579105


In [58]:
def iter_backtest_periods(first_year, first_month, final_year, final_month):
    """Yield every (year, month) from the first period to the final one, inclusive.

    Months run 1..12. Nothing is yielded when the first period lies after the
    final one. Unlike the previous three-branch loop, this also honours
    ``first_month`` when ``first_year == final_year``.
    """
    start = first_year * 12 + (first_month - 1)
    stop = final_year * 12 + final_month  # exclusive bound on the 0-based month count
    for period in range(start, stop):
        year, month_zero_based = divmod(period, 12)
        yield year, month_zero_based + 1


def rank_sectors_for_month(data, model, features, target_column, year, month):
    """Refit ``model`` on history before (year, month) and rank that month's sectors.

    Parameters
    ----------
    data : DataFrame with ``year``, ``month``, ``sector``, the feature columns
        and the target column.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        in place on every call.
    features : list of feature column names.
    target_column : name of the target column.
    year, month : the period to predict.

    Returns
    -------
    DataFrame with columns ``sector``, ``predicted_excess_return`` and
    ``realized_excess_return`` (both multiplied by 100), sorted by prediction
    in descending order.

    Raises
    ------
    ValueError
        If there are no rows before the period or no rows in the period.
    """
    # Split by date rather than by row position so the result does not depend
    # on the row order or index of ``data``.
    period_index = data.year * 12 + data.month
    target_period = year * 12 + month
    history = data[period_index < target_period]
    # First row per sector for this month; sectors absent this month are skipped.
    current = data[period_index == target_period].drop_duplicates(subset="sector")
    if history.empty or current.empty:
        raise ValueError(f"No training rows before, or no rows in, {year}/{month}")

    model.fit(history[features], history[target_column])
    ranked = pd.DataFrame({
        "sector": current["sector"].to_numpy(),
        "predicted_excess_return": model.predict(current[features]) * 100,
        "realized_excess_return": current[target_column].to_numpy() * 100,
    })
    return ranked.sort_values(by="predicted_excess_return", ascending=False).reset_index(drop=True)


# Walk-forward backtest: for every month in the configured range, refit on the
# history available before that month and record the mean realized return of
# the top and bottom LEG_SIZE sectors by predicted excess return.
LEG_SIZE = 3
performance_records = []
for year, month in iter_backtest_periods(year_to_predict, month_to_predict, last_year, last_year_month):
    result_df = rank_sectors_for_month(dataset_df, regressor, list_of_features, target, year, month)
    long_leg_return = result_df.realized_excess_return.iloc[:LEG_SIZE].mean()
    short_leg_return = result_df.realized_excess_return.iloc[-LEG_SIZE:].mean()
    performance_records.append({
        "year": year,
        "month": month,
        "long_leg_return": long_leg_return,
        "short_leg_return": short_leg_return,
    })

# Build the frame once from the collected records; year and month stay integers
# instead of being coerced to floats by row-wise appends.
performance_df = pd.DataFrame(
    performance_records,
    columns=["year", "month", "long_leg_return", "short_leg_return"],
)

In [59]:
# Display the per-month long-leg and short-leg returns collected by the backtest.
performance_df

Unnamed: 0,year,month,long_leg_return,short_leg_return
0,1396.0,1.0,21.352619,-2.953850
1,1396.0,2.0,17.525151,-3.027144
2,1396.0,3.0,3.076281,3.347082
3,1396.0,4.0,10.914690,-7.553826
4,1396.0,5.0,-1.596141,-8.612474
...,...,...,...,...
58,1400.0,11.0,12.729292,9.218469
59,1400.0,12.0,12.517564,-1.297154
60,1401.0,1.0,-4.869575,-1.836618
61,1401.0,2.0,14.767256,-4.146369


In [60]:
# Share of backtest months in which the long leg had a positive return
# (the mean of a boolean column is the fraction of True values).
performance_df["long_leg_return"].gt(0).mean()

0.7619047619047619