In [None]:
import pandas as pd # to read data and handle dataframe
import matplotlib.pyplot as plt # to plot graphs
import statsmodels.api as sm # to build the model
from sklearn.preprocessing import MinMaxScaler # to scale the data
from sklearn.model_selection import train_test_split # to split the data into train and test
import numpy as np # to do mathematical operations


In [None]:
# Forward slashes work on every OS and avoid the invalid "\E" escape sequence
# that the backslash form of this path contained.
bank_data_loc = "Data/EVDS Data.xlsx"

# Read the EVDS workbook into a dataframe.
df_bank = pd.read_excel(bank_data_loc)
df_bank = df_bank[:-23]  # remove the last 23 rows which are not data

# All of the EVDS data
df_bank.head()

In [None]:
# Google Trends exports, one CSV per search term. Forward slashes keep the
# paths portable and avoid invalid escape sequences such as "\c" or "\y".
cigkofte_data_loc = "Data/cigkofte.csv"
faiz_orani_data_loc = "Data/mfaiz_orani.csv"
yemeksepeti_data_loc = "Data/yemeksepeti.csv"
kariyernet_data_loc = "Data/kariyer.net.csv"
is_ilani_data_loc = "Data/is_ilanı.csv"
tcmb_faiz_orani_loc = "Data/TCMB_faiz_orani.csv"
kredi_kart_borcu_loc = "Data/kredi_karti_borcu.csv"

# Read all of the data. skiprows=1 discards the first line of each file; the
# header row that follows must still provide the "Ay" column used for merging.
cigkofte_df = pd.read_csv(cigkofte_data_loc, skiprows=1)
faiz_df = pd.read_csv(faiz_orani_data_loc, skiprows=1)
yemeksepeti_df = pd.read_csv(yemeksepeti_data_loc, skiprows=1)
kariyernet_df = pd.read_csv(kariyernet_data_loc, skiprows=1)
is_ilani_df = pd.read_csv(is_ilani_data_loc, skiprows=1)
tcmb_faiz_orani_df = pd.read_csv(tcmb_faiz_orani_loc, skiprows=1)
kredi_kart_borcu_df = pd.read_csv(kredi_kart_borcu_loc, skiprows=1)

# Combine all of the dataframes on the month column. The merge order fixes the
# column order, which the later positional rename of the columns relies on.
merged_df = cigkofte_df
for trend_df in (
    faiz_df,
    yemeksepeti_df,
    kariyernet_df,
    is_ilani_df,
    tcmb_faiz_orani_df,
    kredi_kart_borcu_df,
):
    merged_df = pd.merge(merged_df, trend_df, on="Ay")

# Expose the month under the name "Tarih", the key used to join the EVDS data.
merged_df["Tarih"] = merged_df["Ay"]
merged_df.drop("Ay", axis=1, inplace=True)

# All of the Google Trends data
merged_df.head()

In [None]:
# Join the EVDS series with the Google Trends series on the shared month key.
df = pd.merge(df_bank, merged_df, on="Tarih")

# Positional rename: the order below must match the merged column order.
df.columns = [
    "Date",
    "Unemployement",
    "Interest(2-14 days)",
    "Food Price Index",
    "Cigkofte",
    "Faiz_Orani",
    "Yemeksepeti",
    "Kariyer.net",
    "Is_ilani",
    "TCMB Faiz Orani",
    "Kredi Karti Borcu",
]

# Use the date as the row index instead of keeping it as a regular column.
df = df.set_index("Date")

# All of the data
df.head()

In [None]:
# Inspect the dtype of every column; columns reported as "object" still need
# converting to numbers before modelling.
df.info()

In [None]:
# These two EVDS series are converted from object dtype to float so that they
# can be scaled and used in the regressions.
for object_column in ("Unemployement", "Interest(2-14 days)"):
    df[object_column] = df[object_column].astype(float)

# Confirm the conversion.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
df.describe()

In [None]:
# Min-max scaler, refitted separately on each EVDS column below.
scaler = MinMaxScaler()

# Rescale the EVDS series to 0-100, like the Google Trends data.
for evds_column in ("Unemployement", "Interest(2-14 days)", "Food Price Index"):
    column_values = df[evds_column].values.reshape(-1, 1)
    df[evds_column] = scaler.fit_transform(column_values) * 100

# Linear time index 1..N used as the trend regressor and as the x-axis of plots.
df["Trend"] = range(1, len(df) + 1)

df.head()

In [None]:
# Pairwise correlation between all columns, shown as a table.
df.corr()

In [None]:
# Visual counterpart of the correlation table: pairwise scatter plots of every
# column, with a kernel density estimate of each column on the diagonal.
pd.plotting.scatter_matrix(df, figsize=(20, 20), diagonal="kde", s = 80)

plt.show()

In [None]:
# Draw every column against the trend index, one figure per column.
for col, series in df.items():
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(df["Trend"], series, label=col, color="blue")
    ax.set_title(col)
    plt.show()

In [None]:
# Autocorrelation plot of every column, one figure per column.
for col, series in df.items():
    fig, ax = plt.subplots(figsize=(10, 5))
    pd.plotting.autocorrelation_plot(series, ax=ax)
    ax.set_title(col)
    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    plt.show()

In [None]:
# Working frame for the unemployment models: a copy of the data without the
# series that are not considered related to unemployment.
a = df.drop(columns=["Faiz_Orani", "Cigkofte", "Yemeksepeti", "TCMB Faiz Orani"])
a.head()

In [None]:
# Response variable (scaled unemployment) over the trend index.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(a["Trend"], a["Unemployement"], label="Unemployement", color="blue")
plt.show()

In [None]:
# Add the "const" column so the regressions have an intercept term.
a = sm.add_constant(a)

# Baseline design matrix: intercept, trend and the remaining raw series.
independent_set_1 = a.loc[
    :,
    [
        "const",
        "Trend",
        "Food Price Index",
        "Interest(2-14 days)",
        "Kredi Karti Borcu",
        "Kariyer.net",
        "Is_ilani",
    ],
]

lm = sm.OLS(endog=a["Unemployement"], exog=independent_set_1)
result = lm.fit()
print(result.summary())

In [None]:
# Plot the autocorrelation of the unemployment series
plt.figure(figsize=(12, 8))
pd.plotting.autocorrelation_plot(df["Unemployement"])
plt.title("Unemployement Autocorrelation")
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
# Mark every 12th lag (12, 24, ..., 108); with one row per month these are the
# whole-year lags, which makes a yearly seasonal pattern easy to spot.
for i in range(12, 120, 12):
    plt.axvline(x=i, color='black', linestyle='--', label="Lag " + str(i))
plt.legend(loc="best")
plt.show()

In [None]:
# December is left out so that it acts as the baseline month.
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November",
]

# Month number of every row, parsed from the "YYYY-MM" index labels.
row_month_numbers = [int(label.split("-")[1]) for label in a.index]

# One 0/1 indicator column per listed month.
for month_number, m in enumerate(months, start=1):
    a[m] = [1 if row_month == month_number else 0 for row_month in row_month_numbers]

a.head()

In [None]:
 #  Model with monthly categorical variables
# Baseline regressors extended with the monthly indicator columns.
independent_set_2 = a.loc[
    :,
    [
        "const",
        "Trend",
        "Food Price Index",
        "Interest(2-14 days)",
        "Kredi Karti Borcu",
        "Kariyer.net",
        "Is_ilani",
    ]
    + months,
]

lm = sm.OLS(endog=a["Unemployement"], exog=independent_set_2)
result = lm.fit()
print(result.summary())

In [None]:
# Plot the autocorrelation of the unemployment series
plt.figure(figsize=(12, 8))
pd.plotting.autocorrelation_plot(df["Unemployement"])
plt.title("Unemployement Autocorrelation")
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
# Mark each of the lags 0..12, i.e. every monthly lag within the first year.
for i in range(13): 
    plt.axvline(x=i, color='g', linestyle='--')
plt.show()

In [None]:
# 2023 is deliberately left out: the models already contain an intercept
# ("const"), so a dummy for every year would sum to the constant column and make
# the design matrix singular (dummy-variable trap). 2023 is the baseline year.
year = ["2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022"]

# Year of every row, parsed from the "YYYY-MM" index labels.
row_years = [int(label.split("-")[0]) for label in a.index]

# Create a 0/1 indicator column for each listed year.
for y in year:
    a[y] = [1 if row_year == int(y) else 0 for row_year in row_years]

a.head()

In [None]:
# Baseline regressors plus both the monthly and the yearly indicator columns.
independent_set_3 = a.loc[
    :,
    [
        "const",
        "Trend",
        "Food Price Index",
        "Interest(2-14 days)",
        "Kredi Karti Borcu",
        "Kariyer.net",
        "Is_ilani",
    ]
    + months
    + year,
]

lm = sm.OLS(endog=a["Unemployement"], exog=independent_set_3)
result = lm.fit()
print(result.summary())

In [None]:
# Same as the previous model but without "Kredi Karti Borcu" and "Kariyer.net",
# which were judged not significant.
independent_set_4 = a.loc[
    :,
    ["const", "Trend", "Food Price Index", "Interest(2-14 days)", "Is_ilani"]
    + months
    + year,
]

lm = sm.OLS(endog=a["Unemployement"], exog=independent_set_4)
result = lm.fit()
print(result.summary())

In [None]:
# Unemployment series with a vertical marker where the trend will be split.
fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(a["Trend"], a["Unemployement"], label="Real Values", color="blue")
ax.axvline(x=45, color="black", linestyle="--", label="Trend Split Point")
ax.legend(loc="best")
plt.show()

In [None]:
# Split the trend at row 45: Trend_1 carries the trend values 1..45 and is zero
# afterwards, Trend_2 is zero for the first 45 rows and carries 46..N afterwards.
# The padding is derived from len(a) instead of a hard-coded row count.
a["Trend_1"] = list(range(1, 46)) + [0] * (len(a) - 45)
a["Trend_2"] = [0] * 45 + list(range(46, len(a) + 1))

a.head()

In [None]:
# Model with split trend. The overall "Trend" column is left out on purpose:
# Trend_1 + Trend_2 reproduces Trend row by row, so keeping all three would make
# the design matrix singular and the trend coefficients unidentifiable. The
# fitted values are the same because the column space does not change.
independent_set_5 = a[["const", "Food Price Index", "Interest(2-14 days)", "Is_ilani", "Trend_1", "Trend_2"] + months + year]

lm = sm.OLS(a["Unemployement"], independent_set_5)
result = lm.fit()
print(result.summary())

In [None]:
# Compare all of the models by their average train/test mean squared error
# over repeated random 80/20 splits.

independent_sets = [independent_set_1, independent_set_2, independent_set_3, independent_set_4, independent_set_5]
results_test = []
results_train = []
n_repeats = 1000  # many repetitions to get a more stable average

# The loop variable is not called `set`, which would shadow the builtin for the
# rest of the kernel session.
for design_matrix in independent_sets:
    train = []
    test = []
    for split_seed in range(n_repeats):
        # A distinct, fixed seed per repetition keeps the comparison reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            design_matrix, a["Unemployement"], test_size=0.2, random_state=split_seed
        )
        lm = sm.OLS(y_train, X_train)
        result = lm.fit()
        # MSE on the training rows and on the held-out rows
        train.append(np.mean(result.resid ** 2))
        test.append(np.mean((result.predict(X_test) - y_test) ** 2))

    results_train.append(np.mean(train))
    results_test.append(np.mean(test))

# x-axis: model number, derived from the number of models compared.
model_numbers = range(1, len(independent_sets) + 1)

plt.figure(figsize=(10, 5))
plt.plot(model_numbers, results_train, label="Train")
plt.plot(model_numbers, results_test, label="Test")
plt.legend(loc="best")
plt.show()

In [None]:
# Model 5 is taken as the best set: the selection criterion is the lowest test
# error, to prevent overfitting.
best_set = independent_set_5.copy() # the best set is the one with the lowest test error to prevent overfitting

# Refit the chosen model on all rows and show its summary.
lm = sm.OLS(a["Unemployement"], best_set)
result = lm.fit()
print(result.summary())

In [None]:
# Overlay the model's fitted values on the observed unemployment series.
fig, ax = plt.subplots(figsize=(13, 8))
ax.plot(a["Trend"], a["Unemployement"], label="Real Values", color="blue")
ax.plot(a["Trend"], result.fittedvalues, label="Fitted Values", color="red")
ax.legend(loc="best")
plt.show()

In [None]:
# Check normality of the residuals: a kernel density estimate that looks
# roughly bell-shaped and centred on zero supports the normality assumption.
result.resid.plot(kind="kde", title="Residuals", figsize=(10, 5))

In [None]:
# Residuals of the current model against the trend index.
residual = result.resid

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(df["Trend"], residual, label="Residual", color="blue", s=10)
ax.set_title("Residuals of the Model")

plt.show()

In [None]:
# Autocorrelation of the residuals at every possible lag, drawn as markers.
plt.figure(figsize=(10, 6))  
plt.acorr(residual, maxlags=len(residual)-1, usevlines = False, marker='o')
# Reference band at +/-0.125.
# NOTE(review): the 0.125 threshold is hard-coded; confirm how it was derived.
plt.axhline(y=0.125, color='red', linestyle='--') 
plt.axhline(y=-0.125, color='red', linestyle='--') 
plt.title("Autocorrelation of Residuals")
# Show only the non-negative lags.
plt.xlim(0, )
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
plt.show()

In [None]:
# Read the "linkedin" Google Trends series. Forward slashes keep the path
# portable and avoid the invalid "\l" escape sequence of the backslash form.
linkedin_data_loc = "Data/linkedin.csv"
linkedin_df = pd.read_csv(linkedin_data_loc, skiprows = 1)
linkedin_df.columns = ["Month", "Linkedin"]

# Values are attached by row position, so the file must have exactly one row
# per row of `a`, in the same order.
a["Linkedin"] = list(linkedin_df["Linkedin"]) # add the data to the dataframe

# Read the "issizlik maasi" Google Trends series.
issizlik_maasi_data_loc = "Data/issizlikmaasi.csv"
issizlik_maasi_df = pd.read_csv(issizlik_maasi_data_loc, skiprows = 1)
issizlik_maasi_df.columns = ["Month", "Issizlik Maasi"]

# The target column name starts with a dotted capital "İ"; later cells refer to
# it by exactly this name.
a["İssizlik Maasi"] = list(issizlik_maasi_df["Issizlik Maasi"]) # add the data to the dataframe


a.head()

In [None]:
# The two newly added series over the trend index.
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(a["Trend"], a["Linkedin"], label="Linkedin", color="red")
ax.plot(a["Trend"], a["İssizlik Maasi"], label="İssizlik Maasi", color="green")
ax.legend(loc="best")

plt.show()

In [None]:
# Model with additional data. The overall "Trend" column is left out on
# purpose: Trend_1 + Trend_2 reproduces Trend row by row, so including all three
# would make the design matrix singular.
independent_set_6 = a[["const", "Food Price Index", "Interest(2-14 days)", "Is_ilani", "Trend_1", "Trend_2", "Linkedin", "İssizlik Maasi"] + months + year]

lm = sm.OLS(a["Unemployement"], independent_set_6)
result = lm.fit()
print(result.summary())

In [None]:
# Overlay the fitted values of the extended model on the observed series.
fig, ax = plt.subplots(figsize=(13, 8))
ax.plot(a["Trend"], a["Unemployement"], label="Real Values", color="blue")
ax.plot(a["Trend"], result.fittedvalues, label="Fitted Values", color="red")
ax.legend(loc="best")
plt.show()

In [None]:
# Check normality of the residuals of the extended model via their kernel
# density estimate.
result.resid.plot(kind="kde", title="Residuals", figsize=(10, 5))

In [None]:
# Residuals of the extended model against the trend index.
residual = result.resid

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(df["Trend"], residual, label="Residual", color="blue", s=10)
ax.set_title("Residuals of the Model")

plt.show()

In [None]:
# Autocorrelation of the residuals at every possible lag, drawn as markers.
plt.figure(figsize=(10, 6))  
plt.acorr(residual, maxlags=len(residual)-1, usevlines = False, marker='o')
# Reference band at +/-0.125.
# NOTE(review): the 0.125 threshold is hard-coded; confirm how it was derived.
plt.axhline(y=0.125, color='red', linestyle='--') 
plt.axhline(y=-0.125, color='red', linestyle='--') 
plt.title("Autocorrelation of Residuals")
# Show only the non-negative lags.
plt.xlim(0, )
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
plt.show()

In [None]:
# Delete the unemployment-analysis variables so the next section starts from a
# clean set of names (the same names are reused there).
del a, independent_set_1, independent_set_2, independent_set_3, independent_set_4, independent_set_5, independent_set_6 ,best_set, lm, result, residual

In [None]:
# Working frame for the Food Price Index models: a copy of the data without the
# series that are not considered related to food prices.
a = df.drop(
    columns=["TCMB Faiz Orani", "Kredi Karti Borcu", "Is_ilani", "Kariyer.net", "Faiz_Orani"]
)

a.head()

In [None]:
# Response variable (scaled Food Price Index) over the trend index.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(a["Trend"], a["Food Price Index"], color="blue")
plt.show()

In [None]:
# Add the "const" column so the regressions have an intercept term.
a = sm.add_constant(a)

# Baseline design matrix: the series considered related to food prices.
independent_set_1 = a.loc[
    :,
    ["const", "Cigkofte", "Unemployement", "Interest(2-14 days)", "Trend", "Yemeksepeti"],
]

lm = sm.OLS(endog=a["Food Price Index"], exog=independent_set_1)
result = lm.fit()
print(result.summary())

In [None]:
# Plot the autocorrelation of the Food Price Index
plt.figure(figsize=(10, 6))
pd.plotting.autocorrelation_plot(df["Food Price Index"])
plt.title("Food Price Index Autocorrelation")
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
# Mark each of the lags 0..12, i.e. every monthly lag within the first year.
# The loop variable has a real name because its value is used.
for lag in range(13):
    plt.axvline(x=lag, color='g', linestyle='--')
# No legend call: none of the drawn artists has a label, so a legend would be
# empty and only trigger a matplotlib warning.
plt.show()

In [None]:
# Years 2015-2022 as strings. No 2023 because there is an intercept, which
# makes 2023 the baseline year.
year = [str(calendar_year) for calendar_year in range(2015, 2023)]

# Year of every row, parsed from the "YYYY-MM" index labels.
row_years = [int(label.split("-")[0]) for label in a.index]

# One 0/1 indicator column per listed year.
for y in year:
    a[y] = [1 if row_year == int(y) else 0 for row_year in row_years]

a.head()

In [None]:
# Baseline regressors extended with the yearly indicator columns.
independent_set_2 = a.loc[
    :,
    ["const", "Cigkofte", "Unemployement", "Interest(2-14 days)", "Trend", "Yemeksepeti"]
    + year,
]

lm = sm.OLS(endog=a["Food Price Index"], exog=independent_set_2)
result = lm.fit()
print(result.summary())

In [None]:
# Same as the previous model but without "Cigkofte", which was judged not
# significant.
independent_set_3 = a.loc[
    :,
    ["const", "Unemployement", "Interest(2-14 days)", "Trend", "Yemeksepeti"] + year,
]

lm = sm.OLS(endog=a["Food Price Index"], exog=independent_set_3)
result = lm.fit()
print(result.summary())

In [None]:
# Plot every non-dummy column against the Food Price Index to look for
# relationships worth modelling.
for col in a:
    if col in year:
        continue  # the yearly indicator columns are not informative here
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(a["Food Price Index"], a[col], label=col, color="blue")
    ax.legend(loc="best")
    plt.show()

In [None]:
# Zero-based row position, used to switch each engineered variable on or off.
row_position = np.arange(len(a))

# Exponentially decaying transform of the trend.
a["e^Trend"] = np.exp(-a["Trend"])

# exp(-interest) from row 38 onward, zero before. Each row uses its OWN interest
# value; the previous version filled rows 38.. with the values of rows 0..69,
# which shifted the series by 38 months.
a["Interest(2-14 days)_2"] = np.where(row_position >= 38, np.exp(-a["Interest(2-14 days)"]), 0.0)

# exp(-Cigkofte) for the first 18 rows, zero afterwards.
a["Cigkofte_2"] = np.where(row_position < 18, np.exp(-a["Cigkofte"]), 0.0)

# Step indicators: 0 before the break row, 1 from the break row onward.
a["Cigkofte_3"] = np.where(row_position >= 65, 1, 0)
a["Trend_2"] = np.where(row_position >= 81, 1, 0)

In [None]:
# Model 3 extended with all of the engineered variables.
independent_set_4 = a.loc[
    :,
    [
        "const",
        "Unemployement",
        "Interest(2-14 days)",
        "Trend",
        "Yemeksepeti",
        "e^Trend",
        "Interest(2-14 days)_2",
        "Cigkofte_2",
        "Cigkofte_3",
        "Trend_2",
    ]
    + year,
]

lm = sm.OLS(endog=a["Food Price Index"], exog=independent_set_4)
result = lm.fit()
print(result.summary())

In [None]:
# Model after dropping "Cigkofte_2", which was judged not significant.
independent_set_5 = a[["const", "Unemployement", "Interest(2-14 days)", "Trend", "Yemeksepeti", "e^Trend", "Interest(2-14 days)_2", "Cigkofte_3", "Trend_2"] + year]

lm = sm.OLS(a["Food Price Index"], independent_set_5)
result = lm.fit()
# Show the fit like every other model cell does; it was previously never displayed.
print(result.summary())

In [None]:
# Reduced model: keeps "e^Trend" but none of the other engineered variables.
independent_set_6 = a[["const", "Unemployement", "Interest(2-14 days)", "Trend", "Yemeksepeti", "e^Trend"] + year]

# Fit the set defined above (the previous version refitted independent_set_4,
# so this model was never actually estimated here).
lm = sm.OLS(a["Food Price Index"], independent_set_6)
result = lm.fit()
print(result.summary())

In [None]:
# Compare all of the models by their average train/test mean squared error
# over repeated random 80/20 splits.
independent_sets = [independent_set_1, independent_set_2, independent_set_3, independent_set_4, independent_set_5, independent_set_6]
results_test = []
results_train = []
n_repeats = 1000  # many repetitions to get a more stable average

# The loop variable is not called `set`, which would shadow the builtin for the
# rest of the kernel session.
for design_matrix in independent_sets:
    train = []
    test = []
    for split_seed in range(n_repeats):
        # The response of this section is the Food Price Index. The previous
        # version predicted "Unemployement", which is itself a regressor in
        # every set, so the reported errors were meaningless.
        # A distinct, fixed seed per repetition keeps the comparison reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            design_matrix, a["Food Price Index"], test_size=0.2, random_state=split_seed
        )
        lm = sm.OLS(y_train, X_train)
        result = lm.fit()
        # MSE on the training rows and on the held-out rows
        train.append(np.mean(result.resid ** 2))
        test.append(np.mean((result.predict(X_test) - y_test) ** 2))

    results_train.append(np.mean(train))
    results_test.append(np.mean(test))

# x-axis: model number, derived from the number of models compared.
model_numbers = range(1, len(independent_sets) + 1)

plt.figure(figsize=(10, 5))
plt.plot(model_numbers, results_train, label="Train")
plt.plot(model_numbers, results_test, label="Test")
plt.legend(loc="best")
plt.show()

In [None]:
# Zoom in on models 3-6 by leaving out the first two models' errors.
zoomed_model_numbers = range(3, 7)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(zoomed_model_numbers, results_train[2:], label="Train")
ax.plot(zoomed_model_numbers, results_test[2:], label="Test")
ax.legend(loc="best")
plt.show()

In [None]:
# Model 3 is taken as the best set for the Food Price Index.
best_set = independent_set_3.copy()

# Refit the chosen model on all rows and show its summary.
lm = sm.OLS(a["Food Price Index"], best_set)
result = lm.fit()
print(result.summary())

In [None]:
# Overlay the model's fitted values on the observed Food Price Index.
fig, ax = plt.subplots(figsize=(13, 8))
ax.plot(a["Trend"], a["Food Price Index"], label="Real Values", color="blue")
ax.plot(a["Trend"], result.fittedvalues, label="Fitted Values", color="red")
ax.legend(loc="best")
plt.show()

In [None]:
# Check normality of the residuals: a kernel density estimate that looks
# roughly bell-shaped and centred on zero supports the normality assumption.
result.resid.plot(kind="kde", title="Residuals", figsize=(10, 5))

In [None]:
# Residuals against the trend index; visible runs or patterns over time would
# hint at autocorrelation.
residual = result.resid

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(df["Trend"], residual, label="Residual", color="blue", s=10)
ax.set_title("Residuals of the Model")

plt.show()

In [None]:
# Autocorrelation of the residuals at every possible lag, drawn as markers.
plt.figure(figsize=(10, 6))  
plt.acorr(residual, maxlags=len(residual)-1, usevlines = False, marker='o')
# Reference band at +/-0.125.
# NOTE(review): the 0.125 threshold is hard-coded; confirm how it was derived.
plt.axhline(y=0.125, color='red', linestyle='--') 
plt.axhline(y=-0.125, color='red', linestyle='--') 
plt.title("Autocorrelation of Residuals")
# Show only the non-negative lags.
plt.xlim(0, )
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
plt.show()

In [None]:
# Three more Google Trends series for the food model. Forward slashes keep the
# paths portable and avoid invalid escape sequences such as "\g" or "\y".
# Values are attached by row position, so every file must have exactly one row
# per row of `a`, in the same order.

# Read the "getir" series
getir_data_loc = "Data/getir.csv"
getir_df = pd.read_csv(getir_data_loc, skiprows = 1)
getir_df.columns = ["Month", "Getir"]

a["Getir"] = list(getir_df["Getir"]) # add the data to the dataframe

# Read the "yemek tarifleri" series
yemek_tarifleri_data_loc = "Data/yemek_tarifleri.csv"
yemek_tarifleri_df = pd.read_csv(yemek_tarifleri_data_loc, skiprows = 1)
yemek_tarifleri_df.columns = ["Month", "Yemek Tarifleri"]

a["Yemek Tarifleri"] = list(yemek_tarifleri_df["Yemek Tarifleri"]) # add the data to the dataframe

# Read the "market fiyatları" series
market_fiyatları_data_loc = "Data/market_fiyatları.csv"
market_fiyatları_df = pd.read_csv(market_fiyatları_data_loc, skiprows = 1)
market_fiyatları_df.columns = ["Month", "Market Fiyatları"]

a["Market Fiyatları"] = list(market_fiyatları_df["Market Fiyatları"]) # add the data to the dataframe

a.head()

In [None]:
# The three newly added series over the trend index, one colour per series.
fig, ax = plt.subplots(figsize=(10, 5))

for series_name, line_color in (
    ("Getir", "red"),
    ("Yemek Tarifleri", "green"),
    ("Market Fiyatları", "blue"),
):
    ax.plot(a["Trend"], a[series_name], label=series_name, color=line_color)

ax.legend(loc="best")

plt.show()

In [None]:
# Model 3 extended with the three additional Google Trends series.
independent_set_7 = a.loc[
    :,
    [
        "const",
        "Unemployement",
        "Interest(2-14 days)",
        "Trend",
        "Yemeksepeti",
        "Getir",
        "Yemek Tarifleri",
        "Market Fiyatları",
    ]
    + year,
]

lm = sm.OLS(endog=a["Food Price Index"], exog=independent_set_7)
result = lm.fit()
print(result.summary())

In [None]:
# Overlay the fitted values of the extended model on the observed series.
fig, ax = plt.subplots(figsize=(13, 8))
ax.plot(a["Trend"], a["Food Price Index"], label="Real Values", color="blue")
ax.plot(a["Trend"], result.fittedvalues, label="Fitted Values", color="red")
ax.legend(loc="best")
plt.show()

In [None]:
# Check normality of the residuals of the extended model via their kernel
# density estimate.
result.resid.plot(kind="kde", title="Residuals", figsize=(10, 5))

In [None]:
# Residuals of the extended model against the trend index; visible runs or
# patterns over time would hint at autocorrelation.
residual = result.resid

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(df["Trend"], residual, label="Residual", color="blue", s=10)
ax.set_title("Residuals of the Model")

plt.show()

In [None]:
# Autocorrelation of the residuals at every possible lag, drawn as markers.
plt.figure(figsize=(10, 6))  
plt.acorr(residual, maxlags=len(residual)-1, usevlines = False, marker='o')
# Reference band at +/-0.125.
# NOTE(review): the 0.125 threshold is hard-coded; confirm how it was derived.
plt.axhline(y=0.125, color='red', linestyle='--') 
plt.axhline(y=-0.125, color='red', linestyle='--') 
plt.title("Autocorrelation of Residuals")
# Show only the non-negative lags.
plt.xlim(0, )
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
plt.show()

In [None]:
# Delete the food-price-analysis variables so the next section starts from a
# clean set of names (the same names are reused there).
del a, independent_set_1, independent_set_2, independent_set_3, independent_set_4, independent_set_5, independent_set_6, independent_set_7, lm, result, residual, best_set, independent_sets

In [None]:
# Working frame for the interest-rate models: a copy of the data without the
# series that are not considered related to interest rates.
a = df.drop(columns=["Yemeksepeti", "Kariyer.net", "Is_ilani", "Cigkofte"])

a.head()

In [None]:
# Response variable (scaled 2-14 day interest rate) over the trend index.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(a["Trend"], a["Interest(2-14 days)"], color="blue")
plt.show()

In [None]:
# Add the "const" column so the regressions have an intercept term.
a = sm.add_constant(a)

# Baseline design matrix: the series considered related to interest rates.
independent_set_1 = a.loc[
    :,
    [
        "const",
        "Faiz_Orani",
        "Unemployement",
        "Food Price Index",
        "Trend",
        "TCMB Faiz Orani",
        "Kredi Karti Borcu",
    ],
]

lm = sm.OLS(endog=a["Interest(2-14 days)"], exog=independent_set_1)
result = lm.fit()
print(result.summary())

In [None]:
# Boundaries ("YYYY-MM") of the consecutive periods. Plain string comparison is
# enough because this format sorts chronologically.
split_points = ["2015-01","2016-11","2018-04", "2018-07", "2019-07", "2020-02", "2020-10", "2022-07", "2023-05"] # Split points

# TCMB_i is 1 for rows in [split_points[i-1], split_points[i]) and 0 otherwise.
# (A former loop that pre-filled these columns with zeros was removed: every
# column it created is assigned here anyway.)
for i, (period_start, period_end) in enumerate(zip(split_points, split_points[1:]), start=1):
    a["TCMB_" + str(i)] = [1 if period_start <= index < period_end else 0 for index in a.index]

# Last period: everything from the final split point onward.
a["TCMB_" + str(len(split_points))] = [1 if split_points[-1] <= index else 0 for index in a.index]

# Only the first len(split_points) - 1 dummies are used as regressors; the last
# period is left out as the baseline because the models contain an intercept.
tcmb_lst = ["TCMB_" + str(i) for i in range(1, len(split_points))] # TCMB columns

In [None]:
# Baseline regressors extended with the TCMB period indicator columns.
independent_set_2 = a.loc[
    :,
    [
        "const",
        "Faiz_Orani",
        "Unemployement",
        "Food Price Index",
        "Trend",
        "TCMB Faiz Orani",
        "Kredi Karti Borcu",
    ]
    + tcmb_lst,
]

lm = sm.OLS(endog=a["Interest(2-14 days)"], exog=independent_set_2)
result = lm.fit()
print(result.summary())

In [None]:
# Observed interest rate (blue) against the fitted values of the current model
# (red), used to look for stretches where the fit misses a local trend.
plt.figure(figsize=(13, 8))
plt.plot(a["Trend"], a["Interest(2-14 days)"], label="Real Values", color="blue")
plt.plot(a["Trend"], result.fittedvalues, label="Fitted Values", color="red")

In [None]:
# Observed vs fitted interest rate, with the candidate local-trend intervals
# marked by a pair of vertical lines each.
plt.figure(figsize=(13, 8))
plt.plot(a["Trend"], a["Interest(2-14 days)"], label="Real Values", color="blue")
plt.plot(a["Trend"], result.fittedvalues, label="Fitted Values", color="red")

# (legend label, colour, (start, end)) of every interval, in trend units.
split_intervals = [
    ("Trend Split Interval 1", "black", (45, 55)),
    ("Trend Split Interval 2", "green", (100, len(a))),
    ("Trend Split Interval 3", "orange", (73, 83)),
    ("Trend Split Interval 4", "purple", (23, 40)),
]

for interval_label, interval_color, (interval_start, interval_end) in split_intervals:
    # Only the first line of each pair is labelled so that every interval shows
    # up once in the legend instead of twice.
    plt.axvline(x=interval_start, color=interval_color, linestyle="--", label=interval_label)
    plt.axvline(x=interval_end, color=interval_color, linestyle="--")

plt.legend(loc="best")
plt.show()

In [None]:
# New additions: local trend columns. Each column counts 1, 2, 3, ... over the
# rows start..stop-1 (zero-based positions) and is 0 everywhere else. The
# padding is derived from len(a) instead of hard-coded zero counts.
local_trend_intervals = {
    "Trend_2": (100, len(a)),
    "Trend_3": (45, 55),
    "Trend_4": (73, 83),
    "Trend_5": (23, 40),
}

row_position = np.arange(len(a))
for trend_column, (start, stop) in local_trend_intervals.items():
    inside_interval = (row_position >= start) & (row_position < stop)
    a[trend_column] = np.where(inside_interval, row_position - start + 1, 0)

trend_lst = ["Trend_2", "Trend_3", "Trend_4", "Trend_5"] # Trend columns

In [None]:
# Baseline regressors plus the TCMB period indicators and the local trends.
independent_set_3 = a.loc[
    :,
    [
        "const",
        "Faiz_Orani",
        "Unemployement",
        "Food Price Index",
        "Trend",
        "TCMB Faiz Orani",
        "Kredi Karti Borcu",
    ]
    + tcmb_lst
    + trend_lst,
]

lm = sm.OLS(endog=a["Interest(2-14 days)"], exog=independent_set_3)
result = lm.fit()
print(result.summary())

In [None]:
# Model after dropping the columns which are not significant: "Unemployement",
# "Kredi Karti Borcu" and "Trend_5" are left out. The kept local trends are
# listed explicitly; appending trend_lst as well would add Trend_2..Trend_4 a
# second time (duplicate, perfectly collinear columns) and bring Trend_5 back.
independent_set_4 = a[["const", "Faiz_Orani", "Food Price Index", "Trend", "TCMB Faiz Orani", "Trend_2", "Trend_3", "Trend_4"] + tcmb_lst]

lm = sm.OLS(a["Interest(2-14 days)"], independent_set_4)
result = lm.fit()
print(result.summary())

In [None]:
# Compare all of the models by their average train/test mean squared error
# over repeated random 80/20 splits.
independent_sets = [independent_set_1, independent_set_2, independent_set_3, independent_set_4]
results_test = []
results_train = []
n_repeats = 1000  # many repetitions to get a more stable average

# The loop variable is not called `set`, which would shadow the builtin for the
# rest of the kernel session.
for design_matrix in independent_sets:
    train = []
    test = []
    for split_seed in range(n_repeats):
        # A distinct, fixed seed per repetition keeps the comparison reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            design_matrix, a["Interest(2-14 days)"], test_size=0.2, random_state=split_seed
        )
        lm = sm.OLS(y_train, X_train)
        result = lm.fit()
        # MSE on the training rows and on the held-out rows
        train.append(np.mean(result.resid ** 2))
        test.append(np.mean((result.predict(X_test) - y_test) ** 2))

    results_train.append(np.mean(train))
    results_test.append(np.mean(test))

# x-axis: model number, derived from the number of models compared.
model_numbers = range(1, len(independent_sets) + 1)

plt.figure(figsize=(10, 5))
plt.plot(model_numbers, results_train, label="Train")
plt.plot(model_numbers, results_test, label="Test")
plt.legend(loc="best")
plt.show()

In [None]:
# Model 4 is taken as the best set for the interest rate.
best_set = independent_set_4.copy()

# Refit the chosen model on all rows and show its summary.
lm = sm.OLS(a["Interest(2-14 days)"], best_set)
result = lm.fit()
print(result.summary())

In [None]:
# Overlay the model's fitted values on the observed interest rate.
fig, ax = plt.subplots(figsize=(13, 8))
ax.plot(a["Trend"], a["Interest(2-14 days)"], label="Real Values", color="blue")
ax.plot(a["Trend"], result.fittedvalues, label="Fitted Values", color="red")
ax.legend(loc="best")
plt.show()

In [None]:
# Kernel density estimate of the residuals, as a visual normality check.
result.resid.plot(kind="kde", title="Residuals", figsize=(10, 5))

In [None]:
# Residuals of the current model against the trend index.
residual = result.resid

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(df["Trend"], residual, label="Residual", color="blue", s=10)
ax.set_title("Residuals of the Model")

plt.show()

In [None]:
# Autocorrelation of the residuals at every possible lag, drawn as markers.
plt.figure(figsize=(10, 6))  
plt.acorr(residual, maxlags=len(residual)-1, usevlines = False, marker='o')
# Reference band at +/-0.125.
# NOTE(review): the 0.125 threshold is hard-coded; confirm how it was derived.
plt.axhline(y=0.125, color='red', linestyle='--') 
plt.axhline(y=-0.125, color='red', linestyle='--') 
plt.title("Autocorrelation of Residuals")
# Show only the non-negative lags.
plt.xlim(0, )
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
plt.show()