In [1]:
import pandas as pd
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf,plot_predict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.api import SimpleExpSmoothing
from prophet import Prophet

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [2]:
tiobe_df = pd.read_csv('tiobeindex.csv')
wiki_df = pd.read_csv('wiki.csv')
gtrend_df = pd.read_csv('googletrends.csv')
stack_df = pd.read_csv('stackOverflow.csv')


In [3]:
languages = list(tiobe_df.columns)
languages.remove('Month')
df_list = [tiobe_df, wiki_df, gtrend_df, stack_df]
df_dict = {0:'tiobe', 1:'wikipedia', 2:'gtrend', 3:'stackOverflow'}

di = {}
di['language'] = languages

In [4]:
#ARIMA MODEL

for i, df_all in enumerate(df_list): 
    di['arima_mae_' + df_dict[i]] = []
    di['arima_mse_' + df_dict[i]] = []
    di['arima_rmse_' + df_dict[i]] = []

    for language in languages:
        if language in df_all.columns:
            df = df_all[language]

            splits = 5 if i==3 else 10
            tscv = TimeSeriesSplit(n_splits=splits)
            
            fold_mae, fold_mse, fold_rmse = [],[],[]

            for train_index, test_index in tscv.split(df):
                train, test = df.iloc[train_index], df.iloc[test_index]
                
                model = pm.auto_arima(train, seasonal=False, stepwise=True, trace=False)
                p, d, q = model.order
                
                arima_model = ARIMA(train, order=(p, d, q))
                model = arima_model.fit()

                forecast = model.forecast(steps=len(test))

                mae = mean_absolute_error(test, forecast)
                mse = mean_squared_error(test, forecast)
                rmse = np.sqrt(mse)

                fold_mae.append(mae)
                fold_mse.append(mse)
                fold_rmse.append(rmse)
        
        else:
            fold_mae, fold_mse, fold_rmse = [-1],[-1],[-1]

        avg_mae = np.mean(fold_mae)
        avg_mse = np.mean(fold_mse)
        avg_rmse = np.mean(fold_rmse)

        di['arima_mae_'+ df_dict[i]].append(avg_mae)
        di['arima_mse_' + df_dict[i]].append(avg_mse)
        di['arima_rmse_' + df_dict[i]].append(avg_rmse)

# df_res = pd.DataFrame.from_dict(di)
# df_res

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  w

In [5]:
#SIMPLE EXPONENTIAL MODEL

for i, df_all in enumerate(df_list): 
    di['ses_mae_' + df_dict[i]] = []
    di['ses_mse_' + df_dict[i]] = []
    di['ses_rmse_' + df_dict[i]] = []

    for language in languages:
        if language in df_all.columns:
            df1 = df_all[language]
            mae_list, mse_list, rmse_list = [], [], []

            splits = 5 if i==3 else 10
            tscv = TimeSeriesSplit(n_splits=splits)
            
            for train_index, test_index in tscv.split(df1):
                train, test = df1[train_index], df1[test_index]
                
                ses = SimpleExpSmoothing(train)
                model = ses.fit(optimized=True)
                
                forecast = model.forecast(steps=len(test))
                
                mae = mean_absolute_error(test, forecast)
                mse = mean_squared_error(test, forecast)
                rmse = np.sqrt(mse)
                
                mae_list.append(mae)
                mse_list.append(mse)
                rmse_list.append(rmse)
        
        else:
            mae_list, mse_list, rmse_list = [-1],[-1],[-1]

        avg_mae = np.mean(mae_list)
        avg_mse = np.mean(mse_list)
        avg_rmse = np.mean(rmse_list)
        
        di['ses_mae_'+ df_dict[i]].append(avg_mae)
        di['ses_mse_' + df_dict[i]].append(avg_mse)
        di['ses_rmse_' + df_dict[i]].append(avg_rmse)


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

In [6]:
# HOLT-WINTERS MODEL

for i, df_all in enumerate(df_list): 
    di['holt_mae_' + df_dict[i]] = []
    di['holt_mse_' + df_dict[i]] = []
    di['holt_rmse_' + df_dict[i]] = []

    for language in languages:
        if language in df_all.columns:
            df1 = df_all[language]
            mae_list, mse_list, rmse_list = [], [], []

            splits = 5 if i==3 else 10
            tscv = TimeSeriesSplit(n_splits=splits)
            
            for train_index, test_index in tscv.split(df1):
                train, test = df1[train_index], df1[test_index]
                
                holt_model = ExponentialSmoothing(train, trend="additive", seasonal=None)
                model = holt_model.fit(optimized=True)
                
                forecast = model.forecast(steps=len(test))
                
                mae = mean_absolute_error(test, forecast)
                mse = mean_squared_error(test, forecast)
                rmse = np.sqrt(mse)
                
                mae_list.append(mae)
                mse_list.append(mse)
                rmse_list.append(rmse)
        
        else:
            mae_list, mse_list, rmse_list = [-1],[-1],[-1]

        avg_mae = np.mean(mae_list)
        avg_mse = np.mean(mse_list)
        avg_rmse = np.mean(rmse_list)
        
        di['holt_mae_'+ df_dict[i]].append(avg_mae)
        di['holt_mse_' + df_dict[i]].append(avg_mse)
        di['holt_rmse_' + df_dict[i]].append(avg_rmse)


  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.nobs) + k * 2
  bic = self.nobs * np.log(sse / self.nobs) + k * np.log(self.nobs)
  aic = self.nobs * np.log(sse / self.no

In [8]:
# PROPHET MODEL

for i, df_all in enumerate(df_list): 
    di['prophet_mae_' + df_dict[i]] = []
    di['prophet_mse_' + df_dict[i]] = []
    di['prophet_rmse_' + df_dict[i]] = []

    for language in languages:
        if language in df_all.columns:
            tm = 'year' if i==3 else 'Month'
            df1 = df_all[[tm, language]]
            mae_list, mse_list, rmse_list = [], [], []

            splits = 5 if i==3 else 10
            tscv = TimeSeriesSplit(n_splits=splits)
            
            for train_index, test_index in tscv.split(df1):
                train, test = df1.iloc[train_index], df1.iloc[test_index]

                prophet_df = train.reset_index(drop=True)
                prophet_df.columns = ['ds', 'y']

                prophet_model = Prophet()
                prophet_model.fit(prophet_df)

                future = prophet_model.make_future_dataframe(periods=len(test))
                forecast = prophet_model.predict(future)

                test_df = test.reset_index(drop=True)
                test_df.columns = ['ds', 'y']

                forecast_test = forecast[-len(test):]

                mae = mean_absolute_error(test_df['y'], forecast_test['yhat'])
                mse = mean_squared_error(test_df['y'], forecast_test['yhat'])
                rmse = np.sqrt(mse)

                mae_list.append(mae)
                mse_list.append(mse)
                rmse_list.append(rmse)
        
        else:
            mae_list, mse_list, rmse_list = [-1],[-1],[-1]

        avg_mae = np.mean(mae_list)
        avg_mse = np.mean(mse_list)
        avg_rmse = np.mean(rmse_list)
        
        di['prophet_mae_'+ df_dict[i]].append(avg_mae)
        di['prophet_mse_' + df_dict[i]].append(avg_mse)
        di['prophet_rmse_' + df_dict[i]].append(avg_rmse)




17:56:46 - cmdstanpy - INFO - Chain [1] start processing
17:56:47 - cmdstanpy - INFO - Chain [1] done processing
17:56:47 - cmdstanpy - INFO - Chain [1] start processing
17:56:47 - cmdstanpy - INFO - Chain [1] done processing
17:56:47 - cmdstanpy - INFO - Chain [1] start processing
17:56:48 - cmdstanpy - INFO - Chain [1] done processing
17:56:48 - cmdstanpy - INFO - Chain [1] start processing
17:56:48 - cmdstanpy - INFO - Chain [1] done processing
17:56:48 - cmdstanpy - INFO - Chain [1] start processing
17:56:48 - cmdstanpy - INFO - Chain [1] done processing
17:56:48 - cmdstanpy - INFO - Chain [1] start processing
17:56:48 - cmdstanpy - INFO - Chain [1] done processing
17:56:48 - cmdstanpy - INFO - Chain [1] start processing
17:56:48 - cmdstanpy - INFO - Chain [1] done processing
17:56:48 - cmdstanpy - INFO - Chain [1] start processing
17:56:49 - cmdstanpy - INFO - Chain [1] done processing
17:56:49 - cmdstanpy - INFO - Chain [1] start processing
17:56:49 - cmdstanpy - INFO - Chain [1]

In [11]:
df_res = pd.DataFrame.from_dict(di)
df_res.to_csv('result_timeseries.csv', index=False)
df_res.head()

Unnamed: 0,language,arima_mae_tiobe,arima_mse_tiobe,arima_rmse_tiobe,arima_mae_wikipedia,arima_mse_wikipedia,arima_rmse_wikipedia,arima_mae_gtrend,arima_mse_gtrend,arima_rmse_gtrend,...,prophet_rmse_tiobe,prophet_mae_wikipedia,prophet_mse_wikipedia,prophet_rmse_wikipedia,prophet_mae_gtrend,prophet_mse_gtrend,prophet_rmse_gtrend,prophet_mae_stackOverflow,prophet_mse_stackOverflow,prophet_rmse_stackOverflow
0,Python,1.405189,3.865813,1.672752,3519.904693,35013910.0,4103.946983,9.025607,171.094348,11.188161,...,2.988886,5771.58699,110358500.0,6206.900651,18.187554,698.94578,20.639671,3552.61842,12807380.0,3552.61842
1,C++,1.181518,2.67855,1.459072,2613.776839,11911870.0,3123.140458,13.382721,253.50079,15.801334,...,6.705171,6023.484988,75045750.0,6905.593804,25.381401,1324.071108,29.850327,1746.713,3224397.0,1746.713
2,Java,2.23388,8.844348,2.547528,1021.372259,1652031.0,1217.947891,5.863698,76.668768,6.503008,...,4.960124,2503.796099,12370800.0,2868.655406,21.07563,880.992215,23.213069,2867.029145,9043797.0,2867.029145
3,C,1.77918,5.55118,2.100857,638.768926,999271.2,806.766976,5.415315,52.28501,6.66218,...,5.331852,1198.145757,2886672.0,1401.697157,12.666508,420.497425,14.371869,1160.176085,1408041.0,1160.176085
4,C#,0.9462,1.515407,1.118268,74.926822,24084.63,101.447311,7.632348,89.561829,8.693126,...,2.766598,123.28261,48073.71,150.217772,35.905661,3015.845638,40.330425,1631.225811,2918202.0,1631.225811
