In [57]:
pip install dmba



In [58]:
import math
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import statsmodels.formula.api as sm
from statsmodels.tsa import tsatools, stattools
from statsmodels.graphics import tsaplots
from sklearn.metrics import mean_squared_error, mean_absolute_error
import dmba

In [59]:
# Read Kaggle_SearchTerm.csv into a DataFrame and list the columns it provides.
kagSearch_df = pd.read_csv(filepath_or_buffer="Kaggle_SearchTerm.csv")
kagSearch_df.columns

Index(['Unnamed: 0', 'week', 'analytics', 'api', 'artificial intelligence',
       'big data', 'clustering', 'data mining', 'data science',
       'data scientist', 'data warehouse', 'deep learning', 'etl', 'excel',
       'github', 'hadoop', 'iot', 'java', 'machine learning', 'matlab',
       'minitab', 'modeling', 'python', 'R', 'regression', 'sql',
       'statistician'],
      dtype='object')

In [60]:
# Preview the first five rows to sanity-check the parsed columns.
kagSearch_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,week,analytics,api,artificial intelligence,big data,clustering,data mining,data science,data scientist,...,java,machine learning,matlab,minitab,modeling,python,R,regression,sql,statistician
0,1,2014-12-21,63,50,23,53,54,68,11,17,...,80,14,45,40,74,29,73,30,62,42
1,2,2014-12-28,59,52,22,54,46,60,13,17,...,77,13,40,37,77,28,59,37,53,56
2,3,2015-01-04,90,55,28,76,71,88,18,24,...,85,18,62,64,96,35,59,44,87,62
3,4,2015-01-11,94,57,33,76,73,88,20,25,...,86,21,71,67,98,37,60,47,94,70
4,5,2015-01-18,93,58,33,78,72,91,19,22,...,92,19,77,80,100,38,61,47,96,64


In [61]:
# Swap spaces for underscores in every column label so each column can be
# referenced as an attribute and inside model formulas.
kagSearch_df.columns = [label.replace(' ', '_') for label in kagSearch_df.columns]
kagSearch_df.columns

Index(['Unnamed:_0', 'week', 'analytics', 'api', 'artificial_intelligence',
       'big_data', 'clustering', 'data_mining', 'data_science',
       'data_scientist', 'data_warehouse', 'deep_learning', 'etl', 'excel',
       'github', 'hadoop', 'iot', 'java', 'machine_learning', 'matlab',
       'minitab', 'modeling', 'python', 'R', 'regression', 'sql',
       'statistician'],
      dtype='object')

In [62]:
# Parse the 'week' column into pandas datetime values so it can later serve
# as a time index.
kagSearch_df['week'] = kagSearch_df['week'].pipe(pd.to_datetime)


In [63]:
# Build a time series from the 'artificial_intelligence' column, indexed by
# week; `name` labels the series.
kagSearch_ts = pd.Series(
    data=kagSearch_df['artificial_intelligence'].values,
    index=kagSearch_df['week'],
    name='artificial_intelligence',
)

In [64]:
# Append a linear time-trend column to the series. trend='t' adds only the
# trend term (1, 2, 3, ...); no constant column is added here.
# NOTE: this rebinds kagSearch_df from the raw search-term table to the
# two-column (artificial_intelligence, trend) frame used for modelling.
kagSearch_df = tsatools.add_trend(x=kagSearch_ts, trend='t')

In [65]:
# Inspect the first ten rows to confirm the trend counter was appended.
kagSearch_df.head(n=10)

Unnamed: 0_level_0,artificial_intelligence,trend
week,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-12-21,23,1.0
2014-12-28,22,2.0
2015-01-04,28,3.0
2015-01-11,33,4.0
2015-01-18,33,5.0
2015-01-25,31,6.0
2015-02-01,33,7.0
2015-02-08,32,8.0
2015-02-15,34,9.0
2015-02-22,34,10.0


In [66]:
# Hold out the final 78 observations for validation; everything before them
# is used for training.
nValid = 78
nTrain = kagSearch_ts.shape[0] - nValid

In [67]:
# Chronological split: the first nTrain rows train the models, the remaining
# rows validate them.
train_df = kagSearch_df.iloc[:nTrain]
valid_df = kagSearch_df.iloc[nTrain:]

In [68]:
# Regress the series on the linear trend using only the training rows, then
# show the full regression summary.
kagSearch_lm = (
    sm.ols('artificial_intelligence ~ trend', data=train_df)
    .fit()
)
print(kagSearch_lm.summary())

# Interpretation: a p-value below 0.05 for trend means the linear trend is a
# statistically significant predictor of artificial_intelligence.

                               OLS Regression Results                              
Dep. Variable:     artificial_intelligence   R-squared:                       0.766
Model:                                 OLS   Adj. R-squared:                  0.765
Method:                      Least Squares   F-statistic:                     593.4
Date:                     Thu, 03 Oct 2024   Prob (F-statistic):           4.97e-59
Time:                             00:31:45   Log-Likelihood:                -645.95
No. Observations:                      183   AIC:                             1296.
Df Residuals:                          181   BIC:                             1302.
Df Model:                                1                                         
Covariance Type:                 nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
In

In [69]:
# a) Linear-trend model: RMSE of its forecasts over the validation period
predict_df = kagSearch_lm.predict(exog=valid_df)
mse = mean_squared_error(
    y_true=valid_df['artificial_intelligence'],
    y_pred=predict_df,
)
rmse = np.sqrt(mse)
print(rmse)

12.443989681580987


In [70]:
# b) Exponential-trend model: regress log(artificial_intelligence) on trend,
#    then exponentiate the forecasts to return to the original scale.
kagSearch_lm_log = sm.ols(formula='np.log(artificial_intelligence) ~ trend', data=train_df).fit()

# A bare `.summary()` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is displayed), so print it explicitly.
print(kagSearch_lm_log.summary())

# RMSE over the validation period, measured on the original (un-logged) scale
predict_df_expo = np.exp(kagSearch_lm_log.predict(valid_df))
mse = mean_squared_error(valid_df['artificial_intelligence'], predict_df_expo)
rmse = np.sqrt(mse)
print(rmse)

24.368240001795662


In [71]:
# c) Polynomial (quadratic) trend model: regress the series on trend and
#    trend squared.
kagSearch_lm_poly = sm.ols(formula='artificial_intelligence ~ trend + np.square(trend)', data=train_df).fit()

# A bare `.summary()` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is displayed), so print it explicitly.
print(kagSearch_lm_poly.summary())

# RMSE of the forecasts over the validation period
predict_df_poly = kagSearch_lm_poly.predict(valid_df)
mse = mean_squared_error(valid_df['artificial_intelligence'], predict_df_poly)
rmse = np.sqrt(mse)
print(rmse)

39.387123094367574


In [72]:
# d) Seasonal model: calendar month as a categorical predictor (no trend)
kagSearch_df['Month'] = kagSearch_df.index.month

# Re-partition the data so both subsets include the new Month column
train_df = kagSearch_df[:nTrain]
valid_df = kagSearch_df[nTrain:]

kagSearch_lm_season = sm.ols(formula='artificial_intelligence ~ C(Month)', data=train_df).fit()

# A bare `.summary()` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is displayed), so print it explicitly.
print(kagSearch_lm_season.summary())

# RMSE of the forecasts over the validation period
predict_df_season = kagSearch_lm_season.predict(valid_df)
mse = mean_squared_error(valid_df['artificial_intelligence'], predict_df_season)
rmse = np.sqrt(mse)
print(rmse)

27.44233036018805


In [73]:
# e) Seasonal model with linear trend: month dummies plus the trend term.
# NOTE: this rebinds kagSearch_lm_season / predict_df_season, replacing the
# month-only model fitted under the same names.

# Recomputing Month and the split is idempotent; it lets this cell run even
# if the month-only model cell was skipped.
kagSearch_df['Month'] = kagSearch_df.index.month

# partition the data
train_df = kagSearch_df[:nTrain]
valid_df = kagSearch_df[nTrain:]

kagSearch_lm_season = sm.ols(formula='artificial_intelligence ~ C(Month)+ trend', data=train_df).fit()

# A bare `.summary()` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is displayed), so print it explicitly.
print(kagSearch_lm_season.summary())

# RMSE of the forecasts over the validation period
predict_df_season = kagSearch_lm_season.predict(valid_df)
mse = mean_squared_error(valid_df['artificial_intelligence'], predict_df_season)
rmse = np.sqrt(mse)
print(rmse)


11.390964445586492


In [74]:
# f) Exponential model with month and trend: regress
#    log(artificial_intelligence) on month dummies plus trend.
# Requires train_df / valid_df to already contain the Month column.
# NOTE: this rebinds kagSearch_lm_log / predict_df_expo, replacing the
# trend-only exponential model fitted under the same names.
kagSearch_lm_log = sm.ols(formula='np.log(artificial_intelligence) ~ C(Month)+ trend', data=train_df).fit()

# A bare `.summary()` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is displayed), so print it explicitly.
print(kagSearch_lm_log.summary())

# Exponentiate the forecasts so the RMSE is measured on the original scale
predict_df_expo = np.exp(kagSearch_lm_log.predict(valid_df))
mse = mean_squared_error(valid_df['artificial_intelligence'], predict_df_expo)
rmse = np.sqrt(mse)
print(rmse)


23.57762014738725


In [75]:
# h) Polynomial model with month and trend: month dummies plus trend and
#    trend squared.
# Requires train_df / valid_df to already contain the Month column.
# NOTE: this rebinds kagSearch_lm_poly / predict_df_poly, replacing the
# trend-only polynomial model fitted under the same names.
kagSearch_lm_poly = sm.ols(formula='artificial_intelligence ~ C(Month)+trend + np.square(trend)', data=train_df).fit()

# A bare `.summary()` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is displayed), so print it explicitly.
print(kagSearch_lm_poly.summary())

# RMSE of the forecasts over the validation period
predict_df_poly = kagSearch_lm_poly.predict(valid_df)
mse = mean_squared_error(valid_df['artificial_intelligence'], predict_df_poly)
rmse = np.sqrt(mse)
print(rmse)


39.38752907515205
