## Performing Linear Regression (OLS)

In [2]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
%matplotlib inline

# Seed NumPy's global random generator so the np.random.normal draws below are repeatable.
np.random.seed(33)

#### OLS estimation with artificial data

In [3]:
# Simulate data from a quadratic model: y = 1 + 0.1*x + 10*x^2 + noise.
n_sample = 1000

# Evenly spaced regressor on [0, 10]; the grid size follows n_sample so that
# X and the noise vector always have matching lengths.
X = np.linspace(0, 10, n_sample)
X = np.column_stack((X, X**2))   # columns: x, x^2 (no intercept column yet)

# True coefficients in the order [intercept, x, x^2]; the intercept column
# itself is added to X later via sm.add_constant.
beta = np.array([1, 0.1, 10])

# Standard-normal disturbance, one draw per observation.
e = np.random.normal(size=n_sample)

In [4]:
# X only holds x and x^2 so far; the model also needs an intercept,
# so add a column of ones to the design matrix.
X = sm.add_constant(X)

# Simulated response: linear predictor plus the noise vector e.
y = X.dot(beta) + e
y.shape


(1000,)

### Fit and summary

In [5]:
# Ordinary least squares of y on the design matrix X (intercept column included).
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Text report: coefficients, standard errors and goodness-of-fit statistics.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 4.768e+07
Date:                Sun, 15 Oct 2023   Prob (F-statistic):               0.00
Time:                        18:49:51   Log-Likelihood:                -1384.3
No. Observations:                1000   AIC:                             2775.
Df Residuals:                     997   BIC:                             2789.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9326      0.092     10.182      0.0

In [6]:
# Exploration aid: list every attribute and method exposed by the fitted results object.
dir(results)

['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abat_diagonal',
 '_cache',
 '_data_attr',
 '_data_in_cache',
 '_get_robustcov_results',
 '_get_wald_nonlinear',
 '_is_nested',
 '_transform_predict_exog',
 '_use_t',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'diagn',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 '

In [7]:
# Report the estimated coefficients and the coefficient of determination.
for label, value in (('Parameters: ', results.params), ('R2: ', results.rsquared)):
    print(label, value)

Parameters:  [0.93259525 0.12386822 9.99820614]
R2:  0.9999895450933209


### OLS: non-linear curve, but linear in the parameters

In [12]:
# Simulation settings for a curve that is non-linear in x but linear in the coefficients.
nsample = 50000
sig = 0.5   # scale factor applied to the noise when y is generated

x = np.linspace(0, 11, nsample)

# Design matrix columns: x, sin(x), (x - 5)^2 and a constant term (last column).
regressors = [x, np.sin(x), (x - 5) ** 2, np.ones(nsample)]
X = np.column_stack(regressors)

# True coefficients, in the same column order as X.
beta = np.array([0.22, 0.3, 0.43, 5.0])
y_true = X.dot(beta)

In [13]:
# Peek at the noise-free response; the displayed repr is abbreviated for an array this long.
y_true

array([15.75      , 15.7491684 , 15.74833685, ..., 22.59763522,
       22.59881905, 22.60000294])

In [16]:
# Observed response: the true signal plus standard-normal noise scaled by sig.
noise = sig * np.random.normal(size=nsample)
y = y_true + noise

In [18]:
# IPython help lookup (trailing `?`) for np.random.normal; an exploration aid, not part of the analysis.
np.random.normal?


Docstring:
normal(loc=0.0, scale=1.0, size=None)

Draw random samples from a normal (Gaussian) distribution.

The probability density function of the normal distribution, first
derived by De Moivre and 200 years later by both Gauss and Laplace
independently [2]_, is often called the bell curve because of
its characteristic shape (see the example below).

The normal distributions occurs often in nature.  For example, it
describes the commonly occurring distribution of samples influenced
by a large number of tiny, random disturbances, each with its own
unique distribution [2]_.

.. note::
    New code should use the `~numpy.random.Generator.normal`
    method of a `~numpy.random.Generator` instance instead;
    please see the :ref:`random-quick-start`.

Parameters
----------
loc : float or array_like of floats
    Mean ("centre") of the distribution.
scale : float or array_like of floats
    Standard deviation (spread or "width") of the distribution. Must be
    non-negative.


In [20]:
# Fit OLS on the noisy response using the design matrix [x, sin(x), (x-5)^2, 1].
nonlinear_model = sm.OLS(y, X)
res = nonlinear_model.fit()

print(res.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.987
Model:                            OLS   Adj. R-squared:                  0.987
Method:                 Least Squares   F-statistic:                 1.274e+06
Date:                Sun, 15 Oct 2023   Prob (F-statistic):               0.00
Time:                        18:54:44   Log-Likelihood:                -36492.
No. Observations:               50000   AIC:                         7.299e+04
Df Residuals:                   49996   BIC:                         7.303e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.2196      0.001    280.866      0.0

In [23]:
# Alternative summary layout for the same fit; as the cell's last expression it is
# shown through the notebook's display machinery rather than print().
res.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.987
Dependent Variable:,y,AIC:,72991.1292
Date:,2023-10-15 18:55,BIC:,73026.4083
No. Observations:,50000,Log-Likelihood:,-36492.0
Df Model:,3,F-statistic:,1274000.0
Df Residuals:,49996,Prob (F-statistic):,0.0
R-squared:,0.987,Scale:,0.25205

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
x1,0.2196,0.0008,280.8662,0.0000,0.2181,0.2212
x2,0.2957,0.0033,88.5538,0.0000,0.2892,0.3023
x3,0.4299,0.0003,1717.3482,0.0000,0.4294,0.4304
const,5.0060,0.0048,1039.6309,0.0000,4.9966,5.0154

0,1,2,3
Omnibus:,0.789,Durbin-Watson:,1.994
Prob(Omnibus):,0.674,Jarque-Bera (JB):,0.802
Skew:,0.002,Prob(JB):,0.67
Kurtosis:,2.981,Condition No.:,33.0
