# Multiple Regression for independent samples, normal and homoscedastic

### First - Import the libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import bartlett
import seaborn as sns
import matplotlib.pyplot as plt
from   sklearn.linear_model import LinearRegression
from   sklearn.metrics import r2_score
import statsmodels.api as sm
import os

### Second - import the data

In [2]:
df = pd.read_csv('data_Normal_homo.csv')
print (df)

       A    B    C    D    E
0    177  238  291  476  488
1    160  247  303  438  459
2    190  232  314  347  517
3    165  220  391  367  435
4     96  226  344  398  548
..   ...  ...  ...  ...  ...
195   94  277  285  339  444
196  227  231  411  331  557
197  149  197  414  460  504
198  152  215  430  421  436
199  194  221  355  360  510

[200 rows x 5 columns]


### Third - Test the Normality of samples

In [3]:
a = df['A']
b = df['B']
c = df['C']
d = df['D']
e = df['E']

### Column A

In [4]:
average_a = np.mean(a)
print(average_a)
standard_deviation_a = np.std(a-1)
print(standard_deviation_a)

158.905
42.931410121262026


In [5]:
test_value_a, p_value_a = stats.kstest(a, cdf='norm', args=(average_a, standard_deviation_a), N=len(a))
print(test_value_a)
print(p_value_a)

0.08334989263729675
0.11732711556732545


In [6]:
if p_value_a > 0.05:
    print("The test presented with 95% confidence that the data behaved like a normal distribution.")
else:
    print("The test presented with 95% confidence that the data did not behave like a normal distribution.")

The test presented with 95% confidence that the data behaved like a normal distribution.


### Column B

In [7]:
average_b = np.mean(b)
print(average_b)
standard_deviation_b = np.std(b-1)
print(standard_deviation_b)

252.38
43.26032362338498


In [8]:
test_value_b, p_value_b = stats.kstest(b, cdf='norm', args=(average_b, standard_deviation_b), N=len(b))
print(test_value_b)
print(p_value_b)

0.07902472970402907
0.15587170398301164


In [9]:
if p_value_b > 0.05:
    print("The test presented with 95% confidence that the data behaved like a normal distribution.")
else:
    print("The test presented with 95% confidence that the data did not behave like a normal distribution.")

The test presented with 95% confidence that the data behaved like a normal distribution.


### Column C

In [10]:
average_c = np.mean(c)
print(average_c)
standard_deviation_c = np.std(c-1)
print(standard_deviation_c)

351.995
43.71458538062553


In [11]:
test_value_c, p_value_c = stats.kstest(c, cdf='norm', args=(average_c, standard_deviation_c), N=len(c))
print(test_value_c)
print(p_value_c)

0.08404504315390282
0.1119313806983605


In [12]:
if p_value_c > 0.05:
    print("The test presented with 95% confidence that the data behaved like a normal distribution.")
else:
    print("The test presented with 95% confidence that the data did not behave like a normal distribution.")

The test presented with 95% confidence that the data behaved like a normal distribution.


### Column D

In [13]:
average_d = np.mean(d)
print(average_d)
standard_deviation_d = np.std(d-1)
print(standard_deviation_d)

402.585
43.83951157346532


In [14]:
test_value_d, p_value_d = stats.kstest(d, cdf='norm', args=(average_d, standard_deviation_d), N=len(d))
print(test_value_d)
print(p_value_d)

0.08438179645507876
0.10939198635063863


In [15]:
if p_value_d > 0.05:
    print("The test presented with 95% confidence that the data behaved like a normal distribution.")
else:
    print("The test presented with 95% confidence that the data did not behave like a normal distribution.")

The test presented with 95% confidence that the data behaved like a normal distribution.


### Column E

In [16]:
average_e = np.mean(e)
print(average_e)
standard_deviation_e = np.std(e-1)
print(standard_deviation_e)

494.89
45.730273342721226


In [17]:
test_value_e, p_value_e = stats.kstest(e, cdf='norm', args=(average_e, standard_deviation_e), N=len(e))
print(test_value_e)
print(p_value_e)

0.08720806603206421
0.08990084345256166


In [18]:
if p_value_e > 0.05:
    print("The test presented with 95% confidence that the data behaved like a normal distribution.")
else:
    print("The test presented with 95% confidence that the data did not behave like a normal distribution.")

The test presented with 95% confidence that the data behaved like a normal distribution.


### Fourth - Do the Levene test to Homoscedasticity

In [19]:
res = stats.levene(a, b, c, d, e)
print ("The test value Wilcoxon to samples related = " + str(res.statistic))
print ("The p_value Wilcoxon to samples related = " + str(res.pvalue))

The test value Wilcoxon to samples related = 0.23880235324308738
The p_value Wilcoxon to samples related = 0.9164357490741767


In [20]:
if res.pvalue > 0.05:
    print("the samples present a homogeneous distribution in their variances.")
else:
    print("the samples do not present a homogeneous distribution in their variances.")

the samples present a homogeneous distribution in their variances.


### Fifth - Simple Regression 

In [23]:
X = df.drop(['A'], axis=1)
y = df['A'].values.reshape(-1,1)


reg = LinearRegression()
reg.fit(X, y)


print("Influence of the other variables on the first: A = {:.5} + {:.5}*B + {:.5}*C + {:.5}*D + {:.5}*E".format(reg.intercept_[0], reg.coef_[0][0], reg.coef_[0][1], reg.coef_[0][2], reg.coef_[0][3]))

influence of the other variables on the first: A = 143.34 + -0.05167*B + -0.035858*C + 0.14305*D + -0.033071*E


### Evaluating the accuracy of the model

In [24]:
X = np.column_stack((df['B'], df['C'], df['D'], df['E']))
y = df['A']


X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()


print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      A   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1.489
Date:                Fri, 19 Aug 2022   Prob (F-statistic):              0.207
Time:                        17:40:18   Log-Likelihood:                -1032.7
No. Observations:                 200   AIC:                             2075.
Df Residuals:                     195   BIC:                             2092.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        143.3442     57.891      2.476      0.0