In [18]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [19]:
# Build a small synthetic dataset: Y = 2*X + 3*Z + Gaussian noise.
# A seeded generator is used so that Restart & Run All reproduces the same
# sample (and therefore the same statistics and model fits) every time.
rng = np.random.default_rng(42)

num_new_points = 15
X = rng.uniform(0, 6, num_new_points).reshape(-1, 1)  # X values drawn uniformly from [0, 6), as a column vector
Z = rng.uniform(0, 6, num_new_points).reshape(-1, 1)  # Z values drawn uniformly from [0, 6), as a column vector
noise = rng.normal(0, 1, num_new_points)  # standard-normal noise, one value per point
Y = 2 * X.flatten() + 3 * Z.flatten() + noise

# One row per point; X and Z are flattened back to 1-D for the frame.
df = pd.DataFrame({'X': X.flatten(), 'Z': Z.flatten(), 'Y': Y})

In [20]:
# Show the synthetic frame (columns X, Z, Y) as this cell's output.
df

Unnamed: 0,X,Z,Y
0,1.51285,2.515056,10.189911
1,2.757271,2.549653,13.489698
2,1.682384,2.655581,11.206149
3,5.095929,5.277357,25.281034
4,3.261318,0.267776,8.238199
5,5.281695,2.23731,15.463476
6,2.203893,4.670366,19.44598
7,3.971799,4.021891,17.781496
8,2.732847,4.948777,21.089824
9,1.945122,4.853818,18.550929


In [21]:
# Add the ratio X / Y as a `division` column and keep only the rows where
# that ratio exceeds the threshold. Built as a chain that returns a new frame,
# so the frame object displayed by the earlier cell is not modified in place.
MIN_DIVISION = 0.1  # exclusive lower bound on X / Y for a row to be kept

df = (
    df
    .assign(division=lambda frame: frame['X'] / frame['Y'])
    .loc[lambda frame: frame['division'] > MIN_DIVISION]
)
df

Unnamed: 0,X,Z,Y,division
0,1.51285,2.515056,10.189911,0.148465
1,2.757271,2.549653,13.489698,0.204398
2,1.682384,2.655581,11.206149,0.15013
3,5.095929,5.277357,25.281034,0.201571
4,3.261318,0.267776,8.238199,0.395878
5,5.281695,2.23731,15.463476,0.341559
6,2.203893,4.670366,19.44598,0.113334
7,3.971799,4.021891,17.781496,0.223367
8,2.732847,4.948777,21.089824,0.129581
9,1.945122,4.853818,18.550929,0.104853


In [22]:
# Arithmetic mean of the X / Y ratio over the retained rows.
df['division'].mean()

0.21247213934187034

In [23]:
# Smallest X / Y ratio. Uses the pandas reduction rather than the Python
# builtin `min`, which walks the Series element by element and is not
# NaN-aware; this also matches the other pandas reductions in this notebook.
df['division'].min()

0.10150099716600886

In [24]:
# Smallest X / Y ratio among the retained rows (missing values are skipped,
# which is the pandas default made explicit here).
df['division'].min(skipna=True)

0.10150099716600886

In [25]:
# Sample standard deviation of the X / Y ratio (ddof=1 is the pandas default,
# spelled out so the n-1 normalisation is visible).
df['division'].std(ddof=1)

0.11219469355498997

In [26]:
# Sample variance of the X / Y ratio (ddof=1 is the pandas default,
# spelled out so the n-1 normalisation is visible).
df['division'].var(ddof=1)

0.012587649261898107

In [27]:
# Pearson correlation between X and Y; only the coefficient (first element of
# the result) is kept, the p-value is not used.
correlation = pearsonr(df['X'], df['Y'])[0]
print(f"Correlation coefficient: {correlation}")

Correlation coefficient: 0.5328682490570033


In [28]:
# Regression analysis: ordinary least squares of Y on X with an intercept
# (Z is not included in this model).
# Dedicated names are used so this cell does not rebind `X` (the raw array
# from the data-generation cell) or `model` (the scikit-learn estimator
# created in a later cell) to objects of a different kind.
ols_exog = sm.add_constant(df['X'])  # prepend the constant (intercept) column
ols_model = sm.OLS(df['Y'], ols_exog).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.284
Model:                            OLS   Adj. R-squared:                  0.219
Method:                 Least Squares   F-statistic:                     4.362
Date:                Sat, 02 Dec 2023   Prob (F-statistic):             0.0608
Time:                        12:48:58   Log-Likelihood:                -39.527
No. Observations:                  13   AIC:                             83.05
Df Residuals:                      11   BIC:                             84.18
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.8124      3.556      2.478      0.0



In [29]:
# Fit a two-feature linear regression (Y ~ X + Z) on a training split and
# predict the held-out split.

# Features and target taken from the filtered frame.
X = df[['X', 'Z']]
y = df['Y']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the linear regression model on the training rows only.
model = LinearRegression()
model.fit(X_train, y_train)

# Fitted parameters: one slope per feature (X, Z) and the intercept.
print('Coefficient (Slope): \n', model.coef_)
print('Intercept: \n', model.intercept_)

# Example prediction for a single new observation. The point is passed as a
# DataFrame with the same column names (and order) used for fitting, so
# scikit-learn does not warn about missing feature names, and the label states
# the inputs that are actually being predicted.
new_point = pd.DataFrame({'X': [4.303745], 'Z': [4.716749]})
print('Predicted value for X=4.303745, Z=4.716749: \n', model.predict(new_point))

# Predictions for the held-out rows, used by the evaluation cells.
y_pred = model.predict(X_test)

Coefficient (Slope): 
 [1.66350452 3.08308919]
Intercept: 
 0.7324073727764517
Predicted value for 6: 
 [22.43386445]




In [30]:
# Mean Absolute Error (MAE) of the held-out predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

0.7831122560155332

In [31]:
# Mean Squared Error (MSE) of the held-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

0.7130918894955256

In [32]:
# Root Mean Squared Error (RMSE): square root of the held-out MSE, computed
# directly from the test targets and predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.8444476831015203

In [33]:
# Residuals on the held-out rows: observed Y minus predicted Y, indexed like y_test.
residuals = y_test.sub(y_pred)
residuals

13    1.154113
9    -0.381952
0    -0.813272
Name: Y, dtype: float64

In [34]:
# 5-fold cross-validation over all rows, scored with negated MSE.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=5)
# Average Mean Squared Error (MSE) across the folds; the sign is flipped back
# because the scorer returns negative MSE. Used to gauge how well the model
# generalizes and how stable it is across splits.
mse_cv = -np.mean(scores)
mse_cv

1.2530306283155324