In [1]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [2]:
num_new_points = 15
X = np.random.uniform(0, 6, num_new_points).reshape(-1, 1)  # Generate random X values between 0 and 6
Z = np.random.uniform(0, 6, num_new_points).reshape(-1, 1)  # Generate random Z values between 0 and 6
noise = np.random.normal(0, 1, num_new_points)  # Generate random noise
Y = 2 * X.flatten() + 3 * Z.flatten() + noise

df = pd.DataFrame({'X': X.flatten(), 'Z': Z.flatten(), 'Y': Y})

In [3]:
df

Unnamed: 0,X,Z,Y
0,4.553294,0.621908,9.786513
1,5.009172,2.267599,17.993086
2,1.548428,3.478965,12.861615
3,1.232845,5.215138,18.900885
4,3.838659,5.644876,23.435682
5,0.94091,3.281106,12.148624
6,1.310662,2.781287,11.632071
7,5.015414,0.080522,10.130221
8,3.919257,4.443335,21.64789
9,0.916384,1.273522,6.623397


In [4]:
df['division'] = df['X'] / df['Y']
df = df[df['division'] > 0.1]
df

Unnamed: 0,X,Z,Y,division
0,4.553294,0.621908,9.786513,0.465262
1,5.009172,2.267599,17.993086,0.278394
2,1.548428,3.478965,12.861615,0.120391
4,3.838659,5.644876,23.435682,0.163796
6,1.310662,2.781287,11.632071,0.112677
7,5.015414,0.080522,10.130221,0.495094
8,3.919257,4.443335,21.64789,0.181046
9,0.916384,1.273522,6.623397,0.138356
10,4.038059,1.221254,11.573477,0.348906
11,3.095342,3.372828,15.769964,0.196281


In [5]:
np.mean(df['division'])

0.2630426515701125

In [6]:
min(df['division'])

0.11267661976305744

In [7]:
correlation, _ = pearsonr(df['X'], df['Y'])
print(f"Correlation coefficient: {correlation}")

Correlation coefficient: 0.3108467296692851


In [8]:
# Regression Analysis
X = sm.add_constant(df['X'])  # adding a constant
model = sm.OLS(df['Y'], X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     1.070
Date:                Sat, 02 Dec 2023   Prob (F-statistic):              0.325
Time:                        23:04:15   Log-Likelihood:                -36.033
No. Observations:                  12   AIC:                             76.07
Df Residuals:                      10   BIC:                             77.04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.3841      4.147      2.504      0.0



In [9]:
# Split the data into training and testing sets
X = df[['X', 'Z']]
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# The coefficients
print('Coefficient (Slope): \n', model.coef_)
print('Intercept: \n', model.intercept_)

# Example of predicting a new value
print('Predicted value for 6: \n', model.predict([[4.303745, 4.716749]]))

# Make predictions
y_pred = model.predict(X_test)

Coefficient (Slope): 
 [1.83542252 2.90342616]
Intercept: 
 0.7620470931657763
Predicted value for 6: 
 [22.35597002]




In [10]:
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error (MAE)
mae

0.7185655847663828

In [11]:
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error (RMSE)
mse

0.6056800145274553

In [12]:
rmse = np.sqrt(mse)  # Root Mean Squared Error (RMSE)
rmse

0.778254466435918

In [13]:
residuals = y_test - y_pred
residuals

13    0.551181
11   -0.466100
0    -1.138416
Name: Y, dtype: float64

In [14]:
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
mse_cv = -scores.mean()  # Average (Meab Squared Error) MSE across cross-validation folds, evaluate its generalization ability and assess its stability.
mse_cv

0.9940335549655067