In [15]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
import statsmodels.api as sm

from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [16]:
# Synthetic data set: Y depends linearly on X and Z, plus Gaussian noise.
SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same numbers
rng = np.random.default_rng(SEED)

num_new_points = 15
X = rng.uniform(0, 6, num_new_points).reshape(-1, 1)  # X values drawn uniformly between 0 and 6
Z = rng.uniform(0, 6, num_new_points).reshape(-1, 1)  # Z values drawn uniformly between 0 and 6
noise = rng.normal(0, 1, num_new_points)  # noise with mean 0 and standard deviation 1
Y = 2 * X.flatten() + 3 * Z.flatten() + noise  # generating model: Y = 2*X + 3*Z + noise

df = pd.DataFrame({'X': X.flatten(), 'Z': Z.flatten(), 'Y': Y})

In [17]:
# Display the generated frame to sanity-check the X, Z and Y columns
df

Unnamed: 0,X,Z,Y
0,1.469825,2.568025,11.871448
1,1.054233,4.379349,13.862143
2,2.663415,0.844644,9.651096
3,5.150989,3.044877,20.273056
4,5.425958,0.280545,10.763104
5,2.314765,0.777337,8.613171
6,5.956077,4.812495,25.382074
7,3.975448,5.658878,24.605853
8,1.786268,0.907559,6.35075
9,5.537101,4.409263,24.441184


In [18]:
# Ratio of X to Y for each row; only rows whose ratio exceeds the threshold are kept.
MIN_DIVISION = 0.1  # rows with X / Y at or below this value are dropped

# assign() builds a new frame and .copy() detaches the filtered result, so
# re-running this cell never writes a column into a filtered slice (which can
# raise SettingWithCopyWarning on pandas versions without copy-on-write).
df = df.assign(division=df['X'] / df['Y'])
df = df.loc[df['division'] > MIN_DIVISION].copy()
df

Unnamed: 0,X,Z,Y,division
0,1.469825,2.568025,11.871448,0.123812
2,2.663415,0.844644,9.651096,0.27597
3,5.150989,3.044877,20.273056,0.254081
4,5.425958,0.280545,10.763104,0.504126
5,2.314765,0.777337,8.613171,0.268747
6,5.956077,4.812495,25.382074,0.234657
7,3.975448,5.658878,24.605853,0.161565
8,1.786268,0.907559,6.35075,0.281269
9,5.537101,4.409263,24.441184,0.226548
13,5.06269,3.127835,19.12042,0.264779


In [19]:
# Arithmetic mean of the X / Y ratio over the retained rows
df['division'].mean()

0.25955534262559626

In [20]:
# Smallest ratio, found with Python's built-in min() over the column's values
min(df['division'])

0.12381180933369132

In [21]:
# Same minimum via the pandas Series method (skips NaN by default)
df['division'].min()

0.12381180933369132

In [22]:
# Sample standard deviation of the ratio (ddof=1 is the pandas default, spelled out here)
df['division'].std(ddof=1)

0.10024146851803395

In [23]:
# Sample variance of the ratio (ddof=1 is the pandas default, spelled out here)
df['division'].var(ddof=1)

0.010048352010651991

In [24]:
# Pearson correlation between X and Y.
# The p-value is bound to a real name instead of `_`: rebinding `_` in IPython
# stops the automatic "last output" variable from being updated afterwards.
correlation, p_value = pearsonr(df['X'], df['Y'])
print(f"Correlation coefficient: {correlation}")

Correlation coefficient: 0.7396039849573846


In [25]:
# Regression Analysis: ordinary least squares of Y on X alone.
# Dedicated names are used here so that `X` and `model` are not rebound to
# statsmodels objects; the scikit-learn cell further down defines its own
# `X` (feature frame) and `model` (LinearRegression estimator).
X_with_const = sm.add_constant(df['X'])  # adding a constant (the "const" row of the summary)
ols_model = sm.OLS(df['Y'], X_with_const).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.547
Model:                            OLS   Adj. R-squared:                  0.490
Method:                 Least Squares   F-statistic:                     9.661
Date:                Sun, 03 Dec 2023   Prob (F-statistic):             0.0145
Time:                        18:07:14   Log-Likelihood:                -29.726
No. Observations:                  10   AIC:                             63.45
Df Residuals:                       8   BIC:                             64.06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.5630      4.369      0.816      0.4



In [26]:
# Split the data into training and testing sets (20 % held out, fixed seed)
X = df[['X', 'Z']]
y = df['Y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# The coefficients, in the same order as the feature columns (X, Z)
print('Coefficient (Slope): \n', model.coef_)
print('Intercept: \n', model.intercept_)

# Example of predicting a new value.
# The sample is wrapped in a DataFrame carrying the training column names so
# scikit-learn does not warn about missing feature names, and the label now
# states the inputs that are actually being predicted.
new_x, new_z = 4.303745, 4.716749
new_point = pd.DataFrame({'X': [new_x], 'Z': [new_z]})
print(f'Predicted value for X={new_x}, Z={new_z}: \n', model.predict(new_point))

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

Coefficient (Slope): 
 [1.62784877 2.97471882]
Intercept: 
 1.6610143424551431
Predicted value for 6: 
 [22.6978624]




In [27]:
# Coefficient of determination (R-squared) on the held-out test rows
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
r_squared

0.984212460024564

In [28]:
# Mean Absolute Error (MAE) on the held-out test rows
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

0.8960779010123678

In [29]:
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error (MSE); its square root (RMSE) is taken in a later cell
mse

0.8633680494498952

Advantages of MSE:

Sensitivity to Errors: MSE gives more weight to large errors because it squares the differences between predicted and actual values. This means that it is particularly sensitive to outliers and can penalize them more heavily. In some cases, this is desirable as it can help the model focus on reducing large errors.

Disadvantages of MSE:

Units of Measurement: The units of MSE are the square of the units of the dependent variable, which makes it less interpretable. For example, if you're predicting house prices in dollars, MSE will be in squared dollars, which has no direct interpretation. Taking the square root (RMSE) brings the metric back to the original units.

Sensitivity to Outliers: While the sensitivity to outliers can be an advantage in some cases, it can also be a disadvantage. Outliers can have a disproportionate impact on MSE, and if your dataset contains many outliers, it might not provide a representative measure of overall model performance.


Advantages of MAE:

Robustness to Outliers: MAE is less sensitive to outliers compared to MSE because it doesn't square the errors. This makes it a more robust metric when your dataset contains significant outliers.

Interpretability: MAE has a more straightforward interpretation because it gives the average absolute error in the same units as the dependent variable. This makes it easier to explain to non-technical stakeholders.

Disadvantages of MAE:

Less Emphasis on Large Errors: MAE weights every error in proportion to its magnitude, so a large error is not penalized disproportionately the way it is under MSE. In cases where you want to focus more on reducing large errors, this may not be the most appropriate metric.

Less Information on Outliers: While MAE is less sensitive to outliers, it provides less information about the distribution of errors compared to MSE. This might be a disadvantage if you need to understand the nature of errors in your model.

In [30]:
# Square root of the MSE computed above, which puts the error back in the units of Y
rmse = np.sqrt(mse)  # Root Mean Squared Error (RMSE)
rmse

0.9291760056361201

In [31]:
# Residual = observed minus predicted, for each held-out row
residuals = y_test.sub(y_pred)
residuals

9    0.650288
2    1.141867
Name: Y, dtype: float64

In [32]:
# 5-fold cross-validation; the 'neg_mean_squared_error' scorer reports negated MSE values
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
mse_cv = -scores.mean()  # Flip the sign to get the average Mean Squared Error (MSE) across the cross-validation folds; used to gauge generalization ability and stability
mse_cv

0.9850336881128223