In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import pearsonr

# --- Simple linear regression: Recovery Time (Days) explained by Age (Years) ---

# Read the patient dataset from the CSV file in the working directory.
data = pd.read_csv("Patient_Recovery.csv")

# Response series first, then the predictor; the double brackets keep X_age
# a one-column DataFrame rather than a Series.
Y = data['Recovery Time (Days)']
X_age = data[['Age (Years)']]

# Pearson r is computed on every row of the file, i.e. before the train/test split.
# The second value returned by pearsonr (the p-value) is discarded.
correlation_age, _ = pearsonr(X_age['Age (Years)'], Y)
print(
    "Correlation coefficient (r) between Age and Recovery Time: "
    f"{correlation_age:.4f}"
)

# Hold out 20% of the rows; the fixed random_state makes the partition repeatable.
X_age_train, X_age_test, Y_train, Y_test = train_test_split(
    X_age,
    Y,
    test_size=0.2,
    random_state=42,
)

# sm.OLS fits no intercept unless the design matrix has a constant column,
# so add one (named 'const') to the training predictors.
X_age_train_const = sm.add_constant(X_age_train)

# Ordinary least squares, fitted on the training rows only.
model_age = sm.OLS(endog=Y_train, exog=X_age_train_const).fit()

# Full statsmodels report (R-squared, F-statistic, coefficient table, ...).
print(model_age.summary())

# Pull the intercept and the slope out of the fitted parameters in one step.
intercept_age, b_age = model_age.params[['const', 'Age (Years)']]

# Report the fitted line with four decimal places.
print(
    "Equation of the best-fit regression line: "
    f"Y = {intercept_age:.4f} + {b_age:.4f}*Age"
)

Correlation coefficient (r) between Age and Recovery Time: 0.7036
                             OLS Regression Results                             
Dep. Variable:     Recovery Time (Days)   R-squared:                       0.483
Model:                              OLS   Adj. R-squared:                  0.482
Method:                   Least Squares   F-statistic:                     372.2
Date:                  Fri, 04 Apr 2025   Prob (F-statistic):           4.97e-59
Time:                          22:14:18   Log-Likelihood:                -1108.2
No. Observations:                   400   AIC:                             2220.
Df Residuals:                       398   BIC:                             2228.
Df Model:                             1                                         
Covariance Type:              nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import pearsonr

# --- Simple linear regression: Recovery Time (Days) explained by Physiotherapy Sessions ---

# Read the patient dataset from the CSV file in the working directory.
data = pd.read_csv("Patient_Recovery.csv")

# Response series first, then the predictor; the double brackets keep X_physio
# a one-column DataFrame rather than a Series.
Y = data['Recovery Time (Days)']
X_physio = data[['Physiotherapy Sessions']]

# Pearson r is computed on every row of the file, i.e. before the train/test split.
# The second value returned by pearsonr (the p-value) is discarded.
correlation_physio, _ = pearsonr(X_physio['Physiotherapy Sessions'], Y)
print(
    "Correlation coefficient (r) between Physiotherapy Sessions and Recovery Time: "
    f"{correlation_physio:.4f}"
)

# Hold out 20% of the rows; the fixed random_state makes the partition repeatable.
X_physio_train, X_physio_test, Y_train, Y_test = train_test_split(
    X_physio,
    Y,
    test_size=0.2,
    random_state=42,
)

# sm.OLS fits no intercept unless the design matrix has a constant column,
# so add one (named 'const') to the training predictors.
X_physio_train_const = sm.add_constant(X_physio_train)

# Ordinary least squares, fitted on the training rows only.
model_physio = sm.OLS(endog=Y_train, exog=X_physio_train_const).fit()

# Full statsmodels report (R-squared, F-statistic, coefficient table, ...).
print(model_physio.summary())

# Pull the intercept and the slope out of the fitted parameters in one step.
intercept_physio, b_physio = model_physio.params[['const', 'Physiotherapy Sessions']]

# Report the fitted line with four decimal places.
print(
    "Equation of the best-fit regression line: "
    f"Y = {intercept_physio:.4f} + {b_physio:.4f}*Physiotherapy Sessions"
)

Correlation coefficient (r) between Physiotherapy Sessions and Recovery Time: -0.5902
                             OLS Regression Results                             
Dep. Variable:     Recovery Time (Days)   R-squared:                       0.360
Model:                              OLS   Adj. R-squared:                  0.359
Method:                   Least Squares   F-statistic:                     224.1
Date:                  Fri, 04 Apr 2025   Prob (F-statistic):           1.65e-40
Time:                          22:14:44   Log-Likelihood:                -1150.9
No. Observations:                   400   AIC:                             2306.
Df Residuals:                       398   BIC:                             2314.
Df Model:                             1                                         
Covariance Type:              nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
-------------

In [None]:
Task 1: Perform a Correlation Analysis

Compute the correlation coefficients (R) between age and recovery time. 
Compute the correlation coefficients (R) between physiotherapy sessions and recovery time.
Interpret the results and discuss which factor has a stronger correlation with recovery time

Correlation coefficient (r), age vs. recovery time = 0.7036
Correlation coefficient (r), physiotherapy sessions vs. recovery time = -0.5902

Upon analyzing the correlation coefficients, we observe a moderately strong positive correlation (r = 0.7036) between age and recovery time. This suggests that as age increases, recovery time tends to increase as well. In contrast, the correlation between the number of physiotherapy sessions and recovery time is moderately negative (r = -0.5902). This implies that an increase in physiotherapy sessions is generally associated with a decrease in recovery time. Comparing the magnitudes (|0.7036| > |-0.5902|), age has the stronger correlation with recovery time.

In [None]:
Task 2: Evaluate the Regression Model

Compute the R-squared (R²) value to assess how well the model explains the variation in recovery time. Discuss whether age or physiotherapy sessions has a stronger influence on recovery time.

R-squared using age = 0.483 (48.3%)
R-squared using physiotherapy sessions = 0.360 (36.0%)

Comparing the R-squared values of the two single-predictor models, the age model (R-squared = 0.483, or 48.3%) exhibits the stronger predictive power. This indicates that age is more effective in explaining the variability in recovery time than physiotherapy sessions, whose model has an R-squared of 0.360 (36.0%). Note that both R-squared values are computed on the 400 training observations, not on the held-out test set.