Imports

In [None]:
# Prerequisites - Install packages
# !pip install pandas matplotlib seaborn scikit-learn numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import warnings

# Set up a matplotlib style similar to ggplot2.
# matplotlib 3.6 renamed the bundled seaborn style sheets from 'seaborn' to
# 'seaborn-v0_8', so use whichever name this installation actually provides
# instead of raising on the one it does not know.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from the libraries imported above; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

Create Dataset

In [None]:
# Simulate n noisy observations scattered around the line y = 4.2 + 2.05 * x
np.random.seed(42)  # fixed seed so the simulated noise is reproducible
n = 30
x = np.linspace(1, 10, num=n)                      # evenly spaced x values on [1, 10]
noise = np.random.normal(loc=0, scale=2, size=n)   # Gaussian noise, standard deviation 2
y = 4.2 + 2.05 * x + noise
sim1 = pd.DataFrame({'x': x, 'y': y})

# Scatter plot of the simulated data
# (equivalent to sim1 %>% ggplot(aes(x, y)) + geom_point())
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(sim1['x'], sim1['y'], alpha=0.7)
ax.set(xlabel='x', ylabel='y', title='Scatter plot of sim1 data')
ax.grid(True, alpha=0.3)
plt.show()

Linear Regression

In [None]:
# Fit a straight line y ~ x to the simulated data
X = sim1[['x']]        # double brackets keep a 2D frame; sklearn expects 2D features
y_target = sim1['y']
sim1_model = LinearRegression().fit(X, y_target)  # fit() returns the fitted estimator

# Pull the fitted parameters out of the model
intercept = sim1_model.intercept_   # value of the line where it crosses the y axis (x = 0)
(slope,) = sim1_model.coef_         # single feature, so exactly one coefficient: rise in y per unit x

print("\nModel Coefficients:")
print(f"(Intercept): {intercept:.6f}")
print(f"x:           {slope:.6f}")

Plotting

In [None]:
# Overlay the fitted line on the raw observations
fitted_line = intercept + slope * sim1['x']

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(sim1['x'], sim1['y'], alpha=0.7, label='Data points')
ax.plot(sim1['x'], fitted_line, color='red', linewidth=2, label='Regression line')
ax.set(xlabel='x', ylabel='y', title='Linear regression fit')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

Add Predicted Values and Residuals to table

In [None]:
# Store fitted values and residuals directly on the original data set for ease.
fitted = sim1_model.predict(sim1[['x']])   # model prediction at each observed x
sim1['prediction'] = fitted
sim1['residual'] = sim1['y'] - fitted      # observed minus predicted
print(sim1.head())

Plot with residuals.

In [None]:
# Data, fitted line and residuals drawn on shared axes
fitted_line = intercept + slope * sim1['x']

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(sim1['x'], sim1['y'], alpha=0.7, label='Data points')
ax.plot(sim1['x'], fitted_line, color='red', linewidth=2, label='Regression line')
ax.scatter(sim1['x'], sim1['residual'], color='green', s=20, label='Residuals')
ax.set(xlabel='x', ylabel='y', title='Linear regression fit with Residuals')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

Trend the residuals!

In [None]:
# Regress the residuals on x to check whether any linear trend is left over
residual_X = sim1[['x']]
residual_y_target = sim1['residual']
residual_model = LinearRegression().fit(residual_X, residual_y_target)
residual_intercept = residual_model.intercept_
(residual_slope,) = residual_model.coef_   # one feature, one coefficient

# and plot again, now with the residual trend line added
fitted_line = intercept + slope * sim1['x']
residual_trend = residual_intercept + residual_slope * sim1['x']

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(sim1['x'], sim1['y'], alpha=0.7, label='Data points')
ax.plot(sim1['x'], fitted_line, color='red', linewidth=2, label='Regression line')
ax.scatter(sim1['x'], sim1['residual'], color='green', s=20, label='Residuals')
ax.plot(sim1['x'], residual_trend, color='black', linewidth=2, label='Residual Trend line')
ax.set(xlabel='x', ylabel='y', title='Linear regression fit with Residuals and trend')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

Additional Metrics

In [None]:
# Goodness-of-fit metrics for the linear model, evaluated on the data it was fitted to
mse = mean_squared_error(y_target, sim1['prediction'])
rmse = np.sqrt(mse)                        # same units as y
r2_score = sim1_model.score(X, y_target)   # LinearRegression.score returns R^2

print("\nModel Performance:")
print(f"R-squared: {r2_score:.4f}\nMSE: {mse:.4f}\nRMSE: {rmse:.4f}")

**Polynomial Regression and Logistic Regression Examples**

In [None]:
"""Create Datasets"""

np.random.seed(42)  # for reproducibility
n = 50

# --- Polynomial regression data: y = 0.5*x^2 - 2*x + 1 + Gaussian noise ---
x_poly = np.linspace(-3, 3, num=n)
poly_noise = np.random.normal(loc=0, scale=0.8, size=n)
y_poly = 0.5 * x_poly**2 - 2 * x_poly + 1 + poly_noise
poly_data = pd.DataFrame({'x': x_poly, 'y': y_poly})

# --- Logistic regression data ---
# Push a linear score through the sigmoid to get P(y = 1), then draw one
# Bernoulli outcome per point so the labels carry sampling randomness.
x_logistic = np.linspace(-5, 5, num=n)
linear_combination = 1.5 * x_logistic - 0.2
probabilities = 1 / (1 + np.exp(-linear_combination))
y_logistic_binary = np.random.binomial(n=1, p=probabilities, size=n)
logistic_data = pd.DataFrame(
    {
        'x': x_logistic,
        'y_binary': y_logistic_binary,
        'y_prob': probabilities,
    }
)

print("Datasets created:")
print(f"Polynomial data shape: {poly_data.shape}")
print(f"Logistic data shape: {logistic_data.shape}")

In [None]:
"""Polynomial Regression Analysis"""

# Fit polynomial regression (degree 2): expand x into the columns [1, x, x^2]
# and fit an ordinary linear regression on the expanded features.
poly_features = PolynomialFeatures(degree=2)
X_poly = poly_data[['x']]
X_poly_transformed = poly_features.fit_transform(X_poly)
poly_model = LinearRegression()
poly_model.fit(X_poly_transformed, poly_data['y'])

# Add predictions and residuals to polynomial data.
# This has to happen before plotting: the figure below reads
# poly_data['residual'], which does not exist on a fresh kernel until here.
poly_data['prediction'] = poly_model.predict(X_poly_transformed)
poly_data['residual'] = poly_data['y'] - poly_data['prediction']

# Generate smooth curve for plotting. The grid is wrapped in a DataFrame with
# the same column name used for fitting so the transformer sees matching
# feature names.
x_smooth = np.linspace(-3, 3, 100)
X_smooth = poly_features.transform(pd.DataFrame({'x': x_smooth}))
y_smooth_pred = poly_model.predict(X_smooth)

# Plot with polynomial regression curve on a full-size figure of its own
# (a lone plt.subplot(1, 2, 2) would leave the left half of the canvas empty).
plt.figure(figsize=(8, 6))
plt.scatter(poly_data['x'], poly_data['y'], alpha=0.7, color='blue', label='Data points')
plt.plot(x_smooth, y_smooth_pred, 'red', linewidth=2, label='Polynomial fit (degree 2)')
plt.scatter(poly_data['x'], poly_data['residual'], color='purple', s=20, alpha=0.7, label='Residuals')
plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)  # zero reference for the residuals
plt.xlabel('x')
plt.ylabel('y')
plt.title('Polynomial Regression Fit')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Get polynomial coefficients.
# The intercept is stored under its own name so it does not overwrite the
# linear model's `intercept`, which the earlier regression-line cells read.
coefficients = poly_model.coef_
poly_intercept = poly_model.intercept_

# Model performance metrics for polynomial (evaluated on the training data)
r2_poly = poly_model.score(X_poly_transformed, poly_data['y'])
mse_poly = mean_squared_error(poly_data['y'], poly_data['prediction'])
rmse_poly = np.sqrt(mse_poly)

print(f"\nPolynomial Model Performance:")
print(f"R-squared: {r2_poly:.4f}")
print(f"MSE: {mse_poly:.4f}")
print(f"RMSE: {rmse_poly:.4f}")

In [None]:
"""Logistic Regression Analysis"""

# Fit logistic regression of the binary outcome on x
X_logistic = logistic_data[['x']]
y_logistic = logistic_data['y_binary']
logistic_model = LogisticRegression()
logistic_model.fit(X_logistic, y_logistic)

# Generate smooth S-curve for plotting. The grid is wrapped in a DataFrame with
# the same column name used for fitting so the model sees matching feature names.
x_smooth_log = np.linspace(-5, 5, 100)
X_smooth_log = pd.DataFrame({'x': x_smooth_log})
y_prob_pred = logistic_model.predict_proba(X_smooth_log)[:, 1]  # Probability of class 1

# Plot with logistic regression S-curve on a full-size figure of its own
# (a lone plt.subplot(1, 2, 2) would leave the left half of the canvas empty).
plt.figure(figsize=(8, 6))
plt.scatter(logistic_data['x'], logistic_data['y_binary'], alpha=0.7, color='green', label='Binary outcomes')
plt.plot(x_smooth_log, y_prob_pred, 'red', linewidth=2, label='Logistic regression (S-curve)')
plt.plot(logistic_data['x'], logistic_data['y_prob'], 'orange', alpha=0.5, linewidth=1, label='True probabilities')
plt.xlabel('x')
plt.ylabel('Probability / Binary Outcome')
plt.title('Logistic Regression Fit')
plt.legend()
plt.grid(True, alpha=0.3)
plt.ylim(-0.1, 1.1)  # small margin so the 0/1 outcome markers are not clipped
plt.tight_layout()
plt.show()

# Add predictions to logistic data: hard class labels and class-1 probabilities
logistic_data['prediction_binary'] = logistic_model.predict(X_logistic)
logistic_data['prediction_prob'] = logistic_model.predict_proba(X_logistic)[:, 1]
log_coefficients = logistic_model.coef_[0]
log_intercept = logistic_model.intercept_[0]

# Model performance metrics for logistic regression (evaluated on the training data)
accuracy = accuracy_score(y_logistic, logistic_data['prediction_binary'])

print(f"\nLogistic Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Classification Report:")
print(classification_report(y_logistic, logistic_data['prediction_binary']))

In [None]:
"""Combined Visualization"""
# One figure with two stacked panels: polynomial fit on top, logistic fit below
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
ax_poly, ax_logit = axes

# Top panel: polynomial data, fitted curve and residuals around a zero line
ax_poly.scatter(poly_data['x'], poly_data['y'], alpha=0.7, color='blue', label='Data points')
ax_poly.plot(x_smooth, y_smooth_pred, color='red', linewidth=2, label='Polynomial fit')
ax_poly.scatter(poly_data['x'], poly_data['residual'], color='purple', s=20, alpha=0.7, label='Residuals')
ax_poly.axhline(y=0, color='black', linestyle='--', alpha=0.5)
ax_poly.set(xlabel='x', ylabel='y', title='Polynomial Regression with Residuals')
ax_poly.legend()
ax_poly.grid(True, alpha=0.3)

# Bottom panel: binary outcomes, fitted S-curve and per-point predicted probabilities
ax_logit.scatter(logistic_data['x'], logistic_data['y_binary'], alpha=0.7, color='green', label='Binary outcomes')
ax_logit.plot(x_smooth_log, y_prob_pred, color='red', linewidth=2, label='Logistic S-curve')
ax_logit.scatter(logistic_data['x'], logistic_data['prediction_prob'], color='orange', s=20, alpha=0.7, label='Predicted probabilities')
ax_logit.set(
    xlabel='x',
    ylabel='Probability / Binary Outcome',
    title='Logistic Regression with Predictions',
)
ax_logit.legend()
ax_logit.grid(True, alpha=0.3)
ax_logit.set_ylim(-0.1, 1.1)

plt.tight_layout()
plt.show()

In [None]:
"""Data Summary"""
# Headline statistics for the polynomial data set
poly_x, poly_y = poly_data['x'], poly_data['y']
print()
print("\nPolynomial Data Statistics:")
print(f"X range: [{poly_x.min():.2f}, {poly_x.max():.2f}]")
print(f"Y range: [{poly_y.min():.2f}, {poly_y.max():.2f}]")
print(f"Mean residual: {poly_data['residual'].mean():.6f}")

# Headline statistics for the logistic data set
logit_x = logistic_data['x']
print("\nLogistic Data Statistics:")
print(f"X range: [{logit_x.min():.2f}, {logit_x.max():.2f}]")
print(f"Positive class ratio: {logistic_data['y_binary'].mean():.3f}")  # mean of 0/1 labels = share of 1s
print(f"Prediction accuracy: {accuracy:.3f}")