# In [None]:
# Q1. Simple Linear Regression vs Multiple Linear Regression
# Simple Linear Regression: Predicts a dependent variable using a single independent variable.
# Multiple Linear Regression: Predicts a dependent variable using two or more independent variables.

# Example of Simple Linear Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Simple Linear Regression Example
# One feature column (X) paired with one target vector (y).
X = np.array([1, 2, 3, 4, 5]).reshape(-1, 1)  # feature (e.g., years of experience)
y = np.array([1, 2, 3, 4, 5])                 # target variable (e.g., salary)

# Build the estimator and train it in a single expression
# (fit() hands back the fitted estimator itself).
model = LinearRegression().fit(X, y)

# Model output for the same inputs it was trained on
y_pred = model.predict(X)

# Show the observations as points with the fitted line drawn over them
plt.scatter(X, y, color="blue")
plt.plot(X, y_pred, color="red")
plt.xlabel("Independent Variable (X)")
plt.ylabel("Dependent Variable (y)")
plt.title("Simple Linear Regression")
plt.show()

# Example of Multiple Linear Regression
from sklearn.datasets import make_regression

# Generating a dataset for multiple regression (2 features).
# random_state pins the random generator so the synthetic data -- and every
# number derived from it further down (coefficients, predictions, VIF) --
# is identical from one run to the next.
X_multi, y_multi = make_regression(
    n_samples=100,
    n_features=2,
    noise=0.1,
    random_state=42,
)

# Fit a multiple linear regression model
model_multi = LinearRegression()
model_multi.fit(X_multi, y_multi)

# Predictions on the training data
y_multi_pred = model_multi.predict(X_multi)

# Q2. Assumptions of Linear Regression:
# 1. Linearity: The relationship between independent and dependent variables should be linear.
# 2. Independence: Observations should be independent of each other.
# 3. Homoscedasticity: Constant variance of errors.
# 4. Normality: Residuals should be normally distributed.

# Check assumptions
import statsmodels.api as sm
import seaborn as sns

# Fit an ordinary least squares model with statsmodels; add_constant
# supplies the column used for the intercept term.
X_with_const = sm.add_constant(X)
model_stats = sm.OLS(endog=y, exog=X_with_const).fit()

# Look at the shape of the residual distribution (histogram + KDE overlay).
# NOTE: with the toy data above y equals X exactly, so the residuals are
# only floating-point noise around zero.
residuals = model_stats.resid
sns.histplot(data=residuals, kde=True)
plt.title("Residuals Normality")
plt.show()

# Q3. Slope and Intercept Interpretation
# The slope represents the change in the dependent variable for each unit change in the independent variable.
# The intercept is the expected value of the dependent variable when the independent variable is zero.

# For a simple linear regression model y = mx + c:
# m = slope, c = intercept
# Example: y = 2x + 1
# If x = 3, y = 2*3 + 1 = 7 (The slope tells us that for each increase of 1 in x, y increases by 2)

# Q4. Gradient Descent
# Gradient descent is an optimization algorithm used to minimize the cost function in machine learning models.

# Example: For linear regression, we use gradient descent to minimize the Mean Squared Error (MSE) to find the optimal coefficients.
def gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    """Fit ``y = m * X + c`` by batch gradient descent on the mean squared error.

    Args:
        X: Array-like of feature values for a single feature. Lists and
            NumPy arrays are accepted; a single-column 2-D array is
            flattened so it lines up element-wise with ``y``.
        y: Array-like of target values with as many elements as ``X``.
        learning_rate: Step size applied to each gradient update.
        epochs: Number of parameter updates to perform.

    Returns:
        A ``(m, c)`` tuple of floats: the slope and intercept after
        ``epochs`` updates, starting from ``(0.0, 0.0)``.

    Raises:
        ValueError: If the inputs are empty or do not have the same number
            of elements.
    """
    # Coerce to flat float arrays: plain lists would otherwise be "multiplied"
    # by list repetition, and an (n, 1) column would broadcast against a
    # length-n target into an (n, n) matrix.
    features = np.asarray(X, dtype=float).ravel()
    targets = np.asarray(y, dtype=float).ravel()

    n = features.size
    if n == 0:
        raise ValueError("gradient_descent requires at least one sample")
    if targets.size != n:
        raise ValueError(
            f"X and y must have the same number of elements "
            f"(got {n} and {targets.size})"
        )

    m, c = 0.0, 0.0  # initial guess for slope and intercept
    scale = 2.0 / n  # constant factor of the MSE gradient

    for _ in range(epochs):
        residual = m * features + c - targets

        # Both gradients are derived from the same residual, i.e. from the
        # parameters as they were before this step's update.
        m_gradient = scale * np.dot(residual, features)
        c_gradient = scale * residual.sum()

        m -= learning_rate * m_gradient
        c -= learning_rate * c_gradient

    return float(m), float(c)

# Run gradient descent on a toy dataset in which y equals x exactly
y_gradient = np.array([1, 2, 3, 4, 5])
X_gradient = np.array([1, 2, 3, 4, 5])
m, c = gradient_descent(X=X_gradient, y=y_gradient)
print(f"Optimized m (slope): {m}, c (intercept): {c}")

# Q5. Multiple Linear Regression Model
# Multiple linear regression predicts a dependent variable using more than one independent variable.
# The model is: y = b0 + b1*x1 + b2*x2 + ... + bn*xn
# This is a generalization of simple linear regression, where more than one feature is involved.

# Q6. Multicollinearity in Multiple Linear Regression
# Multicollinearity occurs when independent variables in the model are highly correlated with each other.
# This can make the model's coefficients unstable and difficult to interpret.

# Detecting multicollinearity using VIF (Variance Inflation Factor)
import pandas as pd  # needed for the result table; not imported elsewhere in this file
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_multi_with_const = sm.add_constant(X_multi)

# X_multi is a NumPy array, so add_constant gives back an ndarray, which has
# no `.values` attribute. np.asarray works for an ndarray and would also
# work if a DataFrame were passed in instead.
design_matrix = np.asarray(X_multi_with_const)
n_columns = design_matrix.shape[1]

# One VIF per column of the design matrix, labelled by column index.
vif = pd.DataFrame(
    {
        "Feature": list(range(n_columns)),
        "VIF": [
            variance_inflation_factor(design_matrix, column_index)
            for column_index in range(n_columns)
        ],
    }
)
print(vif)

# Addressing multicollinearity: Drop one of the correlated features or apply dimensionality reduction techniques like PCA.

# Q7. Polynomial Regression
# Polynomial regression fits a nonlinear relationship between the independent variable(s) and the dependent variable.
# It's an extension of linear regression, but uses polynomial terms of the independent variables.

from sklearn.preprocessing import PolynomialFeatures

# Expand X into its degree-2 polynomial feature matrix
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit(X).transform(X)

# Polynomial regression = ordinary linear regression on the expanded features
model_poly = LinearRegression().fit(X_poly, y)

# Q8. Advantages and Disadvantages of Polynomial Regression
# Advantages:
# - Can model nonlinear relationships.
# - Can capture more complex patterns in the data.

# Disadvantages:
# - Overfitting: If the degree of the polynomial is too high, the model can fit noise in the data.
# - More computationally expensive.

# When to use polynomial regression:
# - When there is a known nonlinear relationship between variables and simple linear regression is insufficient.

# Example use case: Predicting house prices based on square footage, where the relationship between price and size is nonlinear.
