In [18]:
import seaborn as sns
import pandas as pd
import statsmodels.api as sm

In [19]:
# Read the 'mpg' example dataset into a DataFrame through seaborn's dataset loader
mpg = sns.load_dataset(name='mpg')

In [20]:
# Keep only complete rows: any row holding at least one missing value is discarded.
# The result goes to a new name, so the raw `mpg` frame stays untouched.
mpg_cleaned = mpg.dropna(axis='index', how='any')

In [21]:
# Fit an ordinary least squares regression of mpg on six predictors plus an intercept
independent_vars = [
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
]

# Response variable
y = mpg_cleaned['mpg']

# Design matrix: the selected predictor columns with a constant (intercept) column added
X = sm.add_constant(mpg_cleaned[independent_vars])

model = sm.OLS(endog=y, exog=X).fit()

In [22]:
# Show the fitted model's regression summary table as plain text
print(model.summary().as_text())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.809
Model:                            OLS   Adj. R-squared:                  0.806
Method:                 Least Squares   F-statistic:                     272.2
Date:                Thu, 12 Oct 2023   Prob (F-statistic):          3.79e-135
Time:                        22:29:49   Log-Likelihood:                -1036.5
No. Observations:                 392   AIC:                             2087.
Df Residuals:                     385   BIC:                             2115.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          -14.5353      4.764     -3.051   

In [23]:
# Overall significance of the regression: the F-statistic and its p-value
f_statistic, f_p_value = model.fvalue, model.f_pvalue

# Coefficient-level p-values for the intercept ('const') and for 'cylinders'
intercept_p_value = model.pvalues.loc['const']
cylinders_p_value = model.pvalues.loc['cylinders']

# Report the three p-values, one per line
print(
    f"F-statistic p-value: {f_p_value}",
    f"Intercept p-value: {intercept_p_value}",
    f"Cylinders p-value: {cylinders_p_value}",
    sep="\n",
)

F-statistic p-value: 3.792336100722869e-135
Intercept p-value: 0.0024377410838650115
Cylinders p-value: 0.32121686784162057


In [25]:
# Standard error of the regression computed by hand from the fitted model:
#   sqrt( sum of squared residuals / residual degrees of freedom )
residuals = model.resid
dof_residuals = model.df_resid

# Use the vectorized .sum() reduction instead of Python's built-in sum(),
# which would walk the residuals one element at a time in the interpreter.
# (The two can differ in the last floating-point digit because the
# summation order is not the same.)
std_error_regression = ((residuals ** 2).sum() / dof_residuals) ** 0.5

# Bare last expression so the notebook displays the value
std_error_regression

3.4352440033999767

In [None]:
# "model.scale" gives you the estimate of the variance of errors

In [36]:
# Standard error of the regression taken directly from the fitted model:
# the square root of model.scale (the model's estimate of the error variance)
import numpy as np

error_variance = model.scale
std_error_regression = np.sqrt(error_variance)

print(f"Standard Error of Regression: {std_error_regression}")

Standard Error of Regression: 3.435244003399977


In [27]:
import statsmodels.api as sm

# Lasso regression: elastic-net regularized OLS with a pure L1 penalty
# (L1_wt=1) and penalty weight alpha=1, fitted on the same y / X as the OLS model.
# NOTE(review): a scalar alpha is applied to every column of X, which includes
# the 'const' intercept column, and the predictors are on very different
# scales — confirm that penalizing the intercept / unscaled features is intended.
lasso_model = sm.OLS(y, X).fit_regularized(alpha=1, L1_wt=1)

# Extract the coefficients
coefficients = lasso_model.params

# Label the coefficients by design-matrix column so the intercept can be
# identified by name whether `params` comes back as a Series or a plain array.
coef_by_name = pd.Series(coefficients, index=X.columns)

# Count the predictors that survive the L1 penalty. The intercept ('const')
# is not a variable, so it is left out of the count; previously a non-zero
# intercept would have been counted as one of the "variables".
predictor_coefficients = coef_by_name.drop('const', errors='ignore')
num_variables_in_model = (predictor_coefficients != 0).sum()

print(f"Number of variables in the Lasso model: {num_variables_in_model}")


Number of variables in the Lasso model: 4


In [28]:
from sklearn.model_selection import train_test_split

# Read the Framingham CSV and keep only the rows without any missing value
data = pd.read_csv('shared/framingham.csv').dropna()

# Target is the 'TenYearCHD' column; every remaining column is used as a feature
y = data['TenYearCHD']
X = data.drop(columns='TenYearCHD')

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)


In [29]:
# Add a constant (intercept) column to both the training and the test features
import statsmodels.api as sm

# NOTE(review): the StandardScaler cell later in this notebook is fitted on
# this frame. A constant column has zero variance, so standardizing presumably
# turns it into all zeros, and sklearn's LogisticRegression fits its own
# intercept by default — confirm this column is still needed.
X_train, X_test = (sm.add_constant(split) for split in (X_train, X_test))

In [30]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
# Train a logistic regression classifier on the training split
from sklearn.linear_model import LogisticRegression

# Estimator with all-default settings
logistic_model = LogisticRegression()

# Learn the coefficients from the training data; fit() stays the cell's
# last expression so the notebook displays the fitted estimator
logistic_model.fit(X=X_train, y=y_train)

In [32]:
# Evaluate the fitted classifier on the held-out test set
from sklearn.metrics import accuracy_score

# predict() returns hard class labels (for a binary target this corresponds
# to a 0.5 probability cutoff)
y_pred = logistic_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# In the f-string, {accuracy:.5f} is the placeholder for the accuracy value;
# the ':.5f' format spec prints it in fixed-point notation with five decimals
print(f"Accuracy on the test set: {accuracy:.5f}")

Accuracy on the test set: 0.85558
