In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
import statsmodels.api as sm

# Load the dataset; the path is relative to the notebook's working directory.
diabetes_df = pd.read_csv('diabetes.csv')

# Univariate Analysis Option A
# One row per column: the describe() statistics, extended with the first mode,
# variance, skewness and kurtosis. Each extra statistic is a Series indexed by
# column name, so assign() aligns it with the transposed describe() table.
# NOTE(review): the printed summary shows a minimum of 0 for Glucose,
# BloodPressure and BMI - presumably placeholders for missing measurements;
# confirm before relying on these statistics.
univariate_analysis = diabetes_df.describe().T.assign(
    mode=diabetes_df.mode().iloc[0],  # first row of mode() = first mode of each column
    variance=diabetes_df.var(),
    skewness=diabetes_df.skew(),
    kurtosis=diabetes_df.kurtosis(),
)
print('\nUnivariate Analysis Option A:')
print(univariate_analysis)


# Bivariate Analysis - Linear Regression Modeling
# Least-squares fit of Outcome on Glucose, BMI and Age, scored by mean squared
# error on a 20% hold-out split (fixed seed, so the split is repeatable).
X_linear = diabetes_df[['Glucose', 'BMI', 'Age']]
y_linear = diabetes_df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X_linear,
    y_linear,
    test_size=0.2,
    random_state=42,
)
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_predictions = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_test, linear_predictions)
print(f"\nLinear Regression Mean Squared Error: {mse_linear:.2f}")

# Bivariate Analysis - Logistic Regression Modeling
# Trained and scored on the same X_train/X_test split that was created for the
# linear regression section; reports accuracy on the held-out rows.
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
logistic_predictions = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, logistic_predictions)
print(f"\nLogistic Regression Accuracy: {accuracy_logistic:.2f}")

# Multiple Regression Analysis
# OLS of Outcome on every other column, fitted with statsmodels so the full
# coefficient summary can be printed; scored by MSE on a 20% hold-out split.
X_multiple, y_multiple = diabetes_df.drop('Outcome', axis=1), diabetes_df['Outcome']
X_train_multiple, X_test_multiple, y_train_multiple, y_test_multiple = train_test_split(
    X_multiple, y_multiple, test_size=0.2, random_state=42
)
# has_constant='add' forces the intercept column onto both splits. The default
# ('skip') adds nothing when a split already contains a zero-variance column,
# which would leave the train and test design matrices with different columns.
X_train_multiple = sm.add_constant(X_train_multiple, has_constant='add')
X_test_multiple = sm.add_constant(X_test_multiple, has_constant='add')
multiple_model = sm.OLS(y_train_multiple, X_train_multiple).fit()
mse_multiple = mean_squared_error(y_test_multiple, multiple_model.predict(X_test_multiple))
print('\nMultiple Regression Model:')
print(multiple_model.summary())
print(f'Mean Squared Error: {mse_multiple:.2f}')



Univariate Analysis Option A:
                          count        mean         std     min       25%  \
Pregnancies               768.0    3.845052    3.369578   0.000   1.00000   
Glucose                   768.0  120.894531   31.972618   0.000  99.00000   
BloodPressure             768.0   69.105469   19.355807   0.000  62.00000   
SkinThickness             768.0   20.536458   15.952218   0.000   0.00000   
Insulin                   768.0   79.799479  115.244002   0.000   0.00000   
BMI                       768.0   31.992578    7.884160   0.000  27.30000   
DiabetesPedigreeFunction  768.0    0.471876    0.331329   0.078   0.24375   
Age                       768.0   33.240885   11.760232  21.000  24.00000   
Outcome                   768.0    0.348958    0.476951   0.000   0.00000   

                               50%        75%     max    mode      variance  \
Pregnancies                 3.0000    6.00000   17.00   1.000     11.354056   
Glucose                   117.0000  140.