In [8]:
import pandas as pd
import numpy as np
from scipy.stats import mode, skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Load the Pima Indians Diabetes dataset from your local file
file_path = "diabetes.csv"  # Replace with the actual path to your downloaded dataset
names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv(file_path, names=names)

# Ensure "Outcome" column is numeric (if it's not already)
df["Outcome"] = pd.to_numeric(df["Outcome"], errors="coerce")

# Check data types of other columns and convert them to numeric if necessary
numeric_columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")

df = df.dropna()

# a. Univariate Analysis
# Frequency
frequency = df['Outcome'].value_counts()
print("Frequency:\n", frequency)

# Mean, Median, Mode, Variance, Standard Deviation, Skewness, and Kurtosis
mean = df.mean()
median = df.median()
variance = df.var()
std_deviation = df.std()
mode_result = mode(df['Outcome'])
skewness = skew(df)
kurt = kurtosis(df)

print("Mean:\n", mean)
print("Median:\n", median)
print("Mode:\n", mode_result)
print("Variance:\n", variance)
print("Standard Deviation:\n", std_deviation)
print("Skewness:\n", skewness)
print("Kurtosis:\n", kurt)





Frequency:
 0.0    500
1.0    268
Name: Outcome, dtype: int64
Mean:
 Pregnancies                   3.845052
Glucose                     120.894531
BloodPressure                69.105469
SkinThickness                20.536458
Insulin                      79.799479
BMI                          31.992578
DiabetesPedigreeFunction      0.471876
Age                          33.240885
Outcome                       0.348958
dtype: float64
Median:
 Pregnancies                   3.0000
Glucose                     117.0000
BloodPressure                72.0000
SkinThickness                23.0000
Insulin                      30.5000
BMI                          32.0000
DiabetesPedigreeFunction      0.3725
Age                          29.0000
Outcome                       0.0000
dtype: float64
Mode:
 ModeResult(mode=array([0.]), count=array([500]))
Variance:
 Pregnancies                    11.354056
Glucose                      1022.248314
BloodPressure                 374.647271
SkinThickness     

  mode_result = mode(df['Outcome'])
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# b. Bivariate Analysis
# Split data into features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split data into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Logistic Regression
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)

# Predictions
linear_predictions = linear_reg.predict(X_test)
logistic_predictions = logistic_reg.predict(X_test)

# Calculate Mean Squared Error for Linear Regression
mse_linear = mean_squared_error(y_test, linear_predictions)
print("Linear Regression MSE:", mse_linear)

# Calculate Accuracy for Logistic Regression
accuracy_logistic = accuracy_score(y_test, logistic_predictions)
print("Logistic Regression Accuracy:", accuracy_logistic)


Linear Regression MSE: 0.17104527280850096
Logistic Regression Accuracy: 0.7467532467532467


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# c. Multiple Regression Analysis
# For multiple regression analysis, you can include more predictor variables in the model and analyze their impact on the 'Outcome'.

# Example:
multiple_reg = LinearRegression()
multiple_reg.fit(X_train, y_train)
multiple_predictions = multiple_reg.predict(X_test)
mse_multiple = mean_squared_error(y_test, multiple_predictions)
print("Multiple Regression MSE:", mse_multiple)

Multiple Regression MSE: 0.17104527280850096


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.stats import skew, kurtosis

# Load the datasets (replace with the actual dataset files)
diabetes_data = pd.read_csv("diabetes_dataset.csv")
pima_data = pd.read_csv("pima_indians_diabetes_dataset.csv")

# Univariate analysis
def univariate_analysis(data):
    # Frequency for categorical variables
    frequency = data['categorical_column'].value_counts()

    # Mean, median, mode, variance, standard deviation, skewness, and kurtosis for numerical variables
    mean = data['numerical_column'].mean()
    median = data['numerical_column'].median()
    mode_value, _ = mode(data['numerical_column'])
    variance = data['numerical_column'].var()
    std_deviation = data['numerical_column'].std()
    skewness = skew(data['numerical_column'])
    kurt = kurtosis(data['numerical_column'])

    # Display or store these statistics as needed
    print("Frequency:")
    print(frequency)
    print("\nMean:", mean)
    print("Median:", median)
    print("Mode:", mode_value[0])
    print("Variance:", variance)
    print("Standard Deviation:", std_deviation)
    print("Skewness:", skewness)
    print("Kurtosis:", kurt)

# Bivariate analysis - Linear Regression
def linear_regression_analysis(data):
    X = data[['feature1', 'feature2', ...]]  # Replace with actual feature names
    y = data['target']
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)

    # Display or store regression results
    print("Linear Regression MSE:", mse)

# Bivariate analysis - Logistic Regression
def logistic_regression_analysis(data):
    X = data[['feature1', 'feature2', ...]]  # Replace with actual feature names
    y = data['target']
    model = LogisticRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)

    # Display or store logistic regression results
    print("Logistic Regression Accuracy:", accuracy)

# Call the univariate analysis functions for both datasets
univariate_analysis(diabetes_data)
univariate_analysis(pima_data)

# Call the linear regression analysis for both datasets
linear_regression_analysis(diabetes_data)
linear_regression_analysis(pima_data)

# Call the logistic regression analysis for both datasets
logistic_regression_analysis(diabetes_data)
logistic_regression_analysis(pima_data)
