# Explore here

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Load data
total_data = pd.read_csv(
    "https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv", sep=","
)
print("Initial Data Preview:")
print(total_data.head())  # Print initial data preview

# Drop duplicates
total_data = total_data.drop_duplicates().reset_index(drop=True)
print("\nData After Removing Duplicates:")
print(total_data.head())  # Print data after removing duplicates

# Identify numeric columns
numeric_columns = [
    col for col in total_data.select_dtypes(include=["float64", "int64"]).columns if col != "Heart disease_number"
]
print("\nNumeric Columns Identified:")
print(numeric_columns)

Initial Data Preview:
   fips  TOT_POP    0-9  0-9 y/o % of total pop  19-Oct  \
0  1001    55601   6787               12.206615    7637   
1  1003   218022  24757               11.355276   26913   
2  1005    24881   2732               10.980266    2960   
3  1007    22400   2456               10.964286    2596   
4  1009    57840   7095               12.266598    7570   

   10-19 y/o % of total pop  20-29  20-29 y/o % of total pop  30-39  \
0                 13.735364   6878                 12.370281   7089   
1                 12.344167  23579                 10.814964  25213   
2                 11.896628   3268                 13.134520   3201   
3                 11.589286   3029                 13.522321   3113   
4                 13.087828   6742                 11.656293   6884   

   30-39 y/o % of total pop  ...  COPD_number  diabetes_prevalence  \
0                 12.749771  ...         3644                 12.9   
1                 11.564429  ...        14692           

In [4]:
# Split data into train and test sets
X = total_data[numeric_columns]
y = total_data["Heart disease_number"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining and Testing Sets Created:")
print(f"Training Set Size: {len(X_train)}, Testing Set Size: {len(X_test)}")

In [5]:
# Scale data after split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for compatibility
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("\nData After Scaling:")
print("Scaled Training Data (First 5 Rows):")
print(X_train.head())


Data After Scaling:
Scaled Training Data (First 5 Rows):
          fips   TOT_POP       0-9  0-9 y/o % of total pop    19-Oct  \
1292 -0.301633 -0.229763 -0.225393                0.102383 -0.231350   
2302  0.761573 -0.161280 -0.179851               -0.754597 -0.181109   
761  -0.833037 -0.198764 -0.209983               -0.717144 -0.195009   
2194  0.629287 -0.048115 -0.049041                0.120407 -0.029705   
1241 -0.308413  0.070012  0.073864                0.252809  0.080526   

      10-19 y/o % of total pop     20-29  20-29 y/o % of total pop     30-39  \
1292                  0.162374 -0.229775                 -0.429454 -0.223780   
2302                 -0.836073 -0.188375                 -0.736296 -0.176225   
761                   0.464170 -0.168470                  1.320194 -0.194740   
2194                  0.576280 -0.067671                  0.094875 -0.062335   
1241                  0.234535  0.038603                  0.173362  0.025430   

      30-39 y/o % of total p

In [6]:
# Feature selection using SelectKBest
k = int(len(X_train.columns) * 0.3)  # Selecting top 30% features
selection_model = SelectKBest(score_func=f_regression, k=k)
X_train_sel = selection_model.fit_transform(X_train, y_train)
X_test_sel = selection_model.transform(X_test)

# Retrieve selected feature names
selected_features = X_train.columns[selection_model.get_support()]
X_train_sel = pd.DataFrame(X_train_sel, columns=selected_features, index=X_train.index)
X_test_sel = pd.DataFrame(X_test_sel, columns=selected_features, index=X_test.index)

print("\nSelected Features (Top 30%):")
print(selected_features)


Selected Features (Top 30%):
Index(['TOT_POP', '0-9', '19-Oct', '20-29', '30-39', '40-49', '50-59', '60-69',
       '70-79', '80+', 'White-alone pop', 'POP_ESTIMATE_2018',
       'High school diploma only 2014-18',
       'Some college or associate's degree 2014-18',
       'Bachelor's degree or higher 2014-18', 'POVALL_2018',
       'Civilian_labor_force_2018', 'Employed_2018', 'Unemployed_2018',
       'Total nurse practitioners (2019)', 'Total physician assistants (2019)',
       'Family Medicine/General Practice Primary Care (2019)',
       'Total Specialist Physicians (2019)', 'Total Population',
       'Population Aged 60+', 'county_pop2018_18 and older',
       'anycondition_number', 'Obesity_number', 'COPD_number',
       'diabetes_number', 'CKD_number'],
      dtype='object')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Load data
total_data = pd.read_csv(
    "https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv", sep=","
)
print("Initial Data Preview:")
print(total_data.head())  # Print initial data preview

# Drop duplicates
total_data = total_data.drop_duplicates().reset_index(drop=True)
print("\nData After Removing Duplicates:")
print(total_data.head())  # Print data after removing duplicates

# Identify numeric columns
numeric_columns = [
    col for col in total_data.select_dtypes(include=["float64", "int64"]).columns if col != "Heart disease_number"
]
print("\nNumeric Columns Identified:")
print(numeric_columns)
Split Data (Cell 2)
python
Copy code
# Split data into train and test sets
X = total_data[numeric_columns]
y = total_data["Heart disease_number"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining and Testing Sets Created:")
print(f"Training Set Size: {len(X_train)}, Testing Set Size: {len(X_test)}")
Scale Data (Cell 3)
python
Copy code
# Scale data after split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for compatibility
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("\nData After Scaling:")
print("Scaled Training Data (First 5 Rows):")
print(X_train.head())
Feature Selection (Cell 4)
python
Copy code
# Feature selection using SelectKBest
k = int(len(X_train.columns) * 0.3)  # Selecting top 30% features
selection_model = SelectKBest(score_func=f_regression, k=k)
X_train_sel = selection_model.fit_transform(X_train, y_train)
X_test_sel = selection_model.transform(X_test)

# Retrieve selected feature names
selected_features = X_train.columns[selection_model.get_support()]
X_train_sel = pd.DataFrame(X_train_sel, columns=selected_features, index=X_train.index)
X_test_sel = pd.DataFrame(X_test_sel, columns=selected_features, index=X_test.index)

print("\nSelected Features (Top 30%):")
print(selected_features)
Save Clean Data (Cell 5)
python
Copy code
# Add target back to selected features
X_train_sel["Heart disease_number"] = y_train.values
X_test_sel["Heart disease_number"] = y_test.values

# Save clean data
X_train_sel.to_csv("clean_train.csv", index=False)
X_test_sel.to_csv("clean_test.csv", index=False)

print("\nClean Data Saved as CSV:")
print("Clean Training Data (First 5 Rows):")
print(X_train_sel.head())
print("\nClean Testing Data (First 5 Rows):")
print(X_test_sel.head())

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score
# Reload cleaned datasets
train_data = pd.read_csv("clean_train.csv")
test_data = pd.read_csv("clean_test.csv")

# Split into features and target
X_train = train_data.drop(columns=["Heart disease_number"])
y_train = train_data["Heart disease_number"]
X_test = test_data.drop(columns=["Heart disease_number"])
y_test = test_data["Heart disease_number"]

# Logistic Regression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

print("\nLogistic Regression Results:")
print(f"Intercept: {logistic_model.intercept_[0]}")
print(f"Coefficients: {logistic_model.coef_}")
y_pred = logistic_model.predict(X_test)
print(f"Predicted Values: {y_pred[:10]}")  # Print first 10 predictions
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")

# Lasso Regression
lasso_model = Lasso(alpha=1.0, max_iter=10000)
lasso_model.fit(X_train, y_train)

print("\nLasso Regression Results:")
print(f"Coefficients: {lasso_model.coef_}")
print(f"R2 Score: {lasso_model.score(X_test, y_test):.4f}")


Logistic Regression Results:
Intercept: -0.3684868274176662
Coefficients: [[-0.09014269 -0.08561096 -0.091637   ... -0.13421979 -0.09989719
  -0.10097791]
 [-0.09031858 -0.08627351 -0.0922898  ... -0.13425841 -0.09981004
  -0.10069553]
 [-0.08983689 -0.08546314 -0.09149974 ... -0.13381272 -0.09949631
  -0.10053073]
 ...
 [ 0.25987718  0.31372848  0.36919197 ...  0.36268399 -0.02134901
   0.30660399]
 [ 0.23608456  0.20329934  0.16767819 ...  0.356965    0.16344762
   0.29957922]
 [ 0.28313754  0.20553798  0.22332863 ...  0.13754288  0.42226355
   0.30486028]]
Predicted Values: [1072 8689 1072 8689 8689 1072 1072 1072 1072 1072]
MSE: 9905227.1242
R2 Score: 0.8868

Lasso Regression Results:
Coefficients: [ 0.00000000e+00  3.19887899e+03 -8.14635860e+02  0.00000000e+00
 -1.86039880e+02  0.00000000e+00  1.53231720e+03  0.00000000e+00
  2.21917415e+03  7.41749743e+02  1.74495589e+03  0.00000000e+00
  1.33193851e+03  0.00000000e+00  1.81732677e+03  1.11131522e+03
  0.00000000e+00  6.8203057