<a href="https://colab.research.google.com/github/CaptainLight5/Coronary-heart-disease-prediction/blob/main/life_expectantcy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install PySpark
!pip install pyspark



In [44]:
import os

# Set JAVA_HOME environment variable
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] += ":/usr/lib/jvm/java-8-openjdk-amd64/bin"

In [45]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Initialize Spark session
spark = SparkSession.builder.appName("LifeExpectancyPrediction").getOrCreate()

# Load dataset
data = spark.read.csv("Life Expectancy Data.csv", header=True, inferSchema=True)
data.show(5)
data.printSchema()

+-----------+----+----------+----------------+---------------+-------------+-------+----------------------+-----------+--------+-----+------------------+-----+-----------------+-----------+---------+----------+-----------+---------------------+-------------------+-------------------------------+---------+
|    Country|Year|    Status|Life expectancy |Adult Mortality|infant deaths|Alcohol|percentage expenditure|Hepatitis B|Measles | BMI |under-five deaths |Polio|Total expenditure|Diphtheria | HIV/AIDS|       GDP| Population| thinness  1-19 years| thinness 5-9 years|Income composition of resources|Schooling|
+-----------+----+----------+----------------+---------------+-------------+-------+----------------------+-----------+--------+-----+------------------+-----+-----------------+-----------+---------+----------+-----------+---------------------+-------------------+-------------------------------+---------+
|Afghanistan|2015|Developing|            65.0|            263|           62|   

In [46]:
# Section 1: Data Preprocessing
# Drop rows with missing target values
data = data.na.drop(subset=["Life expectancy "])

# Impute missing values in other columns with median
numeric_cols = [c for c, t in data.dtypes if t in ['int', 'double']] # This filters for numeric columns
for col_name in numeric_cols: # this iterates and impute on numeric columns
    median_val = data.approxQuantile(col_name, [0.5], 0.0)[0]
    data = data.na.fill({col_name: median_val})

# Convert 'Status' to numerical
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="Status", outputCol="Status_Index")
data = indexer.fit(data).transform(data)

In [33]:
# Section 2: Feature Engineering
# Binarize Life Expectancy for Classification
# Binarize the target variable for classification
life_expectancy_threshold = 70  # here threshold was set for classification
data = data.withColumn("Life_Expectancy_Binary", when(col("Life expectancy ") >= life_expectancy_threshold, 1).otherwise(0))

# Vectorize Features
feature_columns = [c for c in data.columns if c not in ["Life expectancy", "Life_Expectancy_Binary", "Country", "Status"]]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_unscaled")
data = assembler.transform(data)

# Standardize Features
scaler = StandardScaler(inputCol="features_unscaled", outputCol="features")
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

# Feature Extraction: PCA
pca = PCA(k=10, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(data)
data = pca_model.transform(data)

In [34]:
# Section 3: Splitting the Data
train, test = data.randomSplit([0.8, 0.2], seed=42)


In [58]:
# Section 4: Classification Models
print("\n--- Classification Models ---")
classifiers = {
    "Logistic Regression": LogisticRegression(featuresCol="pca_features", labelCol="Life_Expectancy_Binary"),
    "Random Forest": RandomForestClassifier(featuresCol="pca_features", labelCol="Life_Expectancy_Binary"),
    "Decision Tree": DecisionTreeClassifier(featuresCol="pca_features", labelCol="Life_Expectancy_Binary"),
    "Gradient Boosting": GBTClassifier(featuresCol="pca_features", labelCol="Life_Expectancy_Binary")
}

classification_results = {}

for name, model in classifiers.items():
    print(f"\nTraining {name} (Classification)...")

    # Conditional parameter grid building
    if name in ["Random Forest", "Decision Tree", "Gradient Boosting"]:  # Apply to tree-based models only
        param_grid = ParamGridBuilder().addGrid(model.maxDepth, [5, 10]).addGrid(model.maxBins, [32, 64]).build()
    else:  # For Logistic Regression, use an empty parameter grid
        param_grid = ParamGridBuilder().build()

    crossval = CrossValidator(estimator=model, estimatorParamMaps=param_grid,
                              evaluator=BinaryClassificationEvaluator(labelCol="Life_Expectancy_Binary"),
                              numFolds=5)
    cv_model = crossval.fit(train)
    predictions = cv_model.transform(test)

    # Evaluate metrics
    evaluator = MulticlassClassificationEvaluator(labelCol="Life_Expectancy_Binary")
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    roc_auc = BinaryClassificationEvaluator(labelCol="Life_Expectancy_Binary", metricName="areaUnderROC").evaluate(predictions)

    # Collect results
    classification_results[name] = {
        "Model": cv_model,
        "Accuracy": accuracy,
        "Precision": precision,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    }
    print(f"{name} - Accuracy: {accuracy}, Precision: {precision}, F1 Score: {f1}, ROC AUC: {roc_auc}")

# Print Classification Results Summary
print("\n--- Classification Results Summary ---")
for model_name, metrics in classification_results.items():
    print(f"{model_name}:")
    print(f"  Accuracy: {metrics['Accuracy']}")
    print(f"  Precision: {metrics['Precision']}")
    print(f"  F1 Score: {metrics['F1 Score']}")
    print(f"  ROC AUC: {metrics['ROC AUC']}")



--- Classification Models ---

Training Logistic Regression (Classification)...
Logistic Regression - Accuracy: 0.9364485981308411, Precision: 0.936864850533572, F1 Score: 0.9363251755636517, ROC AUC: 0.9815904292751584

Training Random Forest (Classification)...
Random Forest - Accuracy: 0.9289719626168225, Precision: 0.929179664580384, F1 Score: 0.9288657022509629, ROC AUC: 0.9844334975369455

Training Decision Tree (Classification)...
Decision Tree - Accuracy: 0.8953271028037383, Precision: 0.8953285295619007, F1 Score: 0.8952141719285583, ROC AUC: 0.9323926812104151

Training Gradient Boosting (Classification)...
Gradient Boosting - Accuracy: 0.9214953271028037, Precision: 0.9226318559365414, F1 Score: 0.9212241077191927, ROC AUC: 0.9781562280084446

--- Classification Results Summary ---
Logistic Regression:
  Accuracy: 0.9364485981308411
  Precision: 0.936864850533572
  F1 Score: 0.9363251755636517
  ROC AUC: 0.9815904292751584
Random Forest:
  Accuracy: 0.9289719626168225
  Pre

In [59]:
# Section 5: Regression Models
print("\n--- Regression Models ---")
regressors = {
    "Random Forest Regression": RandomForestRegressor(featuresCol="pca_features", labelCol="Life expectancy "),
    "Gradient Boosting Regression": GBTRegressor(featuresCol="pca_features", labelCol="Life expectancy ")
}

regression_results = {}

for name, model in regressors.items():
    print(f"\nTraining {name} (Regression)...")
    param_grid = ParamGridBuilder().addGrid(model.maxDepth, [5, 10]).addGrid(model.maxBins, [32, 64]).build()
    crossval = CrossValidator(estimator=model, estimatorParamMaps=param_grid,
                              evaluator=RegressionEvaluator(labelCol="Life expectancy "),
                              numFolds=5)
    cv_model = crossval.fit(train)
    predictions = cv_model.transform(test)

    # Evaluate metrics
    evaluator = RegressionEvaluator(labelCol="Life expectancy ")
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

    # Collect results
    regression_results[name] = {
        "Model": cv_model,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2
    }
    print(f"{name} - MSE: {mse}, RMSE: {rmse}, R2: {r2}")

# Print Regression Results Summary
print("\n--- Regression Results Summary ---")
for model_name, metrics in regression_results.items():
    print(f"{model_name}:")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  RMSE: {metrics['RMSE']}")
    print(f"  R²: {metrics['R2']}")


--- Regression Models ---

Training Random Forest Regression (Regression)...
Random Forest Regression - MSE: 7.656071741662913, RMSE: 2.766960740896573, R2: 0.9227878667243148

Training Gradient Boosting Regression (Regression)...
Gradient Boosting Regression - MSE: 11.057201076602523, RMSE: 3.32523699555423, R2: 0.8884871887319531

--- Regression Results Summary ---
Random Forest Regression:
  MSE: 7.656071741662913
  RMSE: 2.766960740896573
  R²: 0.9227878667243148
Gradient Boosting Regression:
  MSE: 11.057201076602523
  RMSE: 3.32523699555423
  R²: 0.8884871887319531
