# **Homework 3**

In [43]:
!pip install pyspark
# !pip install --upgrade pyspark
# !pip install pyspark
!pip install scrapy



In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, mean, median, udf
from pyspark.sql.functions import col, when, lit, regexp_replace
from pyspark.sql.types import FloatType, StringType, StructType, StructField

from pyspark.sql import functions as F

from scrapy import Selector
import requests

In [45]:
# Initialize Spark session
spark = SparkSession.builder.appName("HeartDiseasePrediction").getOrCreate()

In [46]:
# Load data from S3
# data_path = "s3a://your-bucket-name/your-dataset.csv"
data_path = "heart_disease.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)
df.show()

+---+---+-------+--------+-------+-------+---+--------+---+----+-----+----+-----+---+----+-------+-------+-----+----------+-----+---+----+----+---+--------+-----+-------+--------+----+-------+--------+--------+--------+-----+--------+-----+-----+-------+-----+-----+------+---+-------+-------+------+------+------+------+----+-------+-------+-------+---+----+---+------+
|age|sex|painloc|painexer|relrest|pncaden| cp|trestbps|htn|chol|smoke|cigs|years|fbs|  dm|famhist|restecg|ekgmo|ekgday(day|ekgyr|dig|prop|nitr|pro|diuretic|proto|thaldur|thaltime| met|thalach|thalrest|tpeakbps|tpeakbpd|dummy|trestbpd|exang|xhypo|oldpeak|slope|rldv5|rldv5e| ca|restckm|exerckm|restef|restwm|exeref|exerwm|thal|thalsev|thalpul|earlobe|cmo|cday|cyr|target|
+---+---+-------+--------+-------+-------+---+--------+---+----+-----+----+-----+---+----+-------+-------+-----+----------+-----+---+----+----+---+--------+-----+-------+--------+----+-------+--------+--------+--------+-----+--------+-----+-----+-------+----

In [47]:
# choose a subset of the data with only the given columns in the assigment
col_subset = [["age", "sex", "painloc", "painexer", "cp", "trestbps", "smoke", "fbs", "prop", "nitr",
         "pro", "diuretic", "thaldur", "thalach", "exang", "oldpeak", "slope", "target"]]

df = df.select(*col_subset)
df.show()

+---+---+-------+--------+---+--------+-----+---+----+----+---+--------+-------+-------+-----+-------+-----+------+
|age|sex|painloc|painexer| cp|trestbps|smoke|fbs|prop|nitr|pro|diuretic|thaldur|thalach|exang|oldpeak|slope|target|
+---+---+-------+--------+---+--------+-----+---+----+----+---+--------+-------+-------+-----+-------+-----+------+
| 63|  1|   NULL|    NULL|  1|     145| NULL|  1|   0|   0|  0|       0|   10.5|    150|    0|    2.3|    3|     0|
| 67|  1|   NULL|    NULL|  4|     160| NULL|  0|   1|   0|  0|       0|    9.5|    108|    1|    1.5|    2|     1|
| 67|  1|   NULL|    NULL|  4|     120| NULL|  0|   1|   0|  0|       0|    8.5|    129|    1|    2.6|    2|     1|
| 37|  1|   NULL|    NULL|  3|     130| NULL|  0|   1|   0|  0|       0|   13.0|    187|    0|    3.5|    3|     0|
| 41|  0|   NULL|    NULL|  2|     130| NULL|  0|   0|   0|  0|       0|    7.0|    172|    0|    1.4|    1|     0|
| 56|  1|   NULL|    NULL|  2|     120| NULL|  0|   0|   0|  0|       0|

# Data Cleaning

In [48]:
## data cleaning

def calculate_mode(df, column):
    mode_df = df.groupBy(column).count().orderBy('count', ascending=False).first()
    return mode_df[0] if mode_df else None


def clean_data(df):
    # Convert age to numeric and replace invalid values
    df = df.withColumn('age', col('age').cast('float'))
    median_age = df.filter(df['age'] > 0).approxQuantile('age', [0.5], 0.0)[0]
    df = df.withColumn('age', when(col('age') <= 0, median_age).otherwise(col('age')))

    # Replace missing values in painloc and painexer with mode
    mode_painloc = calculate_mode(df, 'painloc')
    if mode_painloc is not None:
        df = df.fillna({"painloc": mode_painloc})

    # mode_painexer = calculate_mode(df, 'painexer')
    # if mode_painexer is not None:
    #     df = df.fillna({"painexer": mode_painexer})
    mean_painexer = df.select(mean("painexer")).withColumnRenamed("avg(painexer)", "painexer").collect()[0][0]
    df = df.fillna({"painexer": mean_painexer})

    # Replace values less than 100 in trestbps with 100
    df = df.withColumn("trestbps", when(col("trestbps") < 100, 100).otherwise(col("trestbps")))

    # Replace invalid values in oldpeak
    df = df.withColumn("oldpeak", when(col("oldpeak") < 0, 0).otherwise(col("oldpeak")))
    df = df.withColumn("oldpeak", when(col("oldpeak") > 4, 4).otherwise(col("oldpeak")))

    # Replace missing values in thaldur and thalach with mean
    mean_thaldur = df.select(mean("thaldur")).collect()[0][0]
    mean_thalach = df.select(mean("thalach")).collect()[0][0]
    df = df.fillna({"thaldur": mean_thaldur, "thalach": mean_thalach})

    # Replace missing values and values greater than 1 in fbs, prop, nitr, pro, diuretic with mode
    for column in ['fbs', 'prop', 'nitr', 'pro', 'diuretic']:
        mode_value = calculate_mode(df, column)
        if mode_value is not None:
            df = df.fillna({column: mode_value})
            df = df.withColumn(column, when(col(column) > 1, mode_value).otherwise(col(column)))

    # Replace missing values in exang and slope with mode
    for column in ['exang', 'slope']:
        mode_value = calculate_mode(df, column)
        if mode_value is not None:
            df = df.withColumn(column, when(col(column).isNull(), lit(mode_value)).otherwise(col(column)))

    # Replace missing values in target with mode
    mode_target = calculate_mode(df, 'target')
    if mode_target is not None:
        df = df.withColumn('target', when(col('target').isNull(), lit(mode_target)).otherwise(col('target')))

    return df

# Apply the cleaning function
df_cleaned = clean_data(df)
df_cleaned.show()

# Save the modified DataFrame to a new CSV file
df_cleaned.write.mode("overwrite").csv("heart_disease_cleaned.csv", header=True)

+----+---+-------+--------+---+--------+-----+---+----+----+---+--------+-------+-------+-----+-------+-----+------+
| age|sex|painloc|painexer| cp|trestbps|smoke|fbs|prop|nitr|pro|diuretic|thaldur|thalach|exang|oldpeak|slope|target|
+----+---+-------+--------+---+--------+-----+---+----+----+---+--------+-------+-------+-----+-------+-----+------+
|63.0|  1|      1|       0|  1|     145| NULL|  1|   0|   0|  0|       0|   10.5|    150|    0|    2.3|    3|     0|
|67.0|  1|      1|       0|  4|     160| NULL|  0|   1|   0|  0|       0|    9.5|    108|    1|    1.5|    2|     1|
|67.0|  1|      1|       0|  4|     120| NULL|  0|   1|   0|  0|       0|    8.5|    129|    1|    2.6|    2|     1|
|37.0|  1|      1|       0|  3|     130| NULL|  0|   1|   0|  0|       0|   13.0|    187|    0|    3.5|    3|     0|
|41.0|  0|      1|       0|  2|     130| NULL|  0|   0|   0|  0|       0|    7.0|    172|    0|    1.4|    1|     0|
|56.0|  1|      1|       0|  2|     120| NULL|  0|   0|   0|  0|

# Web Scraping

### Source #1

In [49]:

# URL of the page
url = 'https://www.abs.gov.au/statistics/health/health-conditions-and-risks/smoking-and-vaping/latest-release'

# Make a request to fetch the webpage
response = requests.get(url)
response.raise_for_status()

# html content
html_content = response.content
# html_content

full_sel = Selector(text=html_content)
tables = full_sel.xpath('//table[@class="responsive-enabled"]')

rows = tables[1].xpath('.//thead//tr | .//tbody//tr')

from typing import List
import re


# parse HTML rows and extract data from each cell
def parse_row(row: Selector) -> List[str]:
    """
    Parses an HTML row into a list of individual elements.
    """
    cells = row.xpath('.//th | .//td')
    row_data = []

    for cell in cells:
        cell_text = cell.xpath('.//text()').getall()
        cell_text = ' '.join(cell_text)  # Join all text nodes within the cell
        cell_text = cell_text.strip()  # Strip whitespace from the beginning and end
        cell_text = re.sub(r'\s+', ' ', cell_text)  # Replace multiple spaces with a single space
        cell_text = re.sub(r'<.*?>', ' ', cell_text)  # Remove remaining HTML tags
        cell_text = cell_text.replace('\xa0', ' ')  # Replace non-breaking space with regular space
        row_data.append(cell_text)

    return row_data

header = parse_row(rows[0])
table_data = [parse_row(row) for row in rows[1:]]

# Create a Spark DataFrame
smoking1_df = spark.createDataFrame(table_data, schema=header)
smoking1_df.show()

+-----------------+-----------+---------------------------------+----------------------------------+-----------+---------------------------------+----------------------------------+-----------+---------------------------------+----------------------------------+--------+---------------------------------+----------------------------------+
|                 |2011–12 (%)|95% confidence interval (%) (low)|95% confidence interval (%) (high)|2014–15 (%)|95% confidence interval (%) (low)|95% confidence interval (%) (high)|2017–18 (%)|95% confidence interval (%) (low)|95% confidence interval (%) (high)|2022 (%)|95% confidence interval (%) (low)|95% confidence interval (%) (high)|
+-----------------+-----------+---------------------------------+----------------------------------+-----------+---------------------------------+----------------------------------+-----------+---------------------------------+----------------------------------+--------+---------------------------------+-------------

### Source #2

In [50]:

# URL of the page
url = 'https://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/index.htm'

# Make a request to fetch the webpage
response = requests.get(url)
response.raise_for_status()

# html content
html_content = response.content

text = Selector(text=html_content)
div_list = text.xpath('//div[contains(@class, "card-body")]')

div_sex = text.xpath('//div[contains(@class, "card-body") and .//p[contains(text(), "Current cigarette smoking was higher among men than women")]]/ul')
by_sex_list = div_sex.xpath('.//li/text()')
by_sex_list.getall()

# print("---")

div_age = text.xpath('//div[contains(@class, "card-body") and .//p[contains(text(), "Current cigarette smoking was highest among people aged 25–44 years and 45–64 years. Current cigarette smoking was lowest among people aged 18-24 years")]]/ul')
by_age_list = div_age.xpath('.//li/text()')
by_age_list.getall()



['About 5 of every 100 adults aged 18–24 years (5.3%)',
 'Nearly 13 of every 100 adults aged 25–44 years (12.6%)',
 'Nearly 15 of every 100 adults aged 45–64 years (14.9%)',
 'About 8 of every 100 adults aged 65 years and older (8.3%)']

## fixing the "Smoke" column


In [51]:
# Mocked data for smoking rates
def impute_smoke(df):
    # Mocked data for smoking rates
    smoking_rates_by_age = {
        '18-24': 0.053,
        '25-44': 0.126,
        '45-64': 0.149,
        '65+': 0.083
    }
    smoking_rates_by_sex = {'Female': 0.101, 'Male': 0.131}

    age_bounds = {
        '15-17': (15, 17),
        '18-24': (18, 24),
        '25-34': (25, 34),
        '35-44': (35, 44),
        '45-54': (45, 54),
        '55-64': (55, 64),
        '65+': (65, 120)  # Assuming 65+ means up to very old ages
    }

    # Create UDFs for imputing smoke_source1 and smoke_source2
    def get_smoke_source1(age):
        age = float(age)
        for key, value in age_bounds.items():
            if value[0] <= age <= value[1]:
                return smoking_rates_by_age.get(key, None)
        return None

    def get_smoke_source2(age, sex):
        age = float(age)
        for key, value in age_bounds.items():
            if value[0] <= age <= value[1]:
                age_rate = smoking_rates_by_age.get(key, None)
                if age_rate is not None:
                    sex_rate = smoking_rates_by_sex['Male' if sex == 1.0 else 'Female']
                    rate = age_rate * (sex_rate / smoking_rates_by_sex['Female']) if sex == 1.0 else age_rate
                    return rate
        return None

    get_smoke_source1_udf = udf(get_smoke_source1, FloatType())
    get_smoke_source2_udf = udf(get_smoke_source2, FloatType())

    # Ensure the 'age' column is cast to float
    df = df.withColumn("age", col("age").cast("float"))

    # Apply the UDFs to add new columns
    df = df.withColumn("smoke_source1", get_smoke_source1_udf(col("age")))
    df = df.withColumn("smoke_source2", get_smoke_source2_udf(col("age"), col("sex")))

    return df


# Apply the impute_smoke function
df_imputed = impute_smoke(df_cleaned)

# Show the result
df_imputed.show()

# Save the modified DataFrame to a new CSV file
df_cleaned.write.mode("overwrite").csv("heart_disease_cleaned.csv", header=True)

+----+---+-------+--------+---+--------+-----+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------------+-------------+
| age|sex|painloc|painexer| cp|trestbps|smoke|fbs|prop|nitr|pro|diuretic|thaldur|thalach|exang|oldpeak|slope|target|smoke_source1|smoke_source2|
+----+---+-------+--------+---+--------+-----+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------------+-------------+
|63.0|  1|      1|       0|  1|     145| NULL|  1|   0|   0|  0|       0|   10.5|    150|    0|    2.3|    3|     0|         NULL|         NULL|
|67.0|  1|      1|       0|  4|     160| NULL|  0|   1|   0|  0|       0|    9.5|    108|    1|    1.5|    2|     1|        0.083|   0.10765347|
|67.0|  1|      1|       0|  4|     120| NULL|  0|   1|   0|  0|       0|    8.5|    129|    1|    2.6|    2|     1|        0.083|   0.10765347|
|37.0|  1|      1|       0|  3|     130| NULL|  0|   1|   0|  0|       0|   13.0|    187|    0|    3.5|    3|     0|         NULL|

In [52]:

def categorize_percent1(age: int) -> float:
    if age is None:
        return 0.0
    elif 15 <= age <= 17:
        return .016
    elif 18 <= age <= 24:
        return .073
    elif 25 <= age <= 34:
        return .109
    elif 35 <= age <= 44:
        return .109
    elif 45 <= age <= 54:
        return .138
    elif 55 <= age <= 64:
        return .149
    elif 65 <= age <= 74:
        return .087
    elif age >= 75:
        return .029
    else:
        return 0.0

get_smoking_udf = udf(lambda age: categorize_percent1(age), FloatType())
df_source1 = df_imputed.withColumn('age', col('age').cast('int'))
df_source1 = df_source1.filter(col('age').isNotNull())
df_source1 = df_source1.withColumn('smoke_source1', when(col('smoke').isin([0, 1]), col('smoke').cast(FloatType())).otherwise(get_smoking_udf(col('age'))))
# df_source1.show()

#source 2
def categorize_percent2(age: int, sex: int) -> float:
    if sex == 0:  # Female
        if 18 <= age <= 24:
            return .053
        elif 25 <= age <= 44:
            return .126
        elif 45 <= age <= 64:
            return .149
        elif age >= 65:
            return .083
    elif sex == 1:  # Male
        if 18 <= age <= 24:
            return round(.053 * (.131 / .101), 3)
        elif 25 <= age <= 44:
            return round(.126 * (.131 / .101), 3)
        elif 45 <= age <= 64:
            return round(.149 * (.131 / .101), 3)
        elif age >= 65:
            return round(.083 * (.131 / .101), 3)
    return None

get_smoking_udf = udf(lambda age, sex: categorize_percent2(age, sex), FloatType())
df_new = df_source1.withColumn('age', col('age').cast('int'))
df_new = df_new.withColumn('sex', col('sex').cast('int'))
df_new = df_new.withColumn('smoke_source2',
                                 when(col('smoke').isin([0, 1]), col('smoke').cast(FloatType()))
                                 .otherwise(get_smoking_udf(col('age'), col('sex'))))
df_new = df_new.drop('smoke')
# save the modified DataFrame
df_new.write.mode("overwrite").csv("heart_disease_cleaned.csv", header=True)
df_new.show()

Exception ignored in: <function JavaWrapper.__del__ at 0x7b143e203370>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7b143e203370>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7b143e203370>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has

+---+---+-------+--------+---+--------+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------------+-------------+
|age|sex|painloc|painexer| cp|trestbps|fbs|prop|nitr|pro|diuretic|thaldur|thalach|exang|oldpeak|slope|target|smoke_source1|smoke_source2|
+---+---+-------+--------+---+--------+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------------+-------------+
| 63|  1|      1|       0|  1|     145|  1|   0|   0|  0|       0|   10.5|    150|    0|    2.3|    3|     0|        0.149|        0.193|
| 67|  1|      1|       0|  4|     160|  0|   1|   0|  0|       0|    9.5|    108|    1|    1.5|    2|     1|        0.087|        0.108|
| 67|  1|      1|       0|  4|     120|  0|   1|   0|  0|       0|    8.5|    129|    1|    2.6|    2|     1|        0.087|        0.108|
| 37|  1|      1|       0|  3|     130|  0|   1|   0|  0|       0|   13.0|    187|    0|    3.5|    3|     0|        0.109|        0.163|
| 41|  0|      1|       0|  2|    

# Training a heart disease prediction model on the data

In [53]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Initialize Spark session
# spark = SparkSession.builder.appName("HeartDiseasePrediction").getOrCreate()

path = "heart_disease_cleaned.csv"
df_new = spark.read.csv(path, header=True, inferSchema=True)

# Add an index column to use for stratified sampling
df_new = df_new.withColumn("index", F.monotonically_increasing_id())

# Compute the proportion of positive labels
label_counts = df_new.groupBy('target').count().collect()
positive_ratio = label_counts[1]['count'] / (label_counts[0]['count'] + label_counts[1]['count'])

# Split the data into training and test sets with stratification
train_df = df_new.sampleBy("target", fractions={0: 0.9, 1: 0.9}, seed=42)
test_df = df_new.subtract(train_df)

train_df.show(5)
test_df.show(5)

+---+---+-------+--------+---+--------+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------------+-------------+-----+
|age|sex|painloc|painexer| cp|trestbps|fbs|prop|nitr|pro|diuretic|thaldur|thalach|exang|oldpeak|slope|target|smoke_source1|smoke_source2|index|
+---+---+-------+--------+---+--------+---+----+----+---+--------+-------+-------+-----+-------+-----+------+-------------+-------------+-----+
| 63|  1|      1|       0|  1|     145|  1|   0|   0|  0|       0|   10.5|    150|    0|    2.3|    3|     0|        0.149|        0.193|    0|
| 67|  1|      1|       0|  4|     160|  0|   1|   0|  0|       0|    9.5|    108|    1|    1.5|    2|     1|        0.087|        0.108|    1|
| 67|  1|      1|       0|  4|     120|  0|   1|   0|  0|       0|    8.5|    129|    1|    2.6|    2|     1|        0.087|        0.108|    2|
| 37|  1|      1|       0|  3|     130|  0|   1|   0|  0|       0|   13.0|    187|    0|    3.5|    3|     0|        0.109|        0.163

In [57]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Initialize Spark session
spark = SparkSession.builder.appName("HeartDiseasePrediction").getOrCreate()

# # Load data
path = "heart_disease_cleaned.csv"
df = spark.read.csv(path, header=True, inferSchema=True)

#  Print the schema of the DataFrame
df.printSchema()

# Assemble feature columns into a single vector column
feature_cols = [col for col in df_new.columns if col not in ['target', 'index']]
print(feature_cols)


# Logistic Regression
assembler_lr = VectorAssembler(inputCols=feature_cols, outputCol="features_lr")
train_df_lr = assembler_lr.transform(train_df)
test_df_lr = assembler_lr.transform(test_df)

# Random Forest
assembler_rf = VectorAssembler(inputCols=feature_cols, outputCol="features_rf")
train_df_rf = assembler_rf.transform(train_df)
test_df_rf = assembler_rf.transform(test_df)

# Gradient-Boosted Trees
assembler_gb = VectorAssembler(inputCols=feature_cols, outputCol="features_gb")
train_df_gb = assembler_gb.transform(train_df)
test_df_gb = assembler_gb.transform(test_df)

# Define the models
model_lr = LogisticRegression(featuresCol='features_lr', labelCol='target')
model_rf = RandomForestClassifier(featuresCol='features_rf', labelCol='target')
model_gb = GBTClassifier(featuresCol='features_gb', labelCol='target')

# Define parameter grids
param_grid_lr = (ParamGridBuilder()
                 .addGrid(model_lr.regParam, [0.01, 0.1, 1])
                 .addGrid(model_lr.elasticNetParam, [0.0, 0.5, 1.0])
                 .build())

param_grid_rf = (ParamGridBuilder()
                 .addGrid(model_rf.numTrees, [50, 100, 200])
                 .addGrid(model_rf.maxDepth, [5, 10, 20])
                 .build())

param_grid_gb = (ParamGridBuilder()
                 .addGrid(model_gb.maxIter, [50, 100])
                 .addGrid(model_gb.maxDepth, [3, 5])
                 .addGrid(model_gb.stepSize, [0.01, 0.1])
                 .build())

# Define evaluator
evaluator = BinaryClassificationEvaluator(labelCol='target', metricName='areaUnderROC')

# Setup CrossValidator for each model
crossval_lr = CrossValidator(estimator=model_lr, estimatorParamMaps=param_grid_lr, evaluator=evaluator, numFolds=5)
crossval_rf = CrossValidator(estimator=model_rf, estimatorParamMaps=param_grid_rf, evaluator=evaluator, numFolds=5)
crossval_gb = CrossValidator(estimator=model_gb, estimatorParamMaps=param_grid_gb, evaluator=evaluator, numFolds=5)

# Fit CrossValidator models
cv_model_lr = crossval_lr.fit(train_df_lr)
cv_model_rf = crossval_rf.fit(train_df_rf)
cv_model_gb = crossval_gb.fit(train_df_gb)

# Retrieve the best models
best_model_lr = cv_model_lr.bestModel
best_model_rf = cv_model_rf.bestModel
best_model_gb = cv_model_gb.bestModel

# Print best models' parameters
print(f"Best Logistic Regression Model: {best_model_lr}")
print(f"Best Random Forest Model: {best_model_rf}")
print(f"Best Gradient-Boosted Trees Model: {best_model_gb}")


root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- painloc: integer (nullable = true)
 |-- painexer: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- prop: integer (nullable = true)
 |-- nitr: integer (nullable = true)
 |-- pro: integer (nullable = true)
 |-- diuretic: integer (nullable = true)
 |-- thaldur: double (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- smoke_source1: double (nullable = true)
 |-- smoke_source2: double (nullable = true)

['age', 'sex', 'painloc', 'painexer', 'cp', 'trestbps', 'fbs', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thalach', 'exang', 'oldpeak', 'slope', 'smoke_source1', 'smoke_source2']


AttributeError: __provides__