In [3]:
import findspark
findspark.init()


In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("NM_Taxcred") \
    .getOrCreate()

print("‚úÖ Spark session started")

‚úÖ Spark session started


In [9]:
import os
file_path = "NM_Taxcred.csv"

print("Files in this directory:")
print(os.listdir())

if os.path.exists(file_path):
    print(f"‚úÖ File found: {file_path}")
else:
    print(f"‚ùå File not found: {file_path}")

Files in this directory:
['.anaconda', '.android', '.conda', '.condarc', '.continuum', '.cursor', '.insomniac', '.ipynb_checkpoints', '.ipython', '.jupyter', '.popsql.json', 'anaconda3', 'anaconda_projects', 'ansel', 'AppData', 'Application Data', 'artifacts', 'Contacts', 'Cookies', 'CrossDevice', 'Documents', 'Downloads', 'Favorites', 'high_income_companies.csv', 'high_value_output.csv', 'high_value_output_1.csv', 'HP', 'IdeaProjects', 'Links', 'Local Settings', 'Music', 'My Documents', 'NetHood', 'NM_Taxcred.csv', 'NM_Taxcred.ipynb', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{a29fe300-904f-11ef-aec7-d6df598c01f7}.TM.blf', 'NTUSER.DAT{a29fe300-904f-11ef-aec7-d6df598c01f7}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{a29fe300-904f-11ef-aec7-d6df598c01f7}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'output', 'PrintHood', 'Project 1.ipynb', 'raw data 1.csv', 'Recent', 'Saved Games', 'Searches', 'SendTo', 'Start Menu', 'Templat

In [11]:
df = spark.read.csv("NM_Taxcred.csv", header=True, inferSchema=True)
print("‚úÖ csv loaded into Dataframe")

‚úÖ csv loaded into Dataframe


In [12]:
df.printSchema()
df.show(5, truncate=False)

root
 |-- the_geom: string (nullable = true)
 |-- OBJECTID: integer (nullable = true)
 |-- New Market Tax Credit: string (nullable = true)

+--------+--------+---------------------+
|the_geom|OBJECTID|New Market Tax Credit|
+--------+--------+---------------------+
|NULL    |30      |Yes                  |
|NULL    |27      |Yes                  |
|NULL    |29      |Yes                  |
|NULL    |15      |Yes                  |
|NULL    |75      |Yes                  |
+--------+--------+---------------------+
only showing top 5 rows


In [16]:
from pyspark.sql.functions import when

df_cleaned = df.withColumn("TaxCreditFlag",
                           when(df["New Market Tax Credit"] == "Yes", 1).otherwise(0)
                                )
df_cleaned.select("OBJECTID","TaxCreditFlag").show(5)

+--------+-------------+
|OBJECTID|TaxCreditFlag|
+--------+-------------+
|      30|            1|
|      27|            1|
|      29|            1|
|      15|            1|
|      75|            1|
+--------+-------------+
only showing top 5 rows


In [20]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["OBJECTID"],
               outputCol="features"
)

data = assembler.transform(df_cleaned)

In [22]:
final_data = data.withColumnRenamed("TaxCreditFlag", "label")

final_data.select("OBJECTID", "label", "features").show(5, truncate=False)



+--------+-----+--------+
|OBJECTID|label|features|
+--------+-----+--------+
|30      |1    |[30.0]  |
|27      |1    |[27.0]  |
|29      |1    |[29.0]  |
|15      |1    |[15.0]  |
|75      |1    |[75.0]  |
+--------+-----+--------+
only showing top 5 rows


In [24]:
train_df, test_df = final_data.randomSplit([0.8,0.2], seed=42)
print("‚úÖ Data split into training and test sets")

‚úÖ Data split into training and test sets


In [28]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label")
model = lr.fit(train_df)
print("‚úîÔ∏è Logistic Regression model trained")


‚úîÔ∏è Logistic Regression model trained


In [32]:
predictions = model.transform(test_df)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions)

print(f"üéØ AUC Score: {auc:.3f}")

predictions.select("label", "prediction","probability").show(5, truncate=False)

üéØ AUC Score: 1.000
+-----+----------+-----------+
|label|prediction|probability|
+-----+----------+-----------+
|1    |1.0       |[0.0,1.0]  |
|1    |1.0       |[0.0,1.0]  |
|1    |1.0       |[0.0,1.0]  |
|1    |1.0       |[0.0,1.0]  |
|1    |1.0       |[0.0,1.0]  |
+-----+----------+-----------+
only showing top 5 rows


In [33]:

export_data = predictions.select("OBJECTID", "label", "prediction", "probability")


export_pd = export_data.toPandas()


export_pd.to_csv("tax_credit_predictions.csv", index=False)

print("‚úî Predictions exported to tax_credit_predictions.csv")


‚úî Predictions exported to tax_credit_predictions.csv


In [34]:
export_pd.to_csv("tax_credit_predictions.csv", index=False)
