In [None]:
#importing the required libraries and initializing Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("DataProcessing").getOrCreate()

In [64]:
# Path to your .dat file
file_path = "data/dec17pub.dat"

# Read the .dat file as a DataFrame
raw_data = spark.read.text(file_path)

# Split the lines using a delimiter (modify as needed)
delimiter = "\t"  # Example delimiter
split_columns = split(raw_data.value, delimiter).alias("columns")

# Select and process the columns from the split data
processed_data = raw_data.select(split_columns)

# Show the processed data (you can perform further transformations)
processed_data.show()


+--------------------+
|             columns|
+--------------------+
|[0000047951107191...|
|[0000047951107191...|
|[0000716910049411...|
|[0000716910049411...|
|[0000716910049411...|
|[0001101779879861...|
|[0001101779879861...|
|[0001102065933811...|
|[0001102848156801...|
|[0001103278564691...|
|[0001103399354531...|
|[0001103399354531...|
|[0001103399354531...|
|[0001103399354531...|
|[0001103438685671...|
|[0001103438685671...|
|[0001103594243391...|
|[0001103594243391...|
|[0001104154004291...|
|[0001104154004291...|
+--------------------+
only showing top 20 rows



In [65]:
for value in processed_data:
    print(value)

Column<'columns'>


In [14]:
# Select the first row
first_row = processed_data.select(col("columns")).first()[0]
print(first_row)

['000004795110719122017 120100-1 1 1-1 1 9-1-1-1  14394041 1 2 1 7 2 0 206011 2  2-1-1-1-1 36 01 266200001103000   -1-1 1-1710 1 2 1 1-1 242 1-1 9-1 1-1 1 1 1 2 1 2 57 57 57 1 0 0 1 6 2-1-1 2-1-1-1-1 1-1-1-1-1-1-1-1-1-1-1-1-1 -1-1-1-1-1-1-1-1-1-1-1 -1-1-1   -1-1-1-1-1-1-1-1-1-1-1-1-1 -1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1 -1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1 2-1 0 4-1-1-1-1-1-1 -1-1-1 0 1 2-1-1-1-1-1-1-1-1      -1-1      -1-1-1 0-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1 0-1-1-1-1  -1  -1  -1  -10-1      -10-1-1      -1      -10-1-1-1-1-1-1-1-1 2-1-1-1-1  14394041  31009204         0  21825725  21562240 0 0 0-1-1-1 0 0 1 0-1 050 0 0 0 0 0 0 0 0-1-1-1 0 0 0 1-1-1-1-1-1-1 1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1 1 1 1 1 1 1 1 1 1 1 1-1-1-1-1-1-1-1-1-1-1-1 0 0-1-1-1-1-1-1 1 1 1-1-1-1   84 1-1-1 3-1-1-1 1 1 0-1-1-1  22089944  -1  -1  -1  -1-1-1 4 3-1-1 0-1-1-1-1-15050 1 1 1 2 2 2 1 2 2 1 0 0 0 0 0 0 0-1-1-1-1-1 2-1-120 0 0                                           

In [91]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import Row


# Create a Spark session
spark = SparkSession.builder.appName("TextToDataFrame").getOrCreate()

# Define the schema based on the updated column descriptions
schema = StructType([
    StructField("HRHHID", StringType(), False),
    StructField("HRMONTH", IntegerType(), False),
    StructField("HRYEAR4", IntegerType(), False),
    StructField("HUFINAL", StringType(), False),
    StructField("HETENURE", IntegerType(), False),
    StructField("HEHOUSUT", IntegerType(), False),
    StructField("HETELHHD", IntegerType(), False),
    StructField("HETELAVL", IntegerType(), False),
    StructField("HEPHONEO", IntegerType(), False),
    StructField("HEFAMINC", IntegerType(), False),
    StructField("HRHTYPE", IntegerType(), False),
    StructField("HUINTTYP", IntegerType(), False),
    StructField("PTDTRACE", StringType(), False)
    # Add more fields according to your actual schema
])

# Given text row
text_row = "000004795110719122017 120100-1 1 1-1 1 9-1-1-1  14394041 1 2 1 7 2 0 206011 ..." ""

# Split the text row into corresponding values based on positions defined in the schema
values = [
    text_row[0:15],  # HRHHID
    int(text_row[15:17]),  # HRMONTH
    int(text_row[17:21]),  # HRYEAR4
    text_row[23:26],  # HUFINAL
    int(text_row[28:30]),  # HETENURE
    int(text_row[30:32]),  # HEHOUSUT
    int(text_row[32:34]),  # HETELHHD
    int(text_row[34:36]),  # HETELAVL
    int(text_row[36:38]),  # HEPHONEO
    int(text_row[38:40]),   # HEFAMINC
    int(text_row[61:62]),   # HRHTYPE 
    int(text_row[65:66]),   #HUINTTYP 
    text_row[139:140]  #PTDTRACE

    # Add more value extractions based on your actual schema
]

# Column names
column_names = [
    "HRHHID", "MONTH", "YEAR", "HUFINAL", "HETENURE", "HEHOUSUT", "HETELHHD", 
    "HETELAVL","HEPHONEO", "FAMILY_INCOME", "HOUSEHOLD_TYPE",  "TYPE_OF_INTERVIEW",   
     "PTDTRACE"
]

# Add descriptions for HETELHHD
HOUSEHOLD_TELEPHONE = {
    1: "YES",
    2: "NO"
}


# Add descriptions for HRHTYPE
HOUSEHOLD_TYPE = {
    0: "NON-INTERVIEW HOUSEHOLD",
    1: "HUSBAND/WIFE PRIMARY FAMILY (NEITHER AF)",
    2: "HUSB/WIFE PRIM. FAMILY (EITHER/BOTH AF)",
    3: "UNMARRIED CIVILIAN MALE-PRIM. FAM HHLDER",
    4: "UNMARRIED CIV. FEMALE-PRIM FAM HHLDER",
    5: "PRIMARY FAMILY HHLDER-RP IN AF, UNMAR.",
    6: "CIVILIAN MALE PRIMARY INDIVIDUAL",
    7: "CIVILIAN FEMALE PRIMARY INDIVIDUAL",
    8: "PRIMARY INDIVIDUAL HHLD-RP IN AF",
    9: "GROUP QUARTERS WITH FAMILY",
    10: "GROUP QUARTERS WITHOUT FAMILY"
}

# Create a dictionary of column names and values
data_dict = {column_names[i]: values[i] for i in range(len(column_names))}
data_dict["HOUSEHOLD_TELEPHONE"] = HOUSEHOLD_TELEPHONE[data_dict["HETELHHD"]]
data_dict["HOUSEHOLD_TYPE"] = HOUSEHOLD_TYPE[data_dict["HOUSEHOLD_TYPE"]]  # Add HRHTYPE_DESCRIPTION based on HRHTYPE value

# Print the data_dict in a tabular form
for column, value in data_dict.items():
    print(f"{column}: {value}")
# Create a list of tuples from the data_dict dictionary
data_tuples = [(data_dict[col] for col in column_names)]

# Convert the list of tuples into a Spark DataFrame
data_df = spark.createDataFrame(data_tuples, column_names)
data_dict

print(data_dict)
# Convert the dictionary into a Spark DataFrame
row = Row(**data_dict)
data_df = spark.createDataFrame([row])
#print(row)
#print(data_df)








HRHHID: 000004795110719
MONTH: 12
YEAR: 2017
HUFINAL: 201
HETENURE: -1
HEHOUSUT: 1
HETELHHD: 1
HETELAVL: -1
HEPHONEO: 1
FAMILY_INCOME: 9
HOUSEHOLD_TYPE: HUSBAND/WIFE PRIMARY FAMILY (NEITHER AF)
TYPE_OF_INTERVIEW: 2
PTDTRACE: 
HOUSEHOLD_TELEPHONE: YES
{'HRHHID': '000004795110719', 'MONTH': 12, 'YEAR': 2017, 'HUFINAL': '201', 'HETENURE': -1, 'HEHOUSUT': 1, 'HETELHHD': 1, 'HETELAVL': -1, 'HEPHONEO': 1, 'FAMILY_INCOME': 9, 'HOUSEHOLD_TYPE': 'HUSBAND/WIFE PRIMARY FAMILY (NEITHER AF)', 'TYPE_OF_INTERVIEW': 2, 'PTDTRACE': '', 'HOUSEHOLD_TELEPHONE': 'YES'}


In [98]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import Row



# Define the schema based on the updated column descriptions
schema = StructType([
    StructField("HRHHID", StringType(), False),
    StructField("month", IntegerType(), False),
    StructField("year", IntegerType(), False),
    StructField("final_out", StringType(), False),
    StructField("hetenure", IntegerType(), False),
    StructField("hs_unt", IntegerType(), False),
    StructField("hs_tel", IntegerType(), False),
    StructField("else_tel", IntegerType(), False),
    StructField("tel_int", IntegerType(), False),
    StructField("fam_income", IntegerType(), False),
    StructField("hs_type", IntegerType(), False),
    StructField("int_type", IntegerType(), False),
    StructField("race", StringType(), False)
    # Add more fields according to your actual schema
])

# Define a UDF to extract values from text rows
def extract_values(text_row):
    values = [
        text_row[0][0:15],  # HRHHID
        int(text_row[0][15:17]),  # HRMONTH
        int(text_row[0][17:21]),  # HRYEAR4
        text_row[0][23:26],  # HUFINAL
        int(text_row[0][28:30]),  # HETENURE
        int(text_row[0][30:32]),  # HEHOUSUT
        int(text_row[0][32:34]),  # HETELHHD
        int(text_row[0][34:36]),  # HETELAVL
        int(text_row[0][36:38]),  # HEPHONEO
        int(text_row[0][38:40]),  # HEFAMINC
        int(text_row[0][61:62]),  # HRHTYPE 
        int(text_row[0][65:66]),  # HUINTTYP 
        text_row[0][139:140]  # PTDTRACE
        # Add more value extractions based on your actual schema
    ]
    return values

# Register the UDF
extract_udf = udf(extract_values, schema)
# Assuming your preprocessed_data DataFrame has a column named "text_row"
processed_data = processed_data.withColumn("extracted_values", extract_udf(col("columns")))

# Define column names
# Define column names
column_names = ["HRHHID", "month", "year", "final_out", "hetenure", "hs_unt",
                "hs_tel", "else_tel", "tel_int", "fam_income", "hs_type", "int_type",
                "race"]

# Select the extracted values and alias the columns
selected_df = processed_data.select(*[col("extracted_values")[col_name].alias(col_name) for col_name in column_names])
print(selected_df)
# Show the resulting DataFrame
selected_df.show(10)


DataFrame[HRHHID: string, month: int, year: int, final_out: string, hetenure: int, hs_unt: int, hs_tel: int, else_tel: int, tel_int: int, fam_income: int, hs_type: int, int_type: int, race: string]


Py4JJavaError: An error occurred while calling o15744.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 27.0 failed 1 times, most recent failure: Lost task 0.0 in stage 27.0 (TID 27) (DESKTOP-FPH57VM executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:82)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:853)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:853)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 25 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4177)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3382)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:284)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:323)
	at sun.reflect.GeneratedMethodAccessor97.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:82)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:853)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:853)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 25 more


In [99]:
import pyspark
print(pyspark.__version__)


3.4.1


In [None]:
SURVEY_FINAL_OUTCOME = { "001": "FULLY COMPLETE CATI INTERVIEW", "002": "PARTIALLY COMPLETED CATI INTERVIEW",
"003": "COMPLETE BUT PERSONAL VISIT REQUESTED NEXT MONTH"
    004	PARTIAL, NOT COMPLETE AT CLOSEOUT
    005	LABOR FORCE COMPLETE, SUPPLEMENT INCOMPLETE - CATI
    006	LF COMPLETE, SUPPLEMENT DK ITEMS INCOMPLETE AT 
        CLOSEOUT�ASEC ONLY
    020	HH OCCUPIED ENTIRELY BY ARMED FORCES MEMBERS
        OR ALL UNDER 15 YEARS OF AGE
    201	CAPI COMPLETE
    202	CALLBACK NEEDED
    203	SUFFICIENT PARTIAL - PRECLOSEOUT
    204	SUFFICIENT PARTIAL - AT CLOSEOUT
    205	LABOR FORCE COMPLETE, - SUPPL. INCOMPLETE - CAPI
    213	LANGUAGE BARRIER
    214	UNABLE TO LOCATE
    216	NO ONE HOME
    217	TEMPORARILY ABSENT
    218	REFUSED
    219	OTHER OCCUPIED - SPECIFY
    223	ENTIRE HOUSEHOLD ARMED FORCES
    224	ENTIRE HOUSEHOLD UNDER 15
    225	TEMP. OCCUPIED W/PERSONS WITH URE
    226	VACANT REGULAR
    227	VACANT - STORAGE OF HHLD FURNITURE
    228	UNFIT, TO BE DEMOLISHED
    229	UNDER CONSTRUCTION, NOT READY
    230	CONVERTED TO TEMP BUSINESS OR STORAGE
    231	UNOCCUPIED TENT OR TRAILER SITE
    232	PERMIT GRANTED - CONSTRUCTION NOT STARTED
    233	OTHER - SPECIFY
    240	DEMOLISHED
    241	HOUSE OR TRAILER MOVED
    242	OUTSIDE SEGMENT
    243	CONVERTED TO PERM. BUSINESS OR STORAGE
    244	MERGED
    245	CONDEMNED
    246	BUILT AFTER APRIL 1, 2000
    247	UNUSED SERIAL NO./LISTING SHEET LINE
    248	OTHER - SPECIFY
    256	REMOVED DURING SUB-SAMPLING
    257	UNIT ALREADY HAD A CHANCE OF SELECTION
}

In [33]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("DataFromDict").getOrCreate()

# Given text row
text_row = "000004795110719122017 120100-1 1 1-1 1 9-1-1-1  14394041 1 2 1 7 2 0 206011 ..."

# Split the text row into corresponding values
values = [
    text_row[0:15],  # HRHHID
    int(text_row[15:17]),  # HRMONTH
    int(text_row[17:21]),  # HRYEAR4
    int(text_row[21:23]),  # HURESPLI
    text_row[23:26],  # HUFINAL
    text_row[26:28],  # FILLER
    int(text_row[28:30]),  # HETENURE
    int(text_row[30:32]),  # HEHOUSUT
    int(text_row[32:34]),  # HETELHHD
    int(text_row[34:36]),  # HETELAVL
    int(text_row[36:38]),  # HEPHONEO
    int(text_row[38:40])   # HEFAMINC
]

# Column names
column_names = [
    "HRHHID", "HRMONTH", "HRYEAR4", "HURESPLI", "HUFINAL", "FILLER",
    "HETENURE", "HEHOUSUT", "HETELHHD", "HETELAVL", "HEPHONEO", "HEFAMINC"
]

# Create a DataFrame
data_df = spark.createDataFrame([values], column_names)

# Create a dictionary of column names and values
data_dict = {column_names[i]: values[i] for i in range(len(column_names))}
print(data_dict)

# Read the data dictionary text file
dict_file_path = "data/data_dictionary.txt"
print(dict_file_path)
data_dict = {}
with open(dict_file_path, "r", encoding="iso-8859-1") as file:
    for line in file:
        parts = line.strip().split(" ", 1)
        if len(parts) == 2:
            data_dict[parts[0]] = parts[1]
print(data_dict)
# Update column names of the DataFrame
for old_name, new_name in data_dict.items():
    data_df = data_df.withColumnRenamed(old_name, new_name)
    print(data_df)

# Show the DataFrame
#data_df.show()
print(data_df)



{'HRHHID': '000004795110719', 'HRMONTH': 12, 'HRYEAR4': 2017, 'HURESPLI': 1, 'HUFINAL': '201', 'FILLER': '00', 'HETENURE': -1, 'HEHOUSUT': 1, 'HETELHHD': 1, 'HETELAVL': -1, 'HEPHONEO': 1, 'HEFAMINC': 9}
data/data_dictionary.txt
{'ATTACHMENT': '6', 'CPS': 'RECORD LAYOUT FOR BASIC LABOR FORCE ITEMS', 'STANDARD': 'PUBLIC USE FILES', 'A1.': 'HOUSEHOLD INFORMATION', '*': 'STARTING JANUARY 2017 *', 'HRHHID\t\t\t15\t\tHOUSEHOLD': 'IDENTIFIER\t(Part 1)\t\t\t\t\t 1- 15', 'EDITED': 'UNIVERSE:  PECERT2 = 1', 'Part': '1 of this number is found in columns 1-15 of the record.', 'HRMONTH\t\t2\t\tMONTH': 'OF INTERVIEW\t\t\t\t\t\t\t16-17', 'VALID': 'ENTRIES', '01\tMIN': 'VALUE', '12\tMAX': 'VALUE', 'HRYEAR4\t\t4\t\tYEAR': 'OF INTERVIEW\t\t\t\t\t\t\t\t18-21', '1998\tMIN': 'VALUE', '2999\tMAX': 'VALUE', 'HURESPLI\t\t2\t\tLINE': 'NUMBER OF THE CURRENT\t\t\t\t\t22 - 23', '0\tMIN': 'VALUE', '99\tMAX': 'VALUE', 'HUFINAL\t\t3\t\tFINAL': 'OUTCOME CODE\t\t\t\t\t\t\t24 - 26', 'OUTCOME': 'CODES BETWEEN 001 AND 02

In [30]:
%config Application.log_level="DEBUG"

[IPKernelApp] {'header': {'msg_id': 'd0e01309-5e27bcfe2ab2a4290a477f74_11452_204', 'msg_type': 'execute_reply', 'username': 'username', 'session': 'd0e01309-5e27bcfe2ab2a4290a477f74', 'date': datetime.datetime(2023, 8, 24, 15, 28, 51, 758635, tzinfo=datetime.timezone.utc), 'version': '5.3'}, 'msg_id': 'd0e01309-5e27bcfe2ab2a4290a477f74_11452_204', 'msg_type': 'execute_reply', 'parent_header': {'date': datetime.datetime(2023, 8, 24, 15, 28, 51, 621000, tzinfo=tzutc()), 'msg_id': '21b6ae7e-9ae6-43d0-b8b3-08321c5f4b6c', 'msg_type': 'execute_request', 'session': '6a8b14d2-d60d-42bf-945b-378fbb10c2b6', 'username': 'bce64de9-10e7-4bb5-a100-22cf6c2fc4cc', 'version': '5.2'}, 'content': {'status': 'ok', 'execution_count': 30, 'user_expressions': {}, 'payload': []}, 'metadata': {'started': datetime.datetime(2023, 8, 24, 15, 28, 51, 624650, tzinfo=datetime.timezone.utc), 'dependencies_met': True, 'engine': '13dea78d-5036-4e55-a22f-e9a19c0e8cea', 'status': 'ok'}, 'tracker': <zmq.sugar.tracker.Mess

In [13]:
from pyspark.sql.functions import col

# Assuming the order of information in the array matches the list you provided
data_columns = ["Full household identifier", "Time of interview", "Final outcome of the survey",
                "Type of housing unit", "Household type", "Apartment/Household has a telephone",
                "Apartment/Household can access a telephone elsewhere", "Is telephone interview acceptable for the responder",
                "Type of interview", "Family income range", "Geographical division/location", "Race"]

# Select the "columns" array and extract each piece of information using col() and getItem()
selected_columns = []
for i, column_name in enumerate(data_columns):
    selected_columns.append(col("columns")[i].alias(column_name))

# Create a new DataFrame with the extracted information
extracted_data = processed_data.select(*selected_columns)

# Show the extracted data
extracted_data.show(truncate=False)


AssertionError: 