In [1]:
!pip install pyspark
!pip install datetime



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import MapType,StringType
from pyspark.sql.functions import from_json
import pandas as pd
import datetime

In [3]:
# Load the permit workbook into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_excel("Intermediate_Task.xlsx")

In [4]:
# Show the dtype pandas inferred for each column (the date columns come in as
# object and are converted with pd.to_datetime further down).
print(df.dtypes)

Permit Number                              object
Permit Type                                 int64
Permit Type Definition                     object
Permit Creation Date                       object
Block                                      object
Lot                                        object
Street Number                               int64
Street Number Suffix                       object
Street Name                                object
Street Suffix                              object
Unit                                      float64
Unit Suffix                                object
Description                                object
Current Status                             object
Current Status Date                        object
Filed Date                                 object
Issued Date                                object
Completed Date                             object
First Construction Document Date           object
Structural Notification                    object


In [5]:
# (rows, columns) of the raw data as loaded.
df.shape

(15364, 43)

In [6]:
# Preview the first rows of the raw data (head() shows 5 rows by default).
df.head()

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Existing Construction Type,Existing Construction Type Description,Proposed Construction Type,Proposed Construction Type Description,Site Permit,Supervisor District,Neighborhoods - Analysis Boundaries,Zipcode,Location,Record ID
0,201505000000.0,4,sign - erect,05/06/2015,326,23,140,,Ellis,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.785719256680785, -122.40852313194863)",1380610000000.0
1,201604000000.0,4,sign - erect,04/19/2016,306,7,440,,Geary,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.78733980600732, -122.41063199757738)",1420160000000.0
2,201605000000.0,3,additions alterations or repairs,05/27/2016,595,203,1647,,Pacific,Av,...,1.0,constr type 1,1.0,constr type 1,,3.0,Russian Hill,94109.0,"(37.7946573324287, -122.42232562979227)",1424860000000.0
3,201611000000.0,8,otc alterations permit,11/07/2016,156,11,1230,,Pacific,Av,...,5.0,wood frame (5),5.0,wood frame (5),,3.0,Nob Hill,94109.0,"(37.79595867909168, -122.41557405519474)",1443570000000.0
4,201611000000.0,6,demolitions,11/28/2016,342,1,950,,Market,St,...,3.0,constr type 3,,,,6.0,Tenderloin,94102.0,"(37.78315261897309, -122.40950883997789)",144548000000.0


In [7]:
# Number of missing values in each column of the raw data.
df.isna().sum()

Permit Number                                 0
Permit Type                                   0
Permit Type Definition                        0
Permit Creation Date                          0
Block                                         0
Lot                                           0
Street Number                                 0
Street Number Suffix                      15204
Street Name                                   0
Street Suffix                               194
Unit                                      13217
Unit Suffix                               15185
Description                                  25
Current Status                                0
Current Status Date                           0
Filed Date                                    0
Issued Date                                 635
Completed Date                             6235
First Construction Document Date            639
Structural Notification                   14896
Number of Existing Stories              

# **Drop columns with more than 13000 missing values**

In [8]:
# Maximum number of missing values a column may contain before it is dropped
# (an absolute count of rows, not a rate).
threshold = 13000

# Count the nulls per column once, then keep the labels that exceed the limit.
null_counts = df.isnull().sum()
cols_to_drop = null_counts.index[null_counts > threshold]
df2 = df.drop(columns=cols_to_drop)

In [9]:
# Missing values remaining per column after the sparse columns were dropped.
df2.isna().sum()

Permit Number                                0
Permit Type                                  0
Permit Type Definition                       0
Permit Creation Date                         0
Block                                        0
Lot                                          0
Street Number                                0
Street Name                                  0
Street Suffix                              194
Description                                 25
Current Status                               0
Current Status Date                          0
Filed Date                                   0
Issued Date                                635
Completed Date                            6235
First Construction Document Date           639
Number of Existing Stories                3189
Number of Proposed Stories                3386
Permit Expiration Date                    3460
Estimated Cost                            2887
Revised Cost                               247
Existing Use 

# **Fill street suffix with previous values**

In [10]:
# Forward-fill: every missing suffix takes the value of the closest preceding row.
# `fillna(method='pad')` is deprecated in recent pandas releases; `ffill()` is
# the direct equivalent.
df2['Street Suffix'] = df2['Street Suffix'].ffill()

In [11]:
# Re-check the per-column missing-value counts after filling Street Suffix.
df2.isna().sum()

Permit Number                                0
Permit Type                                  0
Permit Type Definition                       0
Permit Creation Date                         0
Block                                        0
Lot                                          0
Street Number                                0
Street Name                                  0
Street Suffix                                0
Description                                 25
Current Status                               0
Current Status Date                          0
Filed Date                                   0
Issued Date                                635
Completed Date                            6235
First Construction Document Date           639
Number of Existing Stories                3189
Number of Proposed Stories                3386
Permit Expiration Date                    3460
Estimated Cost                            2887
Revised Cost                               247
Existing Use 

In [12]:
# Rows without a description receive an explicit placeholder text.
description_placeholder = 'No description available for this product'
df2['Description'] = df2['Description'].fillna(description_placeholder)

# **Fill the Revised Cost and Estimated Cost with their means**

In [13]:
# Replace missing revised costs with the column mean. The result is assigned
# back to the column: calling fillna(inplace=True) on the selected Series is a
# chained assignment that newer pandas (copy-on-write) does not propagate to df2.
df2["Revised Cost"] = df2["Revised Cost"].fillna(df2["Revised Cost"].mean())

In [14]:
# Replace missing estimated costs with the column mean. The result is assigned
# back to the column: calling fillna(inplace=True) on the selected Series is a
# chained assignment that newer pandas (copy-on-write) does not propagate to df2.
df2["Estimated Cost"] = df2["Estimated Cost"].fillna(df2["Estimated Cost"].mean())

In [15]:
# (rows, columns) of the working frame after the sparse columns were dropped.
df2.shape

(15364, 35)

# **Fill Location, Zipcode, Neighborhoods - Analysis Boundaries, and Supervisor District with the next available value (backward fill)**

In [16]:
# Columns whose gaps are filled from the next non-missing row below them.
columns_to_fill = ["Location", "Zipcode", "Neighborhoods - Analysis Boundaries", "Supervisor District"]

# Back-fill all of them in a single column-wise assignment.
df2[columns_to_fill] = df2[columns_to_fill].bfill()

# **Fill construction type and description with placeholder values**

In [17]:
# Sentinel values for permits that carry no construction-type information.
missing_type_code = 0
missing_type_text = 'No Description for this construction type'

# Per-column replacement values, keyed by column name.
fill_values = {
    'Existing Construction Type': missing_type_code,
    'Existing Construction Type Description': missing_type_text,
    'Proposed Construction Type': missing_type_code,
    'Proposed Construction Type Description': missing_type_text,
}

# Apply all four replacements to df2 in one in-place call.
df2.fillna(value=fill_values, inplace=True)

In [18]:
# Missing values still outstanding per column after the fills above.
df2.isna().sum()

Permit Number                                0
Permit Type                                  0
Permit Type Definition                       0
Permit Creation Date                         0
Block                                        0
Lot                                          0
Street Number                                0
Street Name                                  0
Street Suffix                                0
Description                                  0
Current Status                               0
Current Status Date                          0
Filed Date                                   0
Issued Date                                635
Completed Date                            6235
First Construction Document Date           639
Number of Existing Stories                3189
Number of Proposed Stories                3386
Permit Expiration Date                    3460
Estimated Cost                               0
Revised Cost                                 0
Existing Use 

# **Fill missing Completed Date with the latest date and missing Issued Date with the earliest date**


In [19]:
# Parse the completion dates (values that cannot be parsed become NaT), then
# fill every missing one with the latest completion date in the data.
# The filled Series is assigned back to the column: fillna(inplace=True) on the
# selected Series is a chained assignment that newer pandas (copy-on-write)
# does not propagate to df2.
df2['Completed Date'] = pd.to_datetime(df2['Completed Date'], errors='coerce')
max_completion_date = df2['Completed Date'].max()
df2['Completed Date'] = df2['Completed Date'].fillna(max_completion_date)

# Same treatment for the issue dates, but filling with the earliest date.
df2['Issued Date'] = pd.to_datetime(df2['Issued Date'], errors='coerce')
min_issue_date = df2['Issued Date'].min()
df2['Issued Date'] = df2['Issued Date'].fillna(min_issue_date)

# **Fill missing Permit Expiration Date and First Construction Document Date with the latest date**

In [20]:
# Parse the permit expiration dates (values that cannot be parsed become NaT),
# then fill every missing one with the latest expiration date in the data.
# The filled Series is assigned back to the column: fillna(inplace=True) on the
# selected Series is a chained assignment that newer pandas (copy-on-write)
# does not propagate to df2.
df2['Permit Expiration Date'] = pd.to_datetime(df2['Permit Expiration Date'], errors='coerce')
max_expiration_date = df2['Permit Expiration Date'].max()
df2['Permit Expiration Date'] = df2['Permit Expiration Date'].fillna(max_expiration_date)

In [21]:

# Parse the first-construction-document dates (values that cannot be parsed
# become NaT), then fill every missing one with the latest such date in the data.
# The filled Series is assigned back to the column: fillna(inplace=True) on the
# selected Series is a chained assignment that newer pandas (copy-on-write)
# does not propagate to df2.
df2['First Construction Document Date'] = pd.to_datetime(df2['First Construction Document Date'], errors='coerce')
max_first_document_date = df2['First Construction Document Date'].max()
df2['First Construction Document Date'] = df2['First Construction Document Date'].fillna(max_first_document_date)

In [22]:
# Calculate the mean of the "Number of Existing Stories" column as an integer
mean_existing_stories = int(df['Number of Existing Stories'].mean())

# Fill missing values with the mean as an integer
df2['Number of Existing Stories'].fillna(mean_existing_stories, inplace=True)

In [23]:
# Missing proposed-story counts become 0. Assigned back to the column because
# fillna(inplace=True) on the selected Series is a chained assignment that
# newer pandas (copy-on-write) does not propagate to df2.
df2['Number of Proposed Stories'] = df2['Number of Proposed Stories'].fillna(0)

In [24]:
# Missing proposed-unit counts become 0. Assigned back to the column because
# fillna(inplace=True) on the selected Series is a chained assignment that
# newer pandas (copy-on-write) does not propagate to df2.
df2['Proposed Units'] = df2['Proposed Units'].fillna(0)

In [25]:
# Missing existing-unit counts become 0. Assigned back to the column because
# fillna(inplace=True) on the selected Series is a chained assignment that
# newer pandas (copy-on-write) does not propagate to df2.
df2['Existing Units'] = df2['Existing Units'].fillna(0)

In [26]:
# Missing planset counts become 0. Assigned back to the column because
# fillna(inplace=True) on the selected Series is a chained assignment that
# newer pandas (copy-on-write) does not propagate to df2.
df2['Plansets'] = df2['Plansets'].fillna(0)

In [27]:

# Missing existing-use values get an explicit placeholder. Assigned back to the
# column because fillna(inplace=True) on the selected Series is a chained
# assignment that newer pandas (copy-on-write) does not propagate to df2.
df2['Existing Use'] = df2['Existing Use'].fillna("No Existing Use yet")

In [28]:
# Missing proposed-use values get an explicit placeholder. Assigned back to the
# column because fillna(inplace=True) on the selected Series is a chained
# assignment that newer pandas (copy-on-write) does not propagate to df2.
df2['Proposed Use'] = df2['Proposed Use'].fillna("No Proposed Use yet")

In [29]:
# Write the cleaned frame to a new workbook in the working directory;
# index=False keeps the DataFrame index out of the sheet.
excel_file = "construction2.xlsx"
df2.to_excel(excel_file, index=False)

# Confirmation message naming the file that was written.
print(f"DataFrame has been exported to {excel_file}")

DataFrame has been exported to construction2.xlsx
