In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on the drive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Analysis
In this section we examine the GeoJSON file: the number of features it contains, the data type of each attribute, and the kinds of values those attributes take.

In [2]:
import json

# Location of the QFabric training GeoJSON on the mounted Google Drive.
file_path = "/content/drive/MyDrive/Granular Assignment/train_qFabric.geojson"

# JSON text is UTF-8 encoded, so decode it explicitly instead of relying on
# the platform's default locale encoding (which differs between machines).
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [8]:
# Flatten the list of GeoJSON features into a pandas DataFrame; nested keys
# become dotted column names such as "properties.change_type".
# The alias must be `pd` (it was `ps`), otherwise the call below raises
# NameError when the notebook is run on a fresh kernel.
import pandas as pd
df = pd.json_normalize(data['features'])

In [9]:
# Preview the first rows of the flattened feature table
df.head()

Unnamed: 0,type,properties.index,properties.change_type,properties.change_status_date1,properties.change_status_date2,properties.change_status_date3,properties.change_status_date4,properties.change_status_date5,properties.date1,properties.date2,properties.date3,properties.date4,properties.date5,properties.urban_types,properties.geography_types,geometry.type,geometry.coordinates
0,Feature,0,Commercial,Land Cleared,Construction Midway,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Industrial,"River,Sparse Forest,Grass Land",Polygon,"[[[116.97563423609992, 38.890015408908404], [1..."
1,Feature,1,Commercial,Greenland,Greenland,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Sparse Urban,"Sparse Forest,Grass Land",Polygon,"[[[116.97499988544186, 38.88969278828093], [11..."
2,Feature,2,Commercial,Land Cleared,Land Cleared,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Sparse Urban,"Sparse Forest,Grass Land",Polygon,"[[[116.97519019063928, 38.88847244069008], [11..."
3,Feature,3,Commercial,Land Cleared,Land Cleared,Construction Midway,Construction Midway,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Industrial,"River,Sparse Forest,Grass Land",Polygon,"[[[116.9763003042909, 38.89016970573023], [116..."
4,Feature,4,Commercial,Land Cleared,Land Cleared,Construction Started,Construction Midway,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Industrial,"River,Sparse Forest,Grass Land",Polygon,"[[[116.97750557054124, 38.89036608350347], [11..."


In [18]:
# Show the distinct values of every column except the ones listed here
# (geometry, dates, index, feature type and geography types are skipped).
non_unique_cols = [
    'geometry.type',
    'geometry.coordinates',
    'properties.date1',
    'properties.date2',
    'properties.date3',
    'properties.date4',
    'properties.date5',
    'properties.index',
    'type',
    'properties.geography_types',
]

# Keep the DataFrame's own column order for the printout.
remaining_columns = [name for name in df.columns if name not in non_unique_cols]

for col in remaining_columns:
    distinct_values = df[col].unique()
    print(col, " : ", distinct_values)
    print('\n')

properties.change_type  :  ['Commercial' 'Residential' 'Demolition' 'Road' 'Industrial'
 'Mega Projects']


properties.change_status_date1  :  ['Land Cleared' 'Greenland' 'Materials Dumped' 'Excavation'
 'Construction Midway' 'NA' 'Prior Construction' 'Construction Started'
 'Construction Done' 'Operational']


properties.change_status_date2  :  ['Construction Midway' 'Greenland' 'Land Cleared' 'Construction Started'
 'Construction Done' 'Materials Dumped' 'NA' 'Prior Construction'
 'Excavation' 'Operational']


properties.change_status_date3  :  ['Construction Done' 'Construction Midway' 'Construction Started'
 'Materials Dumped' 'Land Cleared' 'Excavation' 'Greenland' 'NA'
 'Prior Construction' 'Operational']


properties.change_status_date4  :  ['Construction Done' 'Construction Midway' 'Construction Started'
 'Land Cleared' 'Materials Dumped' 'Greenland' 'NA' 'Excavation'
 'Prior Construction' 'Operational']


properties.change_status_date5  :  ['Construction Done' 'Construction Mi

In [12]:
# Sanity check on the first feature: confirm that a date string parses as a
# DD-MM-YYYY date and that the index converts to an int.
from datetime import datetime
vals = data['features'][0]['properties']

# A missing or empty date yields None instead of raising.
date1 = datetime.strptime(vals['date1'], '%d-%m-%Y').date() if vals.get('date1') else None

# Named `feature_id` rather than `id` so the built-in id() is not shadowed
# in the notebook's global namespace.
feature_id = int(vals['index'])

print(date1)
print(type(date1))
print(feature_id)
print(type(feature_id))

2014-05-29
<class 'datetime.date'>
0
<class 'int'>


# SQLite Database generation
In this section we load the data from the JSON file into a SQLite database. The database contains one table with 16 columns, one for each attribute of the dataset. The database schema is shown below.
<br>
<pre>
CREATE TABLE QFabric (
  id INTEGER PRIMARY KEY,
  change_type  TEXT,
  change_status_date1  TEXT,
  change_status_date2  TEXT,
  change_status_date3  TEXT,  
  change_status_date4  TEXT,
  change_status_date5  TEXT,
  date1  DATE,
  date2  DATE,
  date3  DATE,
  date4  DATE,
  date5  DATE,
  urban_types  TEXT,
  geography_types  TEXT,
  geometry_type  TEXT,
  coordinates  TEXT
);
</pre>

<b><u>Important Notes</u></b>
- The categorical text values (change type, change statuses, urban types and geography types) are converted to lowercase to make them easier to query.
- The Python list of coordinates is serialized to a JSON string and stored in a TEXT column.
- Supported data types in SQLite database
  - INTEGER
  - REAL
  - TEXT
  - BLOB


In [4]:
import sqlite3
from datetime import datetime

In [5]:
# Open a connection to the SQLite database file QFabric_v0.db.
# The path is relative, so it is resolved against the kernel's working directory.
con = sqlite3.connect("QFabric_v0.db")

In [6]:
# Cursor on `con`; the cells below use it to execute SQL statements
cur = con.cursor()

In [7]:
# Create the table named QFabric.
# - IF NOT EXISTS keeps this cell re-runnable: without it a second execution
#   (or an already existing QFabric_v0.db) fails with "table QFabric already exists".
# - The text columns are declared as TEXT so the table matches the schema
#   documented in the markdown above instead of leaving them untyped.
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS QFabric (
        id INTEGER PRIMARY KEY,
        change_type TEXT,
        change_status_date1 TEXT,
        change_status_date2 TEXT,
        change_status_date3 TEXT,
        change_status_date4 TEXT,
        change_status_date5 TEXT,
        date1 DATE,
        date2 DATE,
        date3 DATE,
        date4 DATE,
        date5 DATE,
        urban_types TEXT,
        geography_types TEXT,
        geometry_type TEXT,
        coordinates TEXT
    )
    """
)

<sqlite3.Cursor at 0x7897f9da96c0>

In [7]:
# Confirm the table exists by reading the first entry name from the
# sqlite_master catalog.
res = cur.execute("SELECT name FROM sqlite_master")
first_entry = res.fetchone()
first_entry

('QFabric',)

In [11]:
# Insert the data from json to SQL table with required data types

# Parameterized statement: one placeholder per column, in column order.
INSERT_SQL = (
    "INSERT INTO QFabric (id, change_type, change_status_date1, change_status_date2,"
    "change_status_date3, change_status_date4, change_status_date5, "
    "date1, date2, date3, date4, date5, "
    "urban_types, geography_types, geometry_type, coordinates) "
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)


def _to_iso_date(raw_date):
    """Convert a 'DD-MM-YYYY' string to 'YYYY-MM-DD'; return None if it is missing or empty."""
    if not raw_date:
        return None
    return datetime.strptime(raw_date, "%d-%m-%Y").strftime("%Y-%m-%d")


def _feature_to_row(feature):
    """Build the 16-value parameter tuple for one GeoJSON feature, in INSERT_SQL column order."""
    props = feature['properties']
    geometry = feature['geometry']
    return (
        # The index is converted to an integer for the INTEGER PRIMARY KEY column.
        int(props['index']),
        props['change_type'].lower(),
        # change_status_date1 .. change_status_date5, lowercased
        *(props[f'change_status_date{n}'].lower() for n in range(1, 6)),
        # date1 .. date5 as ISO text (or None), so julianday() can read them
        *(_to_iso_date(props.get(f'date{n}')) for n in range(1, 6)),
        props['urban_types'].lower(),
        props['geography_types'].lower(),
        # The geometry type is stored as-is (not lowercased).
        str(geometry['type']),
        # The coordinates list is serialized to a JSON string so it fits a single column.
        json.dumps(geometry['coordinates']),
    )


# Nothing is persisted until the connection is committed.
for feature in data['features']:
    cur.execute(INSERT_SQL, _feature_to_row(feature))


In [12]:
# Commit the changes to final DB instance
con.commit()

## Sample SQL Queries

In [15]:
# Count the rows whose second date is on or after the first one, i.e. where
# julianday(date2) - julianday(date1) is non-negative.
date_order_sql = "SELECT COUNT(*) FROM QFabric WHERE (julianday(date2) - julianday(date1))>=0"
res = cur.execute(date_order_sql)
ans = res.fetchall()
ans

[(310006,)]