# Initialize

In [1]:
# Imports

import warnings
warnings.filterwarnings('ignore')

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import camelot

In [2]:
# Record the directory the kernel was started in
current_dir = os.getcwd()
print('Current Working Directory:', current_dir)

# Switch the working directory to the base folder and confirm the move
temp_dirpath = os.path.join('D:\\', 'Akshaya')
os.chdir(temp_dirpath)
updated_current_dir = os.getcwd()
print("Updated Current Working Directory:", updated_current_dir)

# Derive the raw DATA and Output directories from the new working directory,
# then report both (raw DATA first, Output second)
rawDATA_dir = os.path.join(updated_current_dir, 'DATA')
output_dir = os.path.join(updated_current_dir, 'LULC_dataset', 'jupyterNB_outputs')
print('Raw DATA Directory:', rawDATA_dir)
print('Output Directory:', output_dir)

Current Working Directory: D:\Akshaya\LULC_dataset
Updated Current Working Directory: D:\Akshaya
Raw DATA Directory: D:\Akshaya\DATA
Output Directory: D:\Akshaya\LULC_dataset\jupyterNB_outputs


In [3]:
# Function to create a new folder at the specified folder path
def create_folder(folder_path, folder_name):
    """Create ``folder_name`` inside ``folder_path`` unless it is already there.

    A one-line status message is printed in every case (created, already
    exists, or error).

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory under which the folder should live. Missing intermediate
        directories are created as well.
    folder_name : str
        Name of the folder to create.

    Returns
    -------
    str
        ``folder_path`` joined with ``folder_name``. The path is returned even
        when creation failed, so the caller always gets the intended location.

    Raises
    ------
    TypeError
        If the arguments cannot be joined into a path (for example ``None``).
    """
    # Build the path outside the try block: if joining fails there is nothing
    # to return, so the real error should reach the caller instead of being
    # masked by an UnboundLocalError at the return statement.
    full_path = os.path.join(folder_path, folder_name)

    try:
        # isdir (not exists) so that a regular file sitting at this path is
        # not reported as an existing folder.
        if os.path.isdir(full_path):
            print(f"Folder '{folder_name}' already exists at '{folder_path}'.")
        else:
            # exist_ok avoids a spurious failure if the folder appears between
            # the check above and this call.
            os.makedirs(full_path, exist_ok=True)
            print(f"Folder '{folder_name}' created successfully at '{folder_path}'.")
    except (OSError, ValueError) as e:
        # Best effort: report the problem and still hand back the path.
        print(f"An error occurred: {str(e)}")
    return full_path
        
# Ensure this notebook has its own folder under the Output directory, then report its path
current_output_dir = create_folder(output_dir, '01_NRSC_LULC_Classification')
print('Current jupyterNB Output Directory:', current_output_dir)

Folder '01_NRSC_LULC_Classification' created successfully at 'D:\Akshaya\LULC_dataset\jupyterNB_outputs'.
Current jupyterNB Output Directory: D:\Akshaya\LULC_dataset\jupyterNB_outputs\01_NRSC_LULC_Classification


# Import dataset

In [4]:
# Build the path to the 'Technical Manual LULC 2nd Cycle.pdf' document inside the raw DATA directory
pdf_name = 'Technical Manual LULC 2nd Cycle.pdf'
fpath = os.path.join(rawDATA_dir, 'LULC', pdf_name)
print(fpath)

# Let camelot extract the tables it finds on pages 19-21 of that PDF
pdf_file = camelot.read_pdf(fpath, pages='19-21')

D:\Akshaya\DATA\LULC\Technical Manual LULC 2nd Cycle.pdf


In [5]:
# View the first extracted table as a pandas DataFrame: shape first, then a preview
first_table = pdf_file[0]
df0 = first_table.df
print(df0.shape)
df0.head()

(31, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,1,Built Up,1.1,Urban,1.1.1,Built up - Compact \n(Continuous),1,010111
2,,,,,1.1.2,Built up - Sparse \n(Discontinuous),2,010112
3,,,,,1.1.3,Vegetated / Open \nArea,3,010109
4,,,1.2,Rural,1.2.1,Rural,4,010201


In [6]:
# View the second extracted table as a pandas DataFrame: shape first, then a preview
second_table = pdf_file[1]
df1 = second_table.df
print(df1.shape)
df1.head()

(25, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,5,Wastelands,5.1,Salt Affected \nLand,5.1.1,Salt Affected Land,31,050100
2,,,5.2,Gullied / \nRavinous land,5.2.1,Gullied,32,050201
3,,,,,5.2.2,Ravinous,33,050203
4,,,5.3,Scrub land,5.3.1,Dense / closed,34,050301


In [7]:
# Stack the second table underneath the first one (row-wise concatenation).
# Each table keeps its own row labels, so labels repeat in the combined frame.
df = pd.concat(objs=[df0, df1], axis='index')
print(df.shape)
df.head()

(56, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,1,Built Up,1.1,Urban,1.1.1,Built up - Compact \n(Continuous),1,010111
2,,,,,1.1.2,Built up - Sparse \n(Discontinuous),2,010112
3,,,,,1.1.3,Vegetated / Open \nArea,3,010109
4,,,1.2,Rural,1.2.1,Rural,4,010201


# Removing duplicates & Replacing nulls 

In [8]:
# Keep only the first occurrence of any fully identical row
# (presumably this removes the header row repeated by the second table - confirm)
df = df.drop_duplicates()
print(df.shape)
df.head()

(55, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,1,Built Up,1.1,Urban,1.1.1,Built up - Compact \n(Continuous),1,010111
2,,,,,1.1.2,Built up - Sparse \n(Discontinuous),2,010112
3,,,,,1.1.3,Vegetated / Open \nArea,3,010109
4,,,1.2,Rural,1.2.1,Rural,4,010201


In [9]:
# Promote the values of the first row to column labels, then remove that row
# by its index label 0.
# NOTE: this cell is not re-runnable on its own result - a second run would
# promote a data row to the header and then fail to find label 0.
header_values = df.iloc[0]
df.columns = header_values
df = df.drop(index=[0])
print(df.shape)
df.head()

(54, 8)


Unnamed: 0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,1.0,Built Up,1.1,Urban,1.1.1,Built up - Compact \n(Continuous),1,10111
2,,,,,1.1.2,Built up - Sparse \n(Discontinuous),2,10112
3,,,,,1.1.3,Vegetated / Open \nArea,3,10109
4,,,1.2,Rural,1.2.1,Rural,4,10201
5,,,1.3,Industrial,1.3.1,Industrial area,5,10301


In [10]:
# Strip every newline character from the cell values (regex replacement, nothing is put in its place)
df = df.replace(to_replace='\n', value='', regex=True)
print(df.shape)
df.head()

(54, 8)


Unnamed: 0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,1.0,Built Up,1.1,Urban,1.1.1,Built up - Compact (Continuous),1,10111
2,,,,,1.1.2,Built up - Sparse (Discontinuous),2,10112
3,,,,,1.1.3,Vegetated / Open Area,3,10109
4,,,1.2,Rural,1.2.1,Rural,4,10201
5,,,1.3,Industrial,1.3.1,Industrial area,5,10301


In [11]:
# Turn cells that are empty or contain only whitespace into NaN so they count as missing
df = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
print(df.shape)
df.head()

(54, 8)


Unnamed: 0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,1.0,Built Up,1.1,Urban,1.1.1,Built up - Compact (Continuous),1,10111
2,,,,,1.1.2,Built up - Sparse (Discontinuous),2,10112
3,,,,,1.1.3,Vegetated / Open Area,3,10109
4,,,1.2,Rural,1.2.1,Rural,4,10201
5,,,1.3,Industrial,1.3.1,Industrial area,5,10301


In [12]:
# Fill each missing value with the last non-missing value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='ffill'): the `method`
# argument of fillna is deprecated in recent pandas, and the warning is hidden
# in this notebook because all warnings are silenced in the imports cell.
# NOTE(review): every column is filled, not only the merged level columns -
# confirm that no cell was legitimately blank in the source table.
df = df.ffill()
print(df.shape)
df.head()

(54, 8)


Unnamed: 0,Sl - 1,L - I,Sl - II,L - II,Sl - III,L - III,LU11_12,LU_CODE
1,1,Built Up,1.1,Urban,1.1.1,Built up - Compact (Continuous),1,10111
2,1,Built Up,1.1,Urban,1.1.2,Built up - Sparse (Discontinuous),2,10112
3,1,Built Up,1.1,Urban,1.1.3,Vegetated / Open Area,3,10109
4,1,Built Up,1.2,Rural,1.2.1,Rural,4,10201
5,1,Built Up,1.3,Industrial,1.3.1,Industrial area,5,10301


# Export dataset

In [13]:
# Write the cleaned table as a .csv file into this notebook's output folder.
# The row index is written as the first column (pandas default).
filename = 'NRSC_LULC_lookuptable.csv'
filepath = os.path.join(current_output_dir, filename)
df.to_csv(path_or_buf=filepath)