In [0]:
# importing the packages and functions needed
from boto3 import client
from tempfile import TemporaryFile, TemporaryDirectory
from zipfile import ZipFile
from pandas import concat, read_csv, DataFrame, Series
from typing import List
#from Levenshtein import distance as lev

In [0]:
# Land Registry header -> short column name used throughout this notebook.
# Renaming by header (rather than assigning `ccod.columns` positionally) keeps
# the labels correct regardless of the column order inside the CSV file.
CCOD_COLUMNS = {
    "Title Number": "title_no",
    "Proprietor Name (1)": "pr_name",
    "Company Registration No. (1)": "company_reg_no",
    "Proprietor (1) Address (1)": "pr_address",
}

ccod = read_csv(
    "/dbfs/mnt/lab/restricted/ESD-Project/source_data_gov_hm_land_registry/dataset_use_land_and_property_data/format_CSV_use_land_and_property_data/LATEST_use_land_and_property_data/CCOD_FULL_2024_01.csv",
    usecols=list(CCOD_COLUMNS),
    # Lower-case the free-text fields; the keyword searches below use lower-case literals.
    converters={
        "Proprietor Name (1)": str.lower,
        "Proprietor (1) Address (1)": str.lower,
    },
).rename(columns=CCOD_COLUMNS)[list(CCOD_COLUMNS.values())]


In [0]:
ccod

In [0]:
# Extract the postcode that ends the (lower-cased) proprietor address.
# expand=False makes str.extract return a Series, so a plain column is assigned
# instead of relying on a one-column DataFrame being coerced by `assign`.
ccod_lp = ccod.assign(
    postcode=ccod["pr_address"].str.extract(
        r"([a-z]{1,2}[\d]{1,2}[a-z]{0,1} [\d]{1}[a-z]{2})$", expand=False
    )
)


## Functions

In [0]:
def kwd_df(word: str, column: str, df: DataFrame):
    """Return the rows of ``df`` whose ``column`` contains ``word``.

    ``word`` is matched as a literal, case-sensitive substring (no regex);
    rows with a missing value in ``column`` never match.
    """
    hits = df[column].str.contains(word, regex=False, na=False)
    return df[hits]


In [0]:
def new_df(names: List, new_name: str, index=0):
    """Build a two-column lookup frame pairing proprietor names with one lookup name.

    Parameters
    ----------
    names : sequence of str
        Proprietor names to put in the ``CCOD`` column.
    new_name : str
        Lookup name written to every row of the ``LC`` column.
    index : sequence of int, optional
        Positions within ``names`` to keep. The default ``0`` (or ``None``)
        keeps every name.

    Returns
    -------
    DataFrame
        Columns ``CCOD`` (the selected names) and ``LC`` (``new_name``).
    """
    # Only a scalar sentinel means "keep everything". Evaluating `index == 0`
    # on a NumPy array of positions would raise (ambiguous truth value), so the
    # comparison is only made for non-iterable values.
    keep_all = index is None or (not hasattr(index, "__iter__") and index == 0)
    selected = names if keep_all else [names[i] for i in index]

    return DataFrame({"CCOD": selected, "LC": new_name})


In [0]:
def unique_values(word: str, s_column: str, v_column: str, df: DataFrame):
    """Return the distinct ``v_column`` values of rows whose ``s_column`` contains ``word``.

    ``word`` is matched as a literal substring (no regex); rows with a missing
    value in ``s_column`` are excluded.
    """
    matches = df[s_column].str.contains(word, regex=False, na=False)
    matched_rows = df[matches]
    return matched_rows[v_column].unique()


In [0]:
def append_rows(names: List, df: DataFrame):
    """Append names to a lookup frame, skipping those already in its ``CCOD`` column.

    Parameters
    ----------
    names : sequence of str
        Candidate proprietor names.
    df : DataFrame
        Existing lookup frame with ``CCOD`` and ``LC`` columns.

    Returns
    -------
    DataFrame
        ``df`` itself when no candidate is new; otherwise a new frame holding
        the rows of ``df`` followed by the new names, re-indexed from 0. New
        rows take the ``LC`` value of the first row of ``df``.
    """
    candidates = Series(names)
    unseen = candidates[~candidates.isin(df["CCOD"])]

    if unseen.empty:
        return df

    # Read the label by position: `df["LC"][0]` is a label lookup and raises
    # KeyError when the frame has no row labelled 0 (e.g. after a `drop`).
    additions = DataFrame({"CCOD": unseen, "LC": df["LC"].iloc[0]})

    return concat([df, additions], ignore_index=True)


In [0]:
def multi_search (words : List, column : str, df : DataFrame):
    '''Search several strings independently and return the combined matching rows.

    Each entry of ``words`` is looked up with ``kwd_df`` in ``column`` of ``df``.
    The per-keyword results are stacked in keyword order and fully duplicated
    rows are dropped. When ``words`` is empty, an empty frame with the columns
    of ``df`` is returned.
    '''

    matches = [kwd_df(word = w, column = column, df = df) for w in words]

    if not matches:
        # concat([]) raises ValueError("No objects to concatenate"); an empty
        # keyword list simply matches nothing.
        return df.iloc[0:0]

    return concat (matches).drop_duplicates()


In [0]:
# def lev_search (word : str, distance : int, df : DataFrame):

#     '''Search a string with levenshtein distance function on pr_name column of the dataframe.'''
#     '''The numbers are then joined on to the dataframe after it's been filtered by the distance specified '''

#     df.reset_index(inplace = True)

#     l_lev = []
    
#     for i in df["pr_name"]:
        
#         z = lev(word, i)
#         l_lev.append(z)
    
#     s_bcl = Series(l_lev).to_frame()
#     lev_hwt = s_bcl[lambda df: df < distance ]
#     lev_bc = df.join(lev_hwt)

#     return lev_bc.dropna()

## Landowner Names

In [0]:
#manually searching for strings

#ccod_l[ccod_l["pr_address"].str.contains("bs34 8jh", regex=False, na=False)]["pr_name"].unique()
#mod[mod["company_reg_no"].isna()]["pr_name"].unique()
#m[~m.index.isin(y.index)]
#m[m["company_reg_no"].isna()]

In [0]:
# a list to hold the final dataframes together

lookup_table = []

#### MOD

In [0]:
m = kwd_df("defence", "pr_name", ccod_lp)

In [0]:
ma = ["state", "ministry", "secretary", "minister"]

In [0]:
ms = multi_search(ma, "pr_name", m)["pr_name"].unique()
ms

In [0]:
mod = new_df(ms[:6], "Ministry of Defence")

In [0]:
mp = multi_search(ma, "pr_name", m)["postcode"].unique()
mp

In [0]:
ms1 = multi_search(mp, "postcode", ccod_lp)["pr_name"].unique()
ms1

In [0]:
mod

In [0]:
lookup_table.append(mod)

#### DEFRA

In [0]:
d = kwd_df ("environment", "pr_name", ccod_lp)


In [0]:
de = kwd_df ("agriculture", "pr_name", ccod_lp)

In [0]:
da = ["food", "rural", "affairs", "minister", "ministry", "state", "secretary"]

In [0]:
ds = multi_search(da, "pr_name", d)["pr_name"].unique()
print(len(ds))
ds

In [0]:
defra = new_df(ds, "DEFRA")
defra

In [0]:
ds1 = multi_search(da, "pr_name", de)["pr_name"].unique()
ds1

In [0]:
defra1 = append_rows (ds1[:-1], defra)
defra1

In [0]:
dp = multi_search(da, "pr_name", d)["postcode"].unique()
dp

In [0]:
len(dp)

In [0]:
ds1 = multi_search(dp[:11], "postcode", ccod_lp)["pr_name"].unique()
ds1

In [0]:
# Positions in `ds1` that are removed with numpy.delete in the next cell.
# NOTE(review): presumably chosen by inspecting the `ds1` output above; they are
# only valid for the row order of this specific CCOD extract — confirm after any data refresh.
n = [8, 10, 11, 12, 13, 20, 21, 23, 32, 35, 44, 46, 48, 49, 50, 52, 53, 55, 62, 64, 66, 67, 68, 70, 73, 74, 76, 77, 79, 80, 81, 83, 84, 88, 90]

In [0]:
import numpy

ds1_n = numpy.delete(ds1, n)

In [0]:
ds1_n

In [0]:
defra2 = append_rows (ds1_n, defra1)
defra2

In [0]:
# Drop the unwanted rows and renumber in one step, without mutating `defra2`.
# An in-place drop fails with a KeyError when the cell is re-run, and it can
# also alter `defra1`, because `append_rows` returns its input frame itself
# when there is nothing new to add.
defra3 = defra2.drop([47, 48, 49, 50, 51, 52, 53]).reset_index(drop=True)


In [0]:
defra3

In [0]:
# `ccod_l` is not defined anywhere above; query the postcode-augmented frame `ccod_lp`.
ccod_lp[ccod_lp["pr_name"].str.contains("nature england", regex=False, na=False)]

In [0]:
ds2 = multi_search(dp[11:21], "postcode", ccod_lp)["pr_name"].unique()
ds2

In [0]:
n_de = new_df (ds2, "DEFRA", [5, 22, 29, 35, 36, 46, 48])["CCOD"].unique()

In [0]:
defra4 = append_rows(n_de, defra3)

In [0]:
defra4

In [0]:
ds3 = multi_search(dp[21:31], "postcode", ccod_lp)["pr_name"].unique()
ds3

In [0]:
ds4 = multi_search(dp[31:41], "postcode", ccod_lp)["pr_name"].unique()
ds4

In [0]:
lookup_table.append(defra4)

#### Wildlife Trust

In [0]:
t = kwd_df("trust", "pr_name", ccod_lp)

In [0]:
wt = ["wild life", "wildlife"]

In [0]:
ws = multi_search(wt, "pr_name", t)["pr_name"].unique()
ws

In [0]:
wlt = new_df(ws, "Wildlife Trust")

In [0]:
wp = multi_search(ws, "pr_name", t)["company_reg_no"].unique()
wp

In [0]:
wp1 = multi_search(wp[1:], "company_reg_no", ccod_lp)["pr_name"].unique()
wp1

In [0]:
wp_s =Series(wp1)
wp_a = wp_s[wp_s.str.contains("trust", regex=False, na=False)].values

In [0]:
wlt1 = append_rows(wp_a, wlt).drop(177, axis = 0)

In [0]:
lookup_table.append(wlt1)

#### Woodland Trust

In [0]:
wd = ["woodland", "wood land"]

In [0]:
# Search the trust subset with the woodland keywords `wd`, not the MOD keyword list `ma`.
# NOTE(review): the positional selections in the following cells depend on this output — re-check them.
wds = multi_search(wd, "pr_name", t)["pr_name"].unique()
wds

In [0]:
wldt = new_df(wds[:20], "Woodland Trust")
wdldt = wldt.drop([1,2,5,6,14,15, 16], axis=0).reset_index(drop=True)
wdldt

In [0]:
wldp = multi_search(wdldt["CCOD"].values, "pr_name", t)["postcode"].unique()
wldp

In [0]:
wld1 = multi_search(wldp, "postcode", ccod_lp)["pr_name"].unique()
wld1

In [0]:
wdldt1 = append_rows(wld1[2:4],wdldt)
wdldt1

In [0]:
lookup_table.append(wdldt1)

#### MOJ

In [0]:
j = kwd_df("justice", "pr_name", ccod_lp)

In [0]:
ja = ["state", "ministry", "secretary", "minister"]

In [0]:
js = multi_search(ja, "pr_name", j)["pr_name"].unique()
js

In [0]:
moj = new_df(js, "Ministry of Justice")

In [0]:
jp = multi_search(js, "pr_name", j)["postcode"].unique()
jp

In [0]:
jp1 = multi_search(jp, "postcode", ccod_lp)["pr_name"].unique()
jp1

In [0]:
moj

In [0]:
lookup_table.append(moj)

#### Hawk and Owl Trust

In [0]:
hwo = ["hawk and owl", "hawk & owl", "hawkowl"]

In [0]:
hs = multi_search(hwo, "pr_name", t)["pr_name"].unique()
hs

In [0]:
unique_values("hawk and owl trust", "pr_name", "pr_name", ccod_lp)

In [0]:
hwkot = new_df(hs, "Hawk and Owl Trust")

In [0]:
hop = multi_search(hs, "pr_name", t)["company_reg_no"].unique()
hop

In [0]:
hop1 = multi_search(hop[:2], "company_reg_no", ccod_lp)["pr_name"].unique()
hop1

In [0]:
hwkot

In [0]:
lookup_table.append(hwkot)

#### Plantlife

In [0]:
p = kwd_df("plant", "pr_name", ccod_lp)

In [0]:
pa = ["plantlife", "plant life", "life"]

In [0]:
ps = multi_search(pa, "pr_name", p)["pr_name"].unique()
ps

In [0]:
plal = new_df(ps, "PlantLife")

In [0]:
# Look the Plantlife names up in the "plant" subset `p`; `m` is the MOD "defence" subset.
pp = multi_search(ps, "pr_name", p)["postcode"].unique()
pp

In [0]:
pp1 = multi_search(pp, "postcode", ccod_lp)["pr_name"].unique()
pp1

In [0]:
# Look the Plantlife names up in the "plant" subset `p`; `m` is the MOD "defence" subset.
pp2 = multi_search(ps, "pr_name", p)["company_reg_no"].unique()
pp2

In [0]:
pp3 = multi_search(pp2[1:], "company_reg_no", ccod_lp)["pr_name"].unique()
pp3

In [0]:
plal1 = append_rows(pp3, plal)

In [0]:
lookup_table.append(plal1)

#### Wildfowl and Wetlands Trust

In [0]:
wfwl = ["wildfowl", "wetland"]

In [0]:
wfs = multi_search(wfwl, "pr_name", t)["pr_name"].unique()
wfs

In [0]:
wlfwtlt = new_df([wfs[3], wfs[5]], "Wildfowl and Wetlands Trust")
wlfwtlt

In [0]:
wfp = multi_search(wlfwtlt["CCOD"].values, "pr_name", t)["postcode"].unique()
wfp

In [0]:
wfpp = multi_search(wfp, "postcode", ccod_lp)["pr_name"].unique()
wfpp

In [0]:
wlfwtlt1 = append_rows(wfpp, wlfwtlt)
wlfwtlt1

In [0]:
lookup_table.append(wlfwtlt1)

#### Butterfly Conservation

In [0]:
c = kwd_df("conserv", "pr_name", ccod_lp)

In [0]:
bca = ["butterfly", "butterfly conservation"]

In [0]:
bcs = multi_search(bca, "pr_name", c)["pr_name"].unique()
bcs

In [0]:
buco = new_df(bcs, "Butterfly Conservation")

In [0]:
bcp = multi_search(bcs, "pr_name", c)["postcode"].unique()
bcp

In [0]:
bcp1 = multi_search(bcp, "postcode", ccod_lp)["pr_name"].unique()
bcp1

In [0]:
bcp2 = multi_search(bcs, "pr_name", c)["company_reg_no"].unique()
bcp2

In [0]:
bcp3 = multi_search([bcp2[0], bcp2[2]], "company_reg_no", ccod_lp)["pr_name"].unique()
bcp3

In [0]:
buco

In [0]:
lookup_table.append(buco)

#### RSPB

In [0]:
b = kwd_df("bird", "pr_name", ccod_lp)

In [0]:
ba = ["royal", "society", "protection"]

In [0]:
bs = multi_search(ba, "pr_name", b)["pr_name"].unique()
bs

In [0]:
rspb = new_df(bs[:3], "RSPB")
rspb

In [0]:
bp = multi_search(rspb["CCOD"].values, "pr_name", b)["postcode"].unique()
bp

In [0]:
bp1 = multi_search(bp, "postcode", ccod_lp)["pr_name"].unique()
bp1

In [0]:
bp2 = multi_search(rspb["CCOD"].values, "pr_name", b)["company_reg_no"].unique()
bp2

In [0]:
bp3 = multi_search(bp2[1:], "company_reg_no", ccod_lp)["pr_name"].unique()
bp3

In [0]:
rspb

In [0]:
lookup_table.append(rspb)

#### National Trust

In [0]:
nt = kwd_df("national trust", "pr_name", ccod_lp)

In [0]:
nt["pr_name"].unique()

In [0]:
nta = ["national trust company", "historic", "the national trust"]

In [0]:
nts = multi_search(nta, "pr_name", nt)["pr_name"].unique()

In [0]:
nts

In [0]:
ntdf = new_df(nts[4:], "National Trust")
ntdf

In [0]:
ntp = multi_search(ntdf["CCOD"].values, "pr_name", ccod_lp)["postcode"].unique()
ntp

In [0]:
ntp1 = multi_search(ntp[3:], "postcode", ccod_lp)["pr_name"].unique()
ntp1

In [0]:
ntdf1 = append_rows([ntp1[9],ntp1[-1]], ntdf)

In [0]:
ntp2 = multi_search(ntp[:2], "postcode", ccod_lp)["pr_name"].unique()
ntp2

In [0]:
ntdf2 = append_rows(ntp2, ntdf1).drop([16])
ntdf2

In [0]:
ntp3 = multi_search(ntdf2["CCOD"].values, "pr_name", ccod_lp)["company_reg_no"].unique()
ntp3

In [0]:
ntp4 = multi_search(ntp3[1:], "company_reg_no", ccod_lp)["pr_name"].unique()
ntp4

In [0]:
lookup_table.append(ntdf2)

#### Forestry Commission

In [0]:
f = kwd_df("forest", "pr_name", ccod_lp)

In [0]:
fa = ["commission", "england", "public"]

In [0]:
fs = multi_search(fa, "pr_name", f)["pr_name"].unique()
fs

In [0]:
ccod_lp[ccod_lp["pr_name"].str.contains(fs[0], regex=False, na=False)]["postcode"].unique()

In [0]:
ccod_lp[ccod_lp["pr_name"].str.contains(fs[0], regex=False, na=False)]["company_reg_no"].unique()

In [0]:
fs_ad = ccod_lp[ccod_lp["pr_name"].str.contains(fs[0], regex=False, na=False)]["pr_address"].unique()

In [0]:
fs_ad

In [0]:
kwd_df(fs_ad[0], "pr_address", ccod_lp)

In [0]:
fsdf = new_df(fs, "Forestry Commission", [0])
fsdf

In [0]:
lookup_table.append(fsdf)

#### Environment Agency

In [0]:
ea = kwd_df("environment agency", "pr_name", ccod_lp)
ea

In [0]:
ea["pr_name"].unique()

In [0]:
eadf = new_df(ea["pr_name"].unique(), "Environment Agency")
eadf

In [0]:
ea_p = ea["postcode"].unique()
ea_p

In [0]:
eap = multi_search(ea_p[1:], "postcode", ccod_lp)["pr_name"].unique()
eap

In [0]:
eadf1 = append_rows(eap, eadf).drop([4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 23, 24, 25]).reset_index(drop = True)
eadf1

In [0]:
eap2 = multi_search(eadf1["CCOD"].values, "pr_name", ccod_lp)["company_reg_no"].unique()
eap2

In [0]:
eap3 = multi_search(eap2[1:3], "company_reg_no", ccod_lp)["pr_name"].unique()
eap3

In [0]:
kwd_df("wa4 1hg", "postcode", ccod_lp)["pr_name"].unique()

In [0]:
eap4 = multi_search(eadf1["CCOD"].values, "pr_name", ccod_lp)["postcode"].unique()
eap4

In [0]:
eap5 = multi_search(eap4[1:], "postcode", ccod_lp)["pr_name"].unique()
eap5

In [0]:
lookup_table.append(eadf1)

In [0]:
owners = concat (lookup_table, ignore_index = True)

In [0]:
owners["LC"].unique()

In [0]:
from pandas import read_csv

csv = read_csv("s3://s3-ranch-019/30-by-30/ownership_lookup.csv")

In [0]:
csv

In [0]:
# Merge the stored lookup with the rows built in this notebook. Exact duplicates
# are removed so that re-running the notebook against an already-updated lookup
# file does not append the same rows again.
lookup_csv = concat([csv, owners], ignore_index=True).drop_duplicates().reset_index(drop=True)

In [0]:
lookup_csv

In [0]:
lookup_csv.to_csv("ownership_lookup.csv", index = False)

In [0]:
from boto3 import resource

s3 = resource("s3")

bucket = s3.Bucket("s3-ranch-019")
# Upload the file written by `to_csv` above (same relative path in the working
# directory) rather than a user-specific absolute path that may point at a
# different or stale copy.
bucket.upload_file("ownership_lookup.csv", "30-by-30/ownership_lookup.csv", ExtraArgs={'ACL': 'bucket-owner-full-control'})