# Given a vector file, create training data based in generating a grid of points within extent of vector file, intersect those points with the polygons in the vector file to assign them the class

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry import Polygon

In [2]:
#filename_vector = "/LUSTRE/MADMEX/tasks/2020/4_landcover_manglares_check/train_fc_quintana_roo_s2_2018_madmex_17_clases_59_-31.shp"

filename_vector = "/LUSTRE/MADMEX/products/landcover/sentinel2/2018/estados/Quintana_Roo/17_31_gpkg/Quintana_Roo_sentinel2_2018.gpkg"

In [3]:
gdf = gpd.read_file(filename_vector)

In [4]:
gdf.head()

Unnamed: 0,clave_31,descr_17,clave_17,area,descr_31,geometry
0,29,Urbano y construido,15,0.18,Urbano y Construido,"POLYGON ((3929392.078 846107.703, 3929392.078 ..."
1,8,Manglar y petén,4,0.61,Manglar y Petén,"POLYGON ((3929422.078 846007.703, 3929422.078 ..."
2,28,Tierras agrícolas,14,20.072754,Tierras Agrícolas,"MULTIPOLYGON (((3929502.079 846348.239, 392950..."
3,22,Vegetación acuática menor,10,2.99,Tular,"POLYGON ((3929802.078 846077.703, 3929802.078 ..."
4,12,Selvas secas,6,0.043354,Selva Mediana Caducifolia y Subcaducifolia,"POLYGON ((3923392.079 845771.530, 3923392.079 ..."


**Get extent:**

In [5]:
bbox = gdf.total_bounds

In [6]:
p1 = Point(bbox[0], bbox[3])
p2 = Point(bbox[2], bbox[3])
p3 = Point(bbox[2], bbox[1])
p4 = Point(bbox[0], bbox[1])

In [7]:
(p1, p2, p3, p4)

(<shapely.geometry.point.Point at 0x7f2d28739828>,
 <shapely.geometry.point.Point at 0x7f2d287397b8>,
 <shapely.geometry.point.Point at 0x7f2d287399e8>,
 <shapely.geometry.point.Point at 0x7f2d28734a58>)

In [8]:
#np1 = (p1.coords.xy[0][0], p1.coords.xy[1][0])
#np2 = (p2.coords.xy[0][0], p2.coords.xy[1][0])
#np3 = (p3.coords.xy[0][0], p3.coords.xy[1][0])
#np4 = (p4.coords.xy[0][0], p4.coords.xy[1][0])

In [9]:
#np1

In [10]:
#np2

In [11]:
#np3

In [12]:
#np4

In [13]:
x_min, x_max = p1.coords.xy[0][0], p3.coords.xy[0][0]
y_min, y_max = p3.coords.xy[1][0], p1.coords.xy[1][0]

In [14]:
x_min, x_max 

(3827283.759597446, 4082997.360296603)

In [15]:
y_min, y_max 

(719640.0231194737, 1145298.318330552)

**35 dc tiles covers region of Quintana Roo, 10e4 points for each dc tile should be enough which amounts to 350,000 points.
Will use approximately 600 per dimension which amounts to 600*600 = 360,000**

In [16]:
n_sample = 600

In [17]:
x_points = np.linspace(x_min, x_max, n_sample)

In [18]:
y_points = np.linspace(y_min, y_max, n_sample)

In [19]:
#x_points, y_points = np.mgrid[x_min:x_max:(n_sample*1j), y_min:y_max:(n_sample*1j)]

In [20]:
#x_points

In [21]:
#x_points_ravel = np.ravel(x_points[0])
#y_points_ravel = np.ravel(y_points[1])

In [22]:
len(y_points)

600

In [23]:
df_points = pd.DataFrame(columns = ['X', 'Y'])

In [24]:
df_points

Unnamed: 0,X,Y


In [25]:
for value in y_points:
    df_aux = pd.DataFrame({'X': x_points,
                           'Y': np.repeat(value, n_sample)})
    df_points = pd.concat([df_points,
                           df_aux],
                          ignore_index = True)

In [26]:
df_points.head()

Unnamed: 0,X,Y
0,3827284.0,719640.023119
1,3827711.0,719640.023119
2,3828138.0,719640.023119
3,3828564.0,719640.023119
4,3828991.0,719640.023119


In [27]:
df_points['coords'] = list(zip(df_points['X'], df_points['Y']))

In [28]:
df_points['coords'] = df_points['coords'].apply(Point)

In [29]:
df_points

Unnamed: 0,X,Y,coords
0,3.827284e+06,7.196400e+05,POINT (3827283.759597446 719640.0231194737)
1,3.827711e+06,7.196400e+05,POINT (3827710.660433338 719640.0231194737)
2,3.828138e+06,7.196400e+05,POINT (3828137.561269229 719640.0231194737)
3,3.828564e+06,7.196400e+05,POINT (3828564.462105121 719640.0231194737)
4,3.828991e+06,7.196400e+05,POINT (3828991.362941013 719640.0231194737)
...,...,...,...
359995,4.081290e+06,1.145298e+06,POINT (4081289.756953036 1145298.318330552)
359996,4.081717e+06,1.145298e+06,POINT (4081716.657788928 1145298.318330552)
359997,4.082144e+06,1.145298e+06,POINT (4082143.558624819 1145298.318330552)
359998,4.082570e+06,1.145298e+06,POINT (4082570.459460711 1145298.318330552)


In [30]:
df_points.drop(['X', 'Y'], axis = 1, inplace = True)

In [31]:
df_points

Unnamed: 0,coords
0,POINT (3827283.759597446 719640.0231194737)
1,POINT (3827710.660433338 719640.0231194737)
2,POINT (3828137.561269229 719640.0231194737)
3,POINT (3828564.462105121 719640.0231194737)
4,POINT (3828991.362941013 719640.0231194737)
...,...
359995,POINT (4081289.756953036 1145298.318330552)
359996,POINT (4081716.657788928 1145298.318330552)
359997,POINT (4082143.558624819 1145298.318330552)
359998,POINT (4082570.459460711 1145298.318330552)


In [32]:
gdf_points = gpd.GeoDataFrame(df_points, geometry='coords')

In [33]:
#gdf_points['coords'].plot()
#plt.show()

In [34]:
gdf.crs

{'proj': 'lcc',
 'lat_1': 17.5,
 'lat_2': 29.5,
 'lat_0': 12,
 'lon_0': -102,
 'x_0': 2500000,
 'y_0': 0,
 'datum': 'WGS84',
 'units': 'm',
 'no_defs': True,
 'wktext': True}

In [35]:
gdf_points.crs = gdf.crs

# Intersect with training data and assing code

In [36]:
gdf_points_intersect = gpd.sjoin(gdf_points,
                                 gdf,
                                 op = "intersects",
                                 how = "inner")

In [37]:
gdf_points_intersect.reset_index(drop=True,inplace=True)

In [38]:
gdf_points_intersect

Unnamed: 0,coords,index_right,clave_31,descr_17,clave_17,area,descr_31
0,POINT (3887049.877 720350.638),48284,29,Urbano y construido,15,62.64,Urbano y Construido
1,POINT (3887476.777 720350.638),48270,7,Selvas húmedas,5,317.78,"Selva Baja y Mediana Subperennifolia, Bosque d..."
2,POINT (3888330.579 720350.638),48270,7,Selvas húmedas,5,317.78,"Selva Baja y Mediana Subperennifolia, Bosque d..."
3,POINT (3889184.381 720350.638),48270,7,Selvas húmedas,5,317.78,"Selva Baja y Mediana Subperennifolia, Bosque d..."
4,POINT (3889611.282 720350.638),48270,7,Selvas húmedas,5,317.78,"Selva Baja y Mediana Subperennifolia, Bosque d..."
...,...,...,...,...,...,...,...
146709,POINT (4039453.475 1143877.089),163348,31,Agua,17,19.63,Agua
146710,POINT (4031342.359 1144587.703),163243,29,Urbano y construido,15,3.04,Urbano y Construido
146711,POINT (4033049.962 1144587.703),163213,31,Agua,17,1.09,Agua
146712,POINT (4033903.764 1144587.703),163221,29,Urbano y construido,15,7.52,Urbano y Construido


In [39]:
name_of_column = "code"
name_of_column = "clave_17"

In [40]:
columns_to_keep = [column for column in list(gdf_points_intersect.columns) if column in ["coords", name_of_column]]

In [41]:
columns_to_keep

['coords', 'clave_17']

In [42]:
gdf_points_intersect = gdf_points_intersect[columns_to_keep]

In [43]:
gdf_points_intersect.head()

Unnamed: 0,coords,clave_17
0,POINT (3887049.877 720350.638),15
1,POINT (3887476.777 720350.638),5
2,POINT (3888330.579 720350.638),5
3,POINT (3889184.381 720350.638),5
4,POINT (3889611.282 720350.638),5


In [44]:
path = "/LUSTRE/MADMEX/tasks/2020/4_landcover_manglares_check/"

In [45]:
filename_points_training_data = path + "training_data_points_quintana_roo_s2_2018_madmex_17_classes.shp"

In [46]:
filename_points_training_data

'/LUSTRE/MADMEX/tasks/2020/4_landcover_manglares_check/training_data_points_quintana_roo_s2_2018_madmex_17_classes.shp'

In [47]:
gdf_points_intersect.to_file(filename_points_training_data)

In [48]:
gdf_points_intersect["clave_17"].unique()

array([15,  5, 14, 13, 17, 10,  4,  6, 16, 11])

In [49]:
gdf_points_intersect["clave_17"].value_counts()

5     104540
6      12330
10      8391
17      6120
14      5995
4       5422
15      2241
13      1082
16       475
11       118
Name: clave_17, dtype: int64