### Calculate frontline

In [308]:
# Source archive: https://github.com/zhukovyuri/VIINA/blob/master/Data/control_latest.zip
# Download the archive from the GitHub link above and unpack it.
import requests
import zipfile
import io
# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get('https://github.com/zhukovyuri/VIINA/blob/master/Data/control_latest.zip?raw=true', timeout=60)
# Stop here on an HTTP error; otherwise the error page would be handed to ZipFile below
# and fail there with a much less helpful message.
r.raise_for_status()
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    # No target path is given, so members are extracted into the current working directory.
    z.extractall()

# load control_latest.csv (read from the current working directory, where it was just extracted)
import pandas as pd
df = pd.read_csv("control_latest.csv")

In [309]:
# One point per row of `df`, built from its longitude/latitude columns and tagged as EPSG:4326.
# The frame is then reprojected to EPSG:32636 before buffering.
locality_points = gpd.points_from_xy(df.longitude, df.latitude)
gdf = gpd.GeoDataFrame(df, geometry=locality_points).set_crs("epsg:4326").to_crs("epsg:32636")
# Fold the "CONTESTED" label into "RU"; note the replacement is applied to every column.
gdf.replace("CONTESTED", "RU", inplace = True)
# Replace each point by a buffer of 5000 CRS units around it
# (presumably metres, i.e. a 5 km radius around the locality -- confirm for EPSG:32636).
gdf['geometry'] = gdf.geometry.buffer(5000)
# Return to EPSG:4326 for the following cells.
gdf = gdf.to_crs("epsg:4326")

In [None]:
from joblib import Parallel, delayed


def getDistinctAreas(gdf, col):
    """Return the overlap between the first two areas obtained by dissolving on ``col``.

    The rows of ``gdf`` are dissolved (merged) by the values in ``col``; the
    geometries of the first two resulting groups are then intersected. No
    buffering is done here: the geometries are used exactly as passed in.

    Parameters
    ----------
    gdf : GeoDataFrame
        Frame with a ``geometry`` column and the column named by ``col``.
    col : str
        Name of the column whose values define the groups to dissolve.

    Returns
    -------
    geometry
        Whatever ``first_area.intersection(second_area)`` returns.

    Raises
    ------
    ValueError
        If ``col`` yields fewer than two groups, so there is nothing to
        intersect (previously this surfaced as an opaque ``KeyError: 1``).
    """
    dissolved = gdf.dissolve(by = col).reset_index()
    if len(dissolved) < 2:
        raise ValueError(
            f"column {col!r} yields {len(dissolved)} group(s) after dissolve; "
            "two are needed to compute an intersection"
        )
    areas = dissolved['geometry']
    # intersect russian and ukrainian radii
    # (positional access: the first two groups in the order dissolve() returns them)
    return areas.iloc[0].intersection(areas.iloc[1])

# Every column of `gdf` whose name contains "ctr" is selected for processing.
cols = list(filter(lambda name: "ctr" in name, gdf.columns))
# Empty frame with a 'date' and a 'geometry' column.
boundaries = pd.DataFrame(columns=['date', 'geometry'])

def getBoundary(col):
    """Build a one-row frame with the boundary geometry for one column of ``gdf``.

    Reads the module-level ``gdf``. The date label is sliced out of the column
    name: characters 4-7, 8-9 and 10-11 are joined with dashes (presumably the
    names look like ``ctr_YYYYMMDD...`` -- confirm against the data).

    Parameters
    ----------
    col : str
        Name of the column in ``gdf`` to dissolve by.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``date`` (the sliced label) and ``geometry``
        (the value returned by ``getDistinctAreas`` for this column).
    """
    snapshot = gdf[['geometry', col]]
    date_label = "-".join((col[4:8], col[8:10], col[10:12]))
    overlap = getDistinctAreas(snapshot, col)
    return pd.DataFrame({"date" : [date_label], "geometry" : [overlap]})

# Call getBoundary once per entry of `cols`, with n_jobs=4;
# `results` collects the single-row DataFrames the calls return.
results = Parallel(n_jobs=4, verbose = 10)(map(delayed(getBoundary), cols))
    

In [317]:
# Stack the one-row frames produced by the parallel jobs. Each of them has index 0,
# so ignore_index=True is used to give the combined frame unique row labels.
test = pd.concat(results, ignore_index=True)

# Promote to a GeoDataFrame and record the CRS explicitly: the geometries were derived
# from `gdf`, which is in EPSG:4326 by this point.
test = gpd.GeoDataFrame(test, geometry = test.geometry, crs = "epsg:4326")

In [318]:
# Encode the geometries as WKT text so the table can be written as a plain CSV.
# The isinstance guard keeps the cell re-runnable: after the first run `test` is
# already the converted DataFrame and no longer has a to_wkt() method.
if isinstance(test, gpd.GeoDataFrame):
    test = test.to_wkt()
frontline_path = os.path.join(main_dir, "data", "final", "frontline.csv")
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(frontline_path), exist_ok=True)
test.to_csv(frontline_path, index = False)

In [319]:
# upload frontline.csv to s3
import boto3
# Credentials are read from the environment instead of being written into the notebook.
# An unset or empty variable becomes None, which lets boto3 fall back to its own
# credential lookup rather than signing requests with an empty key.
AWS_key = os.environ.get('AWS_ACCESS_KEY_ID') or None
AWS_secret = os.environ.get('AWS_SECRET_ACCESS_KEY') or None
s3=boto3.resource(  # for writing data
    's3',
    aws_access_key_id=AWS_key,
    aws_secret_access_key=AWS_secret,
)
# Arguments: local file path, bucket name, object key inside the bucket.
s3.meta.client.upload_file(os.path.join(main_dir, "data", "final", "frontline.csv"), 'ipsos-dvd', 'ukr/data/frontline.csv')

#### Calculate the standard deviation (SD) of distance to the front

In [17]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
    TimestampType,
)
from pyspark.sql.functions import (
    from_utc_timestamp, to_utc_timestamp, dayofyear, weekofyear, month, year, when, col, udf, row_number,
    lead, last, lag, avg, max, percentile_approx, min, unix_timestamp,
    monotonically_increasing_id, pandas_udf, PandasUDFType, asc, lit, count, concat_ws,
    countDistinct, split, regexp_replace, explode, dayofweek, hour, date_trunc, struct, expr
)


# Get the current Spark session, or create one with the application name "front".
spark = SparkSession.builder.appName("front").getOrCreate()
# Read the CSV data under data/final/distance_to_front, using the first row as column names.
distance_path = os.path.join(main_dir, "data", "final", "distance_to_front")
front = spark.read.csv(distance_path, header = True)

In [18]:
# Keep only the distance column, then convert the result to a pandas DataFrame.
# Note: `front` is rebound here from the Spark DataFrame to the pandas one.
distance_only = front.select('distance_to_front')
front = distance_only.toPandas()

                                                                                

In [23]:
# Summary statistics of the distance column (displayed as the cell output).
front.loc[:, 'distance_to_front'].describe()

count    3.760947e+06
mean     2.698047e+02
std      2.765611e+02
min      0.000000e+00
25%      4.258613e+01
50%      1.572292e+02
75%      4.362111e+02
max      1.120770e+03
Name: distance_to_front, dtype: float64

In [21]:
# Cast the column to float, multiply by 111139 and divide by 1000
# (presumably degrees -> metres -> kilometres; TODO confirm the unit of the input column).
# NOTE(review): the column is overwritten in place, so running this cell twice rescales twice.
front['distance_to_front'] = front['distance_to_front'].astype(float).mul(111139).div(1000)

In [24]:
# Print the standard deviation and then the mean of the distance column, one per line.
print(front.distance_to_front.std(), front.distance_to_front.mean(), sep="\n")


276.5610572377576
269.8046590875043
