# Missing data problem

We will begin by installing the library and downloading the data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/GP211/2023-fall-class-notebooks/blob/main/in-class/Missing-data-2d.ipynb)

In [None]:
%load_ext autoreload
%autoreload 2
import sys

! python3 -m pip install  "sep_plot @ git+https://github.com/SEP-software/sep-plot.git@6331a1e36d8e3cdb4cfbc3539f31bdad1eb465a7" 


In [None]:
! wget https://github.com/GP211/2023-fall-class-notebooks/raw/main/data/s2000.5


## Read in the file

First we will look at the file and then read it in two different ways: once using plain Python, and once using pandas.

In [None]:
! cat s2000.5

In [None]:
import numpy as np
# Parse the whitespace-delimited text file by hand.  Fields 2, 3 and 4
# (0-based) of every row are kept as the x, y and z samples.
x = []
y = []
z = []
# The context manager closes the file as soon as parsing is finished
# (the handle used to be left open for the life of the kernel).
with open("s2000.5") as fl:
    for ln in fl:
        fields = ln.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline) instead of raising
            # IndexError on fields[2].
            continue
        x.append(float(fields[2]))
        y.append(float(fields[3]))
        z.append(float(fields[4]))
xar = np.array(x)
yar = np.array(y)
zar = np.array(z)

In [None]:
import pandas as pd

# Read the file into a DataFrame.  Columns 2, 3 and 4 (0-based) are kept and
# named x, y, z.  sep=r"\s+" splits on any run of whitespace; it replaces
# delim_whitespace=True, which is deprecated in recent pandas releases.
df = pd.read_csv(
    "s2000.5",
    sep=r"\s+",
    header=None,
    usecols=[2, 3, 4],
    names=["x", "y", "z"],
)

# Convert columns to numpy arrays.  A later cell subtracts the mean from zar
# in place, so take independent writable copies rather than views of df.
xar = df["x"].to_numpy(copy=True)
yar = df["y"].to_numpy(copy=True)
zar = df["z"].to_numpy(copy=True)


## Create the model space

In [None]:
from sep_python import Hypercube
# Regular 40 x 40 model grid: origin (o), spacing (d) and sample count (n)
# for axis 1 (labelled "x") and axis 2 (labelled "y").
o1, o2 = 5200, -22000
d1, d2 = 600, 800
n1, n2 = 40, 40
reg_model = Hypercube.set_with_ns(
    ns=[n1, n2],
    os=[o1, o2],
    ds=[d1, d2],
    labels=["x", "y"],
)

# Map the data to a regular grid

We are going to use the 2-D binning operator (`Bin2D`) that we used earlier.

In [None]:
import copy
from generic_solver._pyOperator import Operator
class Bin2D(Operator):
    """Nearest-cell binning between a 2-D gridded model and scattered data.

    Every scattered point is assigned to the grid cell whose index is
    ``floor((coord - o) / d + 0.5)`` along each axis.  The forward operator
    reads the model value of that cell into the data sample; the adjoint
    adds the data sample into that cell.  Points that fall outside the grid
    get a zero scale factor and therefore contribute nothing.
    """

    def __init__(self, mod, dat, xy):
        """
        Initialize the binning operator.

        Args:
            mod: gridded model vector; ``mod.get_hyper()`` supplies n, o, d
                for axes 0 and 1.
            dat: data vector with one entry per scattered point.
            xy: array indexed as ``xy[0]`` (axis-0 coordinates) and
                ``xy[1]`` (axis-1 coordinates), with ``xy.shape[1]`` points.
        """
        super().__init__(mod, dat)
        hyper = mod.get_hyper()

        ax0 = hyper.axes[0]
        n1, o1, d1 = ax0.n, ax0.o, ax0.d

        ax1 = hyper.axes[1]
        n2, o2, d2 = ax1.n, ax1.o, ax1.d

        # Use floor() rather than an int cast: a cast truncates toward zero,
        # which would silently move points lying up to 1.5 cells below the
        # origin into cell 0 instead of rejecting them.
        cell1 = np.floor((np.asarray(xy[0], dtype=float) - o1) / d1 + 0.5)
        cell2 = np.floor((np.asarray(xy[1], dtype=float) - o2) / d2 + 0.5)

        # Check for out-of-bounds indices (NaN coordinates compare False and
        # are rejected as well).
        inside = (cell1 >= 0) & (cell1 < n1) & (cell2 >= 0) & (cell2 < n2)

        # Rejected points are parked on cell (0, 0) with a zero scale so the
        # loops below need no special case.
        self._i1 = np.where(inside, cell1, 0).astype(np.int32)
        self._i2 = np.where(inside, cell2, 0).astype(np.int32)
        self._sc = inside.astype(np.float64)

    def forward(self, add, mod, dat):
        """
        Forward operation: dat[k] (+)= sc[k] * mod[i2[k], i1[k]].

        When ``add`` is false ``dat`` is zeroed first.
        """
        self.checkDomainRange(mod, dat)
        if not add:
            dat.zero()
        # Write through dat[index]; accumulating into the loop variable of an
        # iteration over dat only rebinds a temporary and leaves dat untouched.
        for index, (i1_val, i2_val, sc_val) in enumerate(
            zip(self._i1, self._i2, self._sc)
        ):
            dat[index] += sc_val * mod[i2_val, i1_val]

    def adjoint(self, add, mod, dat):
        """
        Adjoint operation: mod[i2[k], i1[k]] (+)= sc[k] * dat[k].

        When ``add`` is false ``mod`` is zeroed first.
        """
        self.checkDomainRange(mod, dat)
        if not add:
            mod.zero()

        for i1_val, i2_val, sc_val, d_val in zip(self._i1, self._i2, self._sc, dat):
            mod[i2_val, i1_val] += sc_val * d_val


In [None]:
from sep_python import get_sep_vector
from sep_plot import Grey
import holoviews as hv
hv.extension('bokeh','matplotlib')
# Stack the coordinate arrays: xy[0] holds x, xy[1] holds y.
xy = np.stack((xar, yar))

# Remove the mean of the measurements, writing the result back into zar.
avg = zar.sum() / zar.shape[0]
zar[:] = zar - avg
zvec = get_sep_vector(zar)

# Sum every sample into its grid cell with the adjoint of the binning operator.
data = get_sep_vector(reg_model)
binOp = Bin2D(data, zvec, xy)
binOp.adjoint(False, data, zvec)

# Binning a vector of ones gives the number of samples that hit each cell.
ones = zvec.clone()
ones[:] = 1
sc = data.clone()
binOp.adjoint(False, sc, ones)

# Turn per-cell sums into per-cell averages; cells with no hits are left alone.
for i2 in range(sc.shape[0]):
    for i1 in range(sc.shape[1]):
        hits = sc[i2, i1]
        if hits == 0:
            continue
        data[i2, i1] /= hits

#Grey(data)
d = data.get_nd_array()
print(zar.min(), zar.max(), d.min(), d.max())

## Our data fitting operator

Our data-fitting operator will have 1s where we have known data and 0s everywhere else.

In [None]:
import copy
from generic_solver._pyOperator import Operator
class Jop(Operator):
    """Diagonal selector (mask) operator.

    The mask is derived from the ``dat`` vector passed to the constructor:
    cells whose value equals ``not_data`` become 0, every other cell
    becomes 1.  Forward and adjoint both multiply element-wise by that mask.
    """

    def __init__(self, mod, dat, not_data=0):
        """
        Initialize our selector operator

        Args:
            mod: model vector; its hypercube supplies the grid dimensions.
            dat: data vector; a clone of it is converted into the 0/1 mask.
            not_data: value in ``dat`` that marks a cell as unknown.
        """
        super().__init__(mod, dat)

        self._jop = dat.clone()

        # Take the grid size from the argument, not from a notebook-level
        # variable named `model`, so the class works wherever it is built.
        hyper = mod.get_hyper()
        n1 = hyper.axes[0].n
        n2 = hyper.axes[1].n

        for i2 in range(n2):
            for i1 in range(n1):
                if self._jop[i2, i1] == not_data:
                    self._jop[i2, i1] = 0
                else:
                    self._jop[i2, i1] = 1

    def forward(self, add, mod, dat):
        """
        Forward operation: dat (+)= mod * mask.

        When ``add`` is false ``dat`` is zeroed first.
        """
        self.checkDomainRange(mod, dat)
        if not add:
            dat.zero()
        dat[:, :] = dat[:, :] + mod[:, :] * self._jop[:, :]

    def adjoint(self, add, mod, dat):
        """
        Adjoint operation: mod (+)= dat * mask.

        When ``add`` is false ``mod`` is zeroed first.
        """
        self.checkDomainRange(mod, dat)
        if not add:
            mod.zero()
        mod[:, :] = mod[:, :] + dat[:, :] * self._jop[:, :]


## Regularization operator

Here is a simple Laplacian operator.

In [None]:
import copy
from generic_solver._pyOperator import Operator
from numba import njit
class Lap(Operator):
    """Five-point Laplacian wrapper around lap_forward / lap_adjoint."""

    def __init__(self, mod, dat):
        """
        Laplacian operator

        Registers ``mod`` and ``dat`` with the base class and keeps a clone
        of ``dat`` in ``self._jop`` (not read by forward or adjoint).
        """
        super().__init__(mod, dat)
        self._jop = dat.clone()

    def forward(self, add, mod, dat):
        """
        Forward operation.

        Zeroes ``dat`` unless ``add`` is set, then accumulates the stencil
        output into ``dat`` through the underlying arrays.
        """
        self.checkDomainRange(mod, dat)
        if not add:
            dat.zero()
        model_arr = mod.get_nd_array()
        data_arr = dat.get_nd_array()
        lap_forward(model_arr, data_arr)

    def adjoint(self, add, mod, dat):
        """
        Adjoint operation.

        Zeroes ``mod`` unless ``add`` is set, then accumulates the adjoint
        stencil output into ``mod`` through the underlying arrays.
        """
        self.checkDomainRange(mod, dat)
        if not add:
            mod.zero()
        model_arr = mod.get_nd_array()
        data_arr = dat.get_nd_array()
        lap_adjoint(model_arr, data_arr)

@njit
def lap_forward(mod,dat):
    """Accumulate the five-point stencil of ``mod`` into the interior of ``dat``.

    For every interior sample, 4 * centre minus the four direct neighbours
    is added to ``dat``; the first/last row and column of ``dat`` are not
    written.
    """
    n_rows, n_cols = dat.shape
    for row in range(1, n_rows - 1):
        for col in range(1, n_cols - 1):
            centre = mod[row, col] * 4
            dat[row, col] += (
                centre
                - mod[row - 1, col]
                - mod[row + 1, col]
                - mod[row, col - 1]
                - mod[row, col + 1]
            )
        
        
@njit
def lap_adjoint(mod,dat):
    """Accumulate the adjoint of the five-point stencil into ``mod``.

    Each interior sample of ``dat`` is spread back onto ``mod``: +4 times
    the sample on the centre cell and -1 times the sample on each of the
    four direct neighbours.
    """
    last_row = dat.shape[0] - 1
    last_col = dat.shape[1] - 1
    for row in range(1, last_row):
        for col in range(1, last_col):
            # Centre first, then the four neighbours (each a distinct cell).
            mod[row, col] += dat[row, col] * 4
            mod[row - 1, col] -= dat[row, col]
            mod[row + 1, col] -= dat[row, col]
            mod[row, col - 1] -= dat[row, col]
            mod[row, col + 1] -= dat[row, col]

     


## Setup our inversion problem

In [None]:
from generic_solver import ProblemL2LinearReg, BasicStopper
#from generic_solver._pyLinearSolver import LCGsolver
from generic_solver import LCGsolver

# Start the inversion from a copy of the gridded data.
model=data.clone()
# Selector (data-fitting) and Laplacian (regularization) operators.
jop=Jop(model,data)
lap=Lap(model,data)
# eps is passed to ProblemL2LinearReg alongside reg_op=lap; presumably it is
# the weight of the regularization term -- confirm against the solver docs.
eps=.001
prob=ProblemL2LinearReg(model,data,jop,eps,reg_op=lap)
# Stopper configured with niter=500, handed to the LCG solver.
stop=BasicStopper(niter=500)
solve=LCGsolver(stop)
solve.run(prob)



## Plot

Take a look at the inverted model. It appears that we didn't accomplish much. Experiment with changing epsilon.

In [None]:
import holoviews as hv
hv.extension('bokeh','matplotlib')
# Display the problem's model next to the gridded input data.
Grey(prob.model)+Grey(data)

## Residuals 
Remember that we have two residuals. The first measures how well we have fit the data. The second measures how well our model
fits our covariance definition.

In [None]:
# Display the two residual components side by side -- presumably vecs[0] is
# the data-fitting residual and vecs[1] the regularization residual; confirm
# against ProblemL2LinearReg.
Grey(prob.res.vecs[0])+Grey(prob.res.vecs[1])

## Improving our boundary condition

As discussed in class, we are far from IID. So let's fill in the edge values using inverse-distance-squared weighting.

In [None]:
import holoviews as hv
import math
hv.extension('bokeh','matplotlib')
def find_val(x,y,xloc,yloc,z):
    """Weighted average of ``z`` as seen from the point (x, y).

    Each sample at (xloc, yloc) is weighted by 1 / (d2 + 0.001)**2, where
    d2 is its squared distance to (x, y); the small constant keeps the
    weight finite when the point coincides with a sample.
    """
    dx = x - xloc
    dy = y - yloc
    dist_sq = dx * dx + dy * dy + .001
    weights = 1. / (dist_sq * dist_sq)
    return np.sum(weights * z) / np.sum(weights)

# Fill the four edges of the grid with values interpolated from the scattered
# samples.  data is indexed [i2, i1]; cell (i2, i1) sits at
# (o1 + d1*i1, o2 + d2*i2).
n_rows, n_cols = data.shape[0], data.shape[1]

# Coordinates of the last column / last row.  Parenthesised so the "- 1"
# applies to the index rather than to the coordinate.
x_last = o1 + d1 * (n_cols - 1)
y_last = o2 + d2 * (n_rows - 1)

# Left and right edges.
for i2 in range(n_rows):
    y_here = o2 + d2 * i2
    data[i2, 0] = find_val(o1, y_here, xar, yar, zar)
    data[i2, n_cols - 1] = find_val(x_last, y_here, xar, yar, zar)

# Bottom and top edges (sample locations are always passed as xar, yar).
for i1 in range(n_cols):
    x_here = o1 + d1 * i1
    data[0, i1] = find_val(x_here, o2, xar, yar, zar)
    data[n_rows - 1, i1] = find_val(x_here, y_last, xar, yar, zar)

Grey(data)


In [None]:
def correlation(xar,yar,zar,d_dist,n_dist):
    """Normalised correlation of mean-removed ``zar`` as a function of distance.

    Every unordered pair of samples (i, j) is assigned to the distance bin
    ``int(dist / d_dist + .5)``; pairs whose bin is >= ``n_dist`` are
    ignored.  For each bin the function returns

        sum(z_i * z_j) / sum(z_i**2 + z_j**2)

    where z is ``zar`` with its mean removed.

    Args:
        xar, yar: sample coordinates (sequences or arrays of equal length).
        zar: sample values.
        d_dist: width of a distance bin.
        n_dist: number of distance bins.

    Returns:
        Array of length ``n_dist``.  Bins with no pairs, or whose
        normalisation is zero, are returned as 0.
    """
    xs = np.asarray(xar, dtype=float)
    ys = np.asarray(yar, dtype=float)
    zs = np.asarray(zar, dtype=float)
    zuse = zs - np.sum(zs) / len(zs)

    out = np.zeros((n_dist))
    avg = np.zeros((n_dist))

    # One vectorised pass per sample over all later samples, so each pair is
    # visited exactly once.
    for isamp in range(len(xs) - 1):
        dx = xs[isamp + 1:] - xs[isamp]
        dy = ys[isamp + 1:] - ys[isamp]
        ipt = (np.sqrt(dx * dx + dy * dy) / d_dist + .5).astype(np.int64)
        keep = ipt < n_dist
        bins = ipt[keep]
        z = zuse[isamp]
        z2 = zuse[isamp + 1:][keep]
        out += np.bincount(bins, weights=z * z2, minlength=n_dist)
        avg += np.bincount(bins, weights=z * z + z2 * z2, minlength=n_dist)

    # Guard on the denominator itself: a populated bin can still have a zero
    # normalisation (constant input), and 0/0 would turn into NaN.
    filled = avg > 0
    out[filled] = out[filled] / avg[filled]
    return out
                
# Measured correlation versus distance: 50 bins of width 600.
zcor = correlation(x, y, z, 600, 50)

# Reference curve 1/r sampled at r = 300, 900, ..., scaled to 1 at the first sample.
inv_dist = 1. / np.linspace(300, 300 + 600 * 50, 50, endpoint=False)
inv_dist = inv_dist / inv_dist[0]

# Row 0: measured correlation (scaled to 1 at the first bin); row 1: 1/r;
# row 2: row 1 squared; row 3: row 2 squared.
zcompare = np.zeros((4, 50))
zcompare[0, :] = zcor / zcor[0]
zcompare[1, :] = inv_dist
zcompare[2, :] = inv_dist * inv_dist
zcompare[3, :] = zcompare[2, :] * zcompare[2, :]





In [None]:
from sep_plot import Graph
# Plot the four rows of zcompare (measured correlation and the three
# reference curves).
Graph(zcompare)

In [None]:
# Rebuild the operators: Jop derives its 0/1 mask from the current contents
# of data, which now include the filled-in edge values.
jop=Jop(model,data)
lap=Lap(model,data)
# Restart from a zero model.
model.zero()
# eps is passed to ProblemL2LinearReg alongside reg_op=lap; presumably the
# weight of the regularization term -- confirm against the solver docs.
eps=.001
prob=ProblemL2LinearReg(model,data,jop,eps,reg_op=lap)
stop=BasicStopper(niter=500)
solve=LCGsolver(stop)
# Ask the solver to keep gradients (save_grad=True) before running.
solve.setDefaults(save_grad=True)
solve.run(prob)


In [None]:
import holoviews as hv
hv.extension('bokeh','matplotlib')
# Display the model held by the second problem.
Grey(prob.model)

In [None]:
import holoviews as hv
hv.extension('bokeh','matplotlib')
# Display the two residual components of the second problem side by side.
Grey(prob.res.vecs[0])+Grey(prob.res.vecs[1])

In [None]:
# Print the 3 x 3 corner (first three rows and columns) of the problem's model.
print(prob.get_model()[:3,:3])
