In [68]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import csv
from datetime import date, datetime
import datetime
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import TimeSeriesKMeans
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FastICA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler
from scipy.spatial.distance import euclidean
from sklearn.feature_selection import f_regression
from statsmodels.tsa.stattools import adfuller
from sklearn.manifold import TSNE
from DBCV import DBCV
import hdbscan
import csv
import multiprocessing
import itertools
import operator
import math
from dtaidistance import dtw
import seaborn as sns
import elevation
import json
plt.style.use('fivethirtyeight')
from osgeo import gdal 
from subprocess import Popen
import simplekml
import copy
from sklearn.cluster import DBSCAN
import numpy.ma as ma

In [116]:
class Data():
    
    def __init__(self, data_path, filename1, filename2, pc=0.3):
        self.load(data_path, filename1, filename2, pc)
        
    def set_latitudes(self, latitudes):
        self.latitudes = latitudes
        
    def set_longitudes(self, longitudes):
        self.longitudes = longitudes
        
    def set_topo(self, topo):
        self.topo = topo
        
    def set_ns_mean_velocities(self, velocities):
        self.ns_mean_velocities = velocities
        
    def set_ew_mean_velocities(self, velocities):
        self.ew_mean_velocities = velocities
    
    def set_dates(self, dates):
        self.dates = dates
    
    def set_ns_displacements(self, ns_displacements):
        self.ns_displacements = ns_displacements
        
    def set_ew_displacements(self, ew_displacements):
        self.ew_displacements = ew_displacements
        
    def load(self,  data_path, filename1, filename2, pc):
        ns_displacements, ew_displacements, booleans = [], [], []
        ns_infos, ns = self.load_component(data_path, filename1)
        ew_infos, ew = self.load_component(data_path, filename2)
        m = len(ns[0])
        
        for n, components in enumerate(zip(ns, ew)):
            if ns[n].isnull().sum().sum() / m < pc:
                ns_displacements.append(components[0].interpolate(limit_direction='both', inplace=False)['displacement'].values)
                ew_displacements.append(components[1].interpolate(limit_direction='both', inplace=False)['displacement'].values)
                booleans.append(False)
            else:
                booleans.append(True)
            
        self.set_latitudes(ma.array(ns_infos['Lat'].values, mask = booleans).compressed())
        self.set_longitudes(ma.array(ns_infos['Lon'].values, mask = booleans).compressed())
        self.set_topo(ma.array(ns_infos['Topo'].values, mask = booleans).compressed())
        self.set_ns_mean_velocities(ma.array(ns_infos['Vel'].values, mask = booleans).compressed())
        self.set_ew_mean_velocities(ma.array(ew_infos['Vel'].values, mask = booleans).compressed())
        self.set_ns_displacements(np.array(ns_displacements))
        self.set_ew_displacements(np.array(ew_displacements))
        self.set_dates(ns[0].index)
        
    #np.count_nonzero(np.isnan(data))
    def load_image_correlation(self, data_path, ns_fi, ew_filename):
        df_ns, df_ns_ts = self.load_component(data_path, ns_filename)
        df_ew, df_ew_ts = self.load_component(data_path, ew_filename)
        df_ew.rename(columns={'Vel': 'Vel_ew'}, inplace=True)
        df_ns.rename(columns={'Vel': 'Vel_ns'}, inplace=True)
        geo = pd.concat([df_ew[['id', 'Lat','Lon','Topo','Vel_ew']], df_ns[['Vel_ns']]], axis=1)
        return geo, df_ns_ts, df_ew_ts
        
    def load_component(self, data_path, filename):
        
        # numéro de la ligne ou commence les données
        num_start = 44
        # numéro de la ligne ou se trouve la liste des dates
        num_list_dates = 40
        # attributs présent dans les données
        columns = ['id', 'Lat','Lon', 'Topo', 'Vel', 'Coer',' CosN', 'CosE', 'CosU']
        # dictionnaire stockant les données
        data = {column: [] for column in columns}
        # liste des dates 
        indexes = []
        # series temporelles
        series = []
        # liste de dataframes
        df_series = []

        with open(data_path + '/' + filename) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=",")
            line_count = 1 
            for row in csv_reader:
                if line_count == num_list_dates:
                    indexes = [row[0].split(' ')[1]] + row[1:]
                if line_count >= num_start:
                    # extraction des premiers attributs
                    for i in range(len(columns)):
                        data[columns[i]].append(row[i])
                    # extraction de l'attribut TS(série temporelle)
                    series.append([float(v) for v in row[len(columns):]])
                line_count  += 1
            if len(indexes) != len(series[0]):
                print('Erreur : Les indexes et les valeurs ne correspondent pas')
            # convertir les index en date
            indexes = [d.strip()[0:8] for d in indexes]
            # créer une liste de dataframes, chacun contenant une série temporelle
            for serie in series:
                tmp_serie = pd.DataFrame({'displacement': pd.Series(serie, index=pd.DatetimeIndex(indexes))})
                tmp_serie.sort_index(inplace=True)
                df_series.append(tmp_serie)
            # creer un dataframe pour les autres attributs
            df = pd.DataFrame(data)
            for column in df.columns:
                df[column] = pd.to_numeric(df[column], errors='coerce')
            df.set_index('id')
            
        return df, df_series

In [None]:
class TimeSerie():
    
    def __init__(self):
        pass
    
    def has_null_values(self, serie):
        return self.count_null_values(serie) > 0
    
    def count_null_values(self, serie):
        return serie.isnull().sum().sum()
    
    def compute_null_val_percentage(self, serie):
        return 100 *(1.0 * self.count_null_values(serie) / len(serie))
    
    def interpolate(self, serie):
        return serie.interpolate(limit_direction='both', inplace=False)
    
    def compute_pearson_coef(self, serie):
        return stats.pearsonr(np.squeeze(serie.values), get_days(serie.index))
    
    def compute_linear_reg_pval(self, serie):
        # extraire X et y
        X, y = self.prepare(serie)
        # calculer la p-value de la regression lineaire
        _, pval = f_regression(X,y.ravel())
        return  pval[0]
        
    def select(self, serie, filename, ref, min_slope, alpha, sigma, ampl, pc):
        slope   = self.get_slope_value(ref, serie, filename)
        p_value = self.get_linear_reg_pval(serie, alpha)
        vlm = self.vlm
        # filtrage des series avec peu de valeurs
        if self.compute_nul_val_percentage > pc:
            return False
        # filtrage des regressions non significatives
        if p_value > alpha:
            return False
        # filtrage des vitesses faibles
        if abs(vlm) < ampl* sigma:
            return False
         # filtrage des pentes faibles
        if slope < min_slope:
            return False
        # sauvegarder l'état du pixel
        self.set_selected()
        return True
    
    # la copie renvoie bien un nouvel objet, il n'y a pas d'effets de bord
    def smooth(self, s, ampl):
        serie = s.copy()
        std = math.sqrt(serie.var())
        for i in range(len(serie)):
            if abs(serie.iloc[i].displacement) > ampl*std:
                serie.iloc[i, serie.columns.get_loc('displacement')]= np.nan
        return serie.interpolate(limit_direction='both', inplace=False)
    
        
    def deepcopy(self, serie):
        clone = copy.deepcopy(self)
        clone.set_serie(serie)
        return clone
        
    def compute_adfuller(self, serie):
        adf_result = adfuller(serie)
        adf_output = pd.Series(adf_result[0:4],index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
        for key, value in adf_result[4].items():
            adf_output['Critical Value (%s)'%(key)] = value
        return adf_output[1]
    
    # ce test nécessite d'avoir des données regulièrement échantillonnées
    # Hyopthèse nulle : il existe une racine unitaire (série croissante ou cyclique)
    # Hypthèse alternative : il n'existe pas de racine unitaire (série stationnaire)
    # Si la p-valeur du test est inférireure à O.05, on rejette l'hypothèse nulle et la sériee est stationnaire
    # NB: on recherche des signaux non stationnaires 
    def is_stationary(self, serie, freq='D', alpha=0.05):
        resampled = serie.resample(freq)
        interpolated = upsampled.interpolate(method='linear')
        return self.compute_adfuller(interpolated) < alpha
    
    def get_days(self, serie):
        days = []
        dates = serie.index
        for i in range(len(dates)):
            days.append(abs((dates[0] - dates[i]).days ))
        return days
    
    def prepare(self, serie):
        # transformer les index en durée pour pouvoire effectuer une regression linéaire
        X = np.array([abs((serie.index[0] - serie.index[n]).days) for n in range(len(serie.index))]).reshape(-1,1)
        # extraire la cible
        y = StandardScaler().fit_transform(serie)
        return X, y
    
    # that functions gives approximately the same result when use sklearn linear regression
    # x and y and numpy array
    def compute_slope(self, serie):
        x, y = self.prepare(serie)
        return np.cov(x.T, y.T)[0][1] / np.var(x)
    
    def compute_inst_vel(self, serie):
        vels =  []
        for i in range(1, len(serie)-1):
            duration = (serie.index[i+1] - serie.index[i-1]).days
            displacement = serie.iloc[i+1].values[0] - serie.iloc[i-1].values[0]
            vels.append(displacement / duration)
        return pd.DataFrame(vels, index=serie.index[1:-1], columns=['vel'])
        
    def compute_diff_vect(self, serie):
        disp     = np.diff(np.squeeze(serie.values))
        duration = np.diff(np.squeeze(serie.index)) /  np.timedelta64(1,'D')
        return disp, np.cumsum(duration)

In [None]:
class TimeSerieProcessing():
    def __init__(self):
        
    def is_linear_reg_significant(self):
        pass

In [126]:
class ImageCorrelation():
    
    def __init__(sel, data, pc=0.4, alpha=0.05, ref='wgs84'):
        self.data = data
        self.alpha = alpha
        self.ref = ref
        self.pc = pc
        self.velocities = None
        self.mask = None
    
    def reshape(self):
        pass 
    
    def compute_std_velocities():
        pass 
    
    def compute_mean_velocity(self, n)
        ns_vel = self.data.ns_mean_velocities[n]
        ew_vel = self.data.ew_mean_velocities[n]
        return np.sqrt(ns_vel * ns_vel + ew_vel * ew_vel)
    
    def is_moving(self, n):
        pass
    
    def is_linear_regression_significant(self):
        pass
    
    def is_steep():
        pass
    
    def compute_slope(self):
        pass
    
    def is_to_select(self):
        pass
        
        
    def compute_velocities():
        pass
    
    def remove():
        pass
    
    def set_velocities(self):
        pass
    
    # amplitude, alpha, min_slope
    def set_filter_parameters(params):
        pass

SyntaxError: invalid syntax (<ipython-input-126-e30ad02c959b>, line 17)

In [8]:
class Clustering():
    
    def __init__(self, data, velocities, mask, option=0):
        self.data = data
        self.velocities = velocities
        self.mask = mask
        self.option = option
        
    # stocker les résultats du clustering
    def generate_kml_file(self):
        pass

In [98]:
DATA_PATH = './donnees' 
filename1  = 'MM_TIO_NS_31TGK_20151227_to_20200906.csv'
filename2  = 'MM_TIO_EW_31TGK_20151227_to_20200906.csv'

In [117]:
data = Data(DATA_PATH, filename1, filename2)

In [127]:
data.__dict__

{'latitudes': array([6.627922, 6.628048, 6.628173, ..., 6.662486, 6.662611, 6.662736]),
 'longitudes': array([44.422007, 44.422003, 44.421999, ..., 44.394165, 44.394161,
        44.394157]),
 'topo': array([1710.  , 1714.93, 1720.07, ..., 1280.52, 1277.93, 1275.  ]),
 'ns_mean_velocities': array([-0.0009, -0.0009, -0.001 , ..., -0.    , -0.    , -0.    ]),
 'ew_mean_velocities': array([-0.0003, -0.0003, -0.0003, ..., -0.0025, -0.0025, -0.0025]),
 'ns_displacements': array([[ 0.    , -1.4829, -1.1083, ..., -1.6207, -0.747 , -1.7681],
        [ 0.    , -1.5048, -1.135 , ..., -1.6455, -0.7691, -1.8022],
        [ 0.    , -1.471 , -1.1931, ..., -1.7787, -0.8391, -1.9032],
        ...,
        [ 0.    , -0.4363,  2.7005, ..., -3.4003,  4.2267, -3.3809],
        [ 0.    , -0.4398,  2.6997, ..., -3.4046,  4.2232, -3.3834],
        [ 0.    , -0.4432,  2.6989, ..., -3.4085,  4.2196, -3.3856]]),
 'ew_displacements': array([[ 0.0000e+00, -6.3500e-02, -6.6910e-01, ..., -4.0860e-01,
          2.020

In [179]:
duration = np.diff(data.dates) / np.timedelta64(1,'D')

In [207]:
datetime.now()

datetime.datetime(2021, 7, 1, 14, 54, 28, 780746)

In [240]:
duration = np.insert(np.cumsum( np.diff(data.dates) /  np.timedelta64(1,'D') ), 0, 0, axis=0)

In [241]:
duration

array([   0.,  120.,  180.,  220.,  230.,  240.,  250.,  270.,  420.,
        440.,  450.,  470.,  540.,  555.,  565.,  580.,  600.,  605.,
        620.,  650.,  655.,  660.,  670.,  690.,  695.,  700.,  710.,
        725.,  775.,  850.,  880.,  905.,  920.,  925.,  935.,  940.,
        945.,  970.,  975.,  990., 1005., 1010., 1030., 1055., 1080.,
       1100., 1105., 1115., 1120., 1125., 1145., 1155., 1190., 1220.,
       1235., 1250., 1255., 1270., 1280., 1285., 1300., 1310., 1315.,
       1320., 1330., 1355., 1360., 1370., 1380., 1465., 1470., 1505.,
       1510., 1520., 1540., 1560., 1565., 1570., 1580., 1585., 1590.,
       1640., 1670., 1685., 1705., 1710., 1715.])

In [262]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def is_mean_velocity_significant(displacements, days):
    # extraire X et y
    X = StandardScaler().fit_transform(displacements.reshape(-1,1))
    # calculer la p-value de la regression lineaire
    _, pval = f_regression(X, days)
    return  pval[0]

In [278]:
def test(x,y):
    return x > y
L = np.array([2,3,9,10,11])
test(L,y=8)

array([False, False,  True,  True,  True])

In [231]:
def get_days(index):
    days = []
    dates = index
    for i in range(len(dates)):
        days.append(abs((dates[0] - dates[i]).days ))
    return days

np.array(get_days(data.dates))

array([   0,  120,  180,  220,  230,  240,  250,  270,  420,  440,  450,
        470,  540,  555,  565,  580,  600,  605,  620,  650,  655,  660,
        670,  690,  695,  700,  710,  725,  775,  850,  880,  905,  920,
        925,  935,  940,  945,  970,  975,  990, 1005, 1010, 1030, 1055,
       1080, 1100, 1105, 1115, 1120, 1125, 1145, 1155, 1190, 1220, 1235,
       1250, 1255, 1270, 1280, 1285, 1300, 1310, 1315, 1320, 1330, 1355,
       1360, 1370, 1380, 1465, 1470, 1505, 1510, 1520, 1540, 1560, 1565,
       1570, 1580, 1585, 1590, 1640, 1670, 1685, 1705, 1710, 1715])

In [143]:
from sklearn.preprocessing import StandardScaler

In [149]:
def reshape_(ns_displacements, ew_displacements):
    data = []
    for n in range(len(ns_displacements)):
        data.append(np.vstack((ns_displacements[n], ns_displacements[n])).T)
    return np.array(data)


def apply_pca(data):
    scaler = StandardScaler()
    return scaler.fit_transform(data)

In [148]:
D = reshape_(data.ns_displacements, data.ew_displacements)

In [154]:
def reshape2(data):
    n = len(data[0])
    output = []
    for d in data:
        output.append(StandardScaler().fit_transform(d).reshape(n))
    return output

In [157]:
data.ns_displacements.shape

(87016, 87)

In [158]:
x = PCA().fit_transform(data.ns_displacements)

array([[-9.87062760e+00,  9.04401016e+00,  1.51967271e+00, ...,
        -1.13453654e-01,  3.83985465e-02,  5.47749049e-13],
       [-1.00801857e+01,  9.00324727e+00,  1.59106049e+00, ...,
        -1.12404092e-01,  3.08430930e-02, -7.98116546e-17],
       [-1.05044376e+01,  8.74962331e+00,  2.03510869e+00, ...,
        -4.69711248e-02, -3.39395614e-03, -7.58570495e-17],
       ...,
       [ 5.01994326e-01,  3.38188066e-01, -1.62567715e+01, ...,
         2.51138342e-02,  2.78921216e-02,  1.14994409e-17],
       [ 4.71899665e-01,  3.12768450e-01, -1.62775561e+01, ...,
         2.43849140e-02,  2.65606308e-02,  1.14430892e-17],
       [ 4.42299979e-01,  2.87223222e-01, -1.62976321e+01, ...,
         2.37557255e-02,  2.54026535e-02,  1.13854924e-17]])

In [163]:
type(data.dates)

pandas.core.indexes.datetimes.DatetimeIndex