# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os
import pickle

import utm

In [2]:
import prepostprocessing.cleaning as cleaning
import prepostprocessing.pre_processing as preproc

In [3]:
# Load jupyter extension to reload packages before executing user code.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
# Reload all packages (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

## Mineralogy

In [None]:
# NOTE(review): placeholder cell - no workbook path is passed, and read_excel needs its
# `io` argument, so this cannot run as written. TODO: supply the mineralogy file path.
mineralogy = pd.read_excel()

### Cleaning
* Replace zero values
* Normalize

### Centred log-ratio (clr) transformation

### Principal Component Analysis (PCA)

____

## Coordinates

In [52]:
# Load only the first four spreadsheet columns: the row index plus the
# "Latitude", "Longitude" and "past_mer" columns.
coordinates = pd.read_excel(
    "../_DATA/full_with_coordinates.xlsx",
    usecols=[0, 1, 2, 3],
    index_col=0,
)

In [53]:
# Preview the raw frame: latitude and longitude are still degree/minute/second strings here.
coordinates

Unnamed: 0,Latitude,Longitude,past_mer
0,"44°31'30.0""","138°37'30.0""",
1,"54°12'10.0""","119°24'0.0""",
2,"62°36'0.0""","155°36'0.0""",
3,"61°35'0.0""","146°2'0.0""",
4,"68°55'0.0""","164°24'0.0""",
...,...,...,...
4654,"66°42'0.0""","164°23'0.0""",
4655,"46°56'30.0""","137°5'3.0""",
4656,"58°12'0.0""","138°12'0.0""",
4657,"60°51'0.0""","147°31'0.0""",


In [83]:
# Delete minus signs in the "Longitude" column so that dms2dec can parse the strings.
# regex=False states the literal match explicitly: the default of `regex` differs
# between pandas versions (True before 2.0, False from 2.0 on).
# NOTE(review): this discards the sign; the hemisphere is only re-attached further down for
# rows whose "past_mer" contains "W"/"w" - confirm no negative longitude lacks that flag.
coordinates["Longitude"] = coordinates["Longitude"].str.replace("-", "", regex=False)

In [110]:
# Append a "W" suffix to the longitude of every row whose "past_mer" entry mentions W/w.
# Done with a boolean mask instead of iterrows(); values that already end in "W" are
# skipped so that re-running this cell does not append the suffix a second time.
is_west = coordinates["past_mer"].astype(str).str.contains("w", case=False, regex=False)
needs_suffix = is_west & ~coordinates["Longitude"].astype(str).str.endswith("W")
coordinates.loc[needs_suffix, "Longitude"] = coordinates.loc[needs_suffix, "Longitude"] + "W"

# Number of rows flagged as western hemisphere (used by the sanity check that follows).
sum_ = int(is_west.sum())

In [111]:
# Check that every non-empty "past_mer" entry was recognised as a "W"/"w" flag.
# notna().sum() counts the non-null entries directly; int(value_counts()) only worked
# while the column held exactly one distinct value.
n_flagged = int(coordinates["past_mer"].notna().sum())
assert sum_ == n_flagged, (
    f"{sum_} longitudes were marked W, but {n_flagged} rows have a past_mer entry"
)

### Convert from degrees-minutes-seconds (DMS) to decimal degrees

In [112]:
# Run cleaning.dms2dec over both string columns: "Latitude" feeds "Y", "Longitude" feeds "X".
for decimal_col, dms_col in (("Y", "Latitude"), ("X", "Longitude")):
    coordinates[decimal_col] = coordinates[dms_col].apply(cleaning.dms2dec)

In [114]:
# Spot check: converted longitude of the row labelled 42.
coordinates.at[42, "X"]

-178.83333333333334

### Convert to UTM coordinates

In [131]:
def _latlon_row_to_utm(row):
    """Pass one row's "Y" (latitude) and "X" (longitude) values to utm.from_latlon."""
    return utm.from_latlon(row["Y"], row["X"])


# Row-wise projection; every element of the result is whatever utm.from_latlon returned.
coordinates_utm = coordinates.apply(_latlon_row_to_utm, axis=1)

In [132]:
# Expand the per-row utm.from_latlon results into one column per returned value.
# Building the frame once from the list of results avoids creating a pd.Series
# for every single row, which is what .apply(pd.Series) does.
coordinates_utm = pd.DataFrame(coordinates_utm.tolist(), index=coordinates_utm.index)

In [133]:
# Name the four expanded columns, in the positional order of the expanded values.
coordinates_utm.columns = [
    "X",
    "Y",
    "ZoneNumber",
    "ZoneLetter",
]

In [134]:
# Preview the projected coordinates together with their zone number and zone letter.
coordinates_utm

Unnamed: 0,X,Y,ZoneNumber,ZoneLetter
0,311272.566098,4.932930e+06,54,T
1,656538.925786,6.008743e+06,50,U
2,633468.918083,6.943713e+06,56,V
3,448679.859182,6.828145e+06,55,V
4,475912.972942,7.645188e+06,58,W
...,...,...,...,...
4654,472781.240413,7.398072e+06,58,W
4655,658620.016371,5.200790e+06,53,T
4656,335447.154681,6.454395e+06,54,V
4657,528075.823175,6.746190e+06,55,V


In [136]:
# Count the samples per UTM zone number.
# NOTE(review): the samples fall into many different zones, so the X/Y values are presumably
# expressed relative to different zone origins - confirm this is acceptable downstream.
coordinates_utm["ZoneNumber"].value_counts()

53    971
49    802
50    689
54    655
55    279
56    232
58    174
60    151
1     139
59    133
48    132
57    130
52     90
2      40
51     39
47      3
Name: ZoneNumber, dtype: int64

In [137]:
# Count the samples per UTM zone letter.
coordinates_utm["ZoneLetter"].value_counts()

U    1875
W    1117
T     878
V     758
X      31
Name: ZoneLetter, dtype: int64

____

## Metadata

In [62]:
# Metadata is every workbook column except the three coordinate-related ones.
metadata = (
    pd.read_excel("../_DATA/full_with_coordinates.xlsx", index_col=0)
    .drop(columns=["Latitude", "Longitude", "past_mer"])
)

In [63]:
# Preview the descriptive columns that remain after dropping the coordinates.
metadata

Unnamed: 0,type_granite,time,massif,sampler,others,sampler+year
0,Granite leucogranitic,K2,,,,"V.N.Musin,1970"
1,Granite leucocratic,Tr,,V.I.Zhigalova,,"E.A.Ivanov,1969"
2,Granite leucocratic coarse-grained,K2,Omsukchan massif,P.M.Bosek,,"O.S.Gracheva,1948"
3,Granite leucocratic,K1,Buksandzhin massif,A.Kh.Brovtman,,"A.F.Mikhaylov,1948"
4,Granite-porphyry micropegmatitic,K1,Attykveem massif,L.G.Semenova,0th.:S-0.16,"A.I.Sadovsky,1963"
...,...,...,...,...,...,...
4654,Diorite,K1,Egdegkych massif,,Oth.:co2-0.12,"V.A.lgnat'ev,1964"
4655,Quartz diorite,K2,Verkhneplotnikovsky massif,,,"A.A.Syas'ko,1969"
4656,Diorite,J3,,,"Oth.:co2-0.02,so3-0.0l","N.N.Remizov,1967"
4657,Diorite,K2,,,,"A.P.Osipov,1966"


## Saving the data

In [None]:
# Save data as pickle files to use them in later notebooks
# NOTE(review): this cell is still a template - `mineralogy_clean` is not defined in the
# visible part of the notebook, and the remaining calls pass no object at all.
# The signature of preproc.save_obj is not visible here; TODO fill in the arguments
# (object to save and, presumably, a target name) before running.
preproc.save_obj(mineralogy_clean) # mineralogy
preproc.save_obj() # mineralogy clr
preproc.save_obj() # mineralogy pca info
preproc.save_obj() # mineralogy pca scores
preproc.save_obj() # coordinates latlon
preproc.save_obj() # coordinates utm
preproc.save_obj() # metadata

____