In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# read in the cleaned reflectance data
nir = pd.read_csv('data/protium_clean_reflectance_wide_appetizer.csv')
nir

Unnamed: 0,sample_id,replicate,nm350,nm351,nm352,nm353,nm354,nm355,nm356,nm357,...,nm2491,nm2492,nm2493,nm2494,nm2495,nm2496,nm2497,nm2498,nm2499,nm2500
0,1,0,0.095936,0.095160,0.096764,0.098158,0.096730,0.090671,0.085145,0.080683,...,0.285648,0.286222,0.286635,0.286392,0.285475,0.284107,0.283603,0.283534,0.284494,0.285283
1,1,1,0.100150,0.093596,0.093527,0.096819,0.097956,0.090741,0.089370,0.089669,...,0.293510,0.293770,0.293864,0.293190,0.292318,0.292997,0.293776,0.293843,0.294670,0.293290
2,1,2,0.099995,0.093401,0.094291,0.096501,0.092610,0.082529,0.083838,0.089398,...,0.276884,0.276790,0.276742,0.277109,0.277870,0.277979,0.277545,0.276995,0.277444,0.278641
3,1,3,0.099269,0.101386,0.098789,0.095361,0.095192,0.095691,0.093337,0.088454,...,0.295513,0.296246,0.296642,0.295863,0.295071,0.293896,0.293433,0.293477,0.293186,0.293818
4,1,4,0.099456,0.097670,0.100095,0.102767,0.101624,0.095701,0.092393,0.090349,...,0.292327,0.291852,0.292317,0.291674,0.291652,0.292501,0.292882,0.293344,0.292980,0.291234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,439,1,0.111290,0.110201,0.111329,0.111971,0.109915,0.103760,0.104118,0.106879,...,0.292971,0.292332,0.291154,0.291293,0.292642,0.293760,0.294280,0.294086,0.292911,0.292344
622,439,2,0.102713,0.106718,0.108143,0.104928,0.099531,0.099442,0.102752,0.101398,...,0.268913,0.269117,0.269263,0.268618,0.269190,0.270471,0.269897,0.269843,0.270209,0.268518
623,439,3,0.093962,0.096776,0.101298,0.102899,0.099576,0.092241,0.091143,0.093274,...,0.279936,0.280397,0.281067,0.280713,0.279246,0.278030,0.276957,0.277744,0.279022,0.279468
624,439,4,0.110332,0.108473,0.107579,0.110668,0.116691,0.105322,0.101672,0.102578,...,0.267410,0.267189,0.267890,0.268173,0.266904,0.265962,0.266012,0.266115,0.266023,0.266384


In [None]:
# read in the metadata on each sample
meta = pd.read_csv('data/subserratum_NIR_project.csv')
meta

Unnamed: 0,ViewSpecFile Folder,DataFolder,Sample number,scanning notes,collector and no,species,population/lineage,geographic location,Unnamed: 8,GPS?,included in JBio?,included in Misiewicz et al. 2023?,included by Ben Sims,damasco DDRAD
0,Protium1,"""March20""",,,,subserratum,A/C,,,,,,,
1,Protium2,"""March20""",,,,subserratum,A/C,,,,,,,
2,Protium3,"""March20""",,,,subserratum,A/C,,,,,,,
3,Protium4,"""March20""",,,,subserratum,A/C,,,,,,,
4,Protium5,"""March20""",,,,subserratum,A/C,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,Protium432,"""May7""",3754435,,Sanchez 1398,subserratum,morphotype 4,Colombia: Araracuara,,,,,,
399,Protium321,"""May7""",3754436,,Londono 908,subserratum,morphotype 4,Colombia: Araracuara,,,,,,
400,Protium429,"""May7""",3754437,,Londono 1407,subserratum,morphotype 4,Colombia: Araracuara,,,,,,
401,Protium419,"""May7""",3754438,,Sanchez 1660,subserratum,morphotype 4,Colombia: Araracuara,,,,,,


In [None]:
meta.columns

Index(['ViewSpecFile Folder', 'DataFolder', 'Sample number', 'scanning notes',
       'collector and no', 'species', 'population/lineage',
       'geographic location', 'Unnamed: 8', 'GPS?', 'included in JBio?',
       'included in Misiewicz et al. 2023?', 'included by Ben Sims',
       'damasco DDRAD'],
      dtype='object')

In [None]:
species = meta[['ViewSpecFile Folder', 'species']]
species.columns = ['sample_raw', 'species']
species

Unnamed: 0,sample_raw,species
0,Protium1,subserratum
1,Protium2,subserratum
2,Protium3,subserratum
3,Protium4,subserratum
4,Protium5,subserratum
...,...,...
398,Protium432,subserratum
399,Protium321,subserratum
400,Protium429,subserratum
401,Protium419,subserratum


In [None]:
# separate out sample and replicate ids
species['sample_id'] = species['sample_raw'].str.split('Protium', expand = True)[1]
species['sample_id'] = pd.to_numeric(species['sample_id'])
species = species.drop('sample_raw', axis = 1)
print(species.dtypes)
species

species      object
sample_id     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  species['sample_id'] = species['sample_raw'].str.split('Protium', expand = True)[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  species['sample_id'] = pd.to_numeric(species['sample_id'])


Unnamed: 0,species,sample_id
0,subserratum,1
1,subserratum,2
2,subserratum,3
3,subserratum,4
4,subserratum,5
...,...,...
398,subserratum,432
399,subserratum,321
400,subserratum,429
401,subserratum,419


In [None]:
# get average wavelength per sample
nir_mean = nir.drop(['replicate'], axis = 1).groupby('sample_id').mean()
nir_mean.reset_index(inplace = True, col_level = 1)

# merge on species id
nir_mean = nir_mean.merge(species, how = 'left', on = 'sample_id')

# drop the 'mystery' sample for now
nir_mean = nir_mean.loc[nir_mean['sample_id'] != 439,:]

nir_mean

Unnamed: 0,sample_id,nm350,nm351,nm352,nm353,nm354,nm355,nm356,nm357,nm358,...,nm2492,nm2493,nm2494,nm2495,nm2496,nm2497,nm2498,nm2499,nm2500,species
0,1,0.098187,0.096495,0.095768,0.095777,0.095182,0.089563,0.087302,0.086609,0.082278,...,0.29241,0.292447,0.291987,0.291596,0.291617,0.291676,0.291825,0.292225,0.291901,subserratum
1,2,0.080885,0.07979,0.081745,0.08368,0.082312,0.076836,0.074753,0.074398,0.070823,...,0.309495,0.309405,0.309154,0.308365,0.308007,0.308309,0.308463,0.308815,0.30874,subserratum
2,3,0.101479,0.099994,0.098727,0.09807,0.09729,0.091149,0.088376,0.088203,0.086045,...,0.296007,0.296117,0.295819,0.29537,0.295241,0.295227,0.295498,0.295827,0.295372,subserratum
3,4,0.084137,0.083626,0.083653,0.08473,0.08565,0.079651,0.076799,0.076525,0.073969,...,0.292284,0.292181,0.291659,0.291117,0.291104,0.291334,0.29172,0.292151,0.292156,subserratum
4,5,0.097624,0.095601,0.09652,0.097822,0.096242,0.088954,0.087053,0.087874,0.083781,...,0.320874,0.320617,0.320207,0.319789,0.31964,0.319802,0.319994,0.320379,0.320633,subserratum
5,6,0.082617,0.08124,0.082334,0.083942,0.083446,0.078302,0.075403,0.074518,0.072534,...,0.300406,0.300001,0.299552,0.299552,0.2996,0.299883,0.30005,0.299996,0.299548,subserratum
6,7,0.073839,0.074997,0.077841,0.079592,0.078133,0.072633,0.070452,0.070313,0.067748,...,0.265384,0.265283,0.265001,0.264479,0.264282,0.264291,0.264397,0.264486,0.26431,subserratum
7,8,0.076696,0.076666,0.077288,0.077907,0.077661,0.07412,0.072554,0.07113,0.066119,...,0.267213,0.266931,0.266821,0.266464,0.2662,0.266059,0.265888,0.265984,0.266085,subserratum
8,9,0.093115,0.091907,0.093141,0.093513,0.090418,0.085505,0.084047,0.083957,0.080714,...,0.311662,0.31156,0.311207,0.310612,0.310509,0.310655,0.310719,0.310937,0.31089,subserratum
9,10,0.087874,0.086514,0.086991,0.087604,0.085902,0.079177,0.077757,0.078978,0.075364,...,0.268537,0.268736,0.268256,0.267796,0.267529,0.26724,0.267142,0.267337,0.267511,subserratum


In [None]:
# set up train and test data for the LDA
X = nir_mean.copy()

X.index = X['sample_id']

X


Unnamed: 0_level_0,sample_id,nm350,nm351,nm352,nm353,nm354,nm355,nm356,nm357,nm358,...,nm2492,nm2493,nm2494,nm2495,nm2496,nm2497,nm2498,nm2499,nm2500,species
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0.098187,0.096495,0.095768,0.095777,0.095182,0.089563,0.087302,0.086609,0.082278,...,0.29241,0.292447,0.291987,0.291596,0.291617,0.291676,0.291825,0.292225,0.291901,subserratum
2,2,0.080885,0.07979,0.081745,0.08368,0.082312,0.076836,0.074753,0.074398,0.070823,...,0.309495,0.309405,0.309154,0.308365,0.308007,0.308309,0.308463,0.308815,0.30874,subserratum
3,3,0.101479,0.099994,0.098727,0.09807,0.09729,0.091149,0.088376,0.088203,0.086045,...,0.296007,0.296117,0.295819,0.29537,0.295241,0.295227,0.295498,0.295827,0.295372,subserratum
4,4,0.084137,0.083626,0.083653,0.08473,0.08565,0.079651,0.076799,0.076525,0.073969,...,0.292284,0.292181,0.291659,0.291117,0.291104,0.291334,0.29172,0.292151,0.292156,subserratum
5,5,0.097624,0.095601,0.09652,0.097822,0.096242,0.088954,0.087053,0.087874,0.083781,...,0.320874,0.320617,0.320207,0.319789,0.31964,0.319802,0.319994,0.320379,0.320633,subserratum
6,6,0.082617,0.08124,0.082334,0.083942,0.083446,0.078302,0.075403,0.074518,0.072534,...,0.300406,0.300001,0.299552,0.299552,0.2996,0.299883,0.30005,0.299996,0.299548,subserratum
7,7,0.073839,0.074997,0.077841,0.079592,0.078133,0.072633,0.070452,0.070313,0.067748,...,0.265384,0.265283,0.265001,0.264479,0.264282,0.264291,0.264397,0.264486,0.26431,subserratum
8,8,0.076696,0.076666,0.077288,0.077907,0.077661,0.07412,0.072554,0.07113,0.066119,...,0.267213,0.266931,0.266821,0.266464,0.2662,0.266059,0.265888,0.265984,0.266085,subserratum
9,9,0.093115,0.091907,0.093141,0.093513,0.090418,0.085505,0.084047,0.083957,0.080714,...,0.311662,0.31156,0.311207,0.310612,0.310509,0.310655,0.310719,0.310937,0.31089,subserratum
10,10,0.087874,0.086514,0.086991,0.087604,0.085902,0.079177,0.077757,0.078978,0.075364,...,0.268537,0.268736,0.268256,0.267796,0.267529,0.26724,0.267142,0.267337,0.267511,subserratum
