In [1]:
#import libraries
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from statsmodels.stats.diagnostic import lilliefors

In [None]:
# `lillifors` is a deprecated misspelling that newer statsmodels releases no
# longer export; import the correctly spelled function and keep the old name
# bound so cells that still call `lillifors(...)` keep working.
from statsmodels.stats.diagnostic import lilliefors
lillifors = lilliefors

In [None]:
# Load the 'DANE KWIATÓW' sheet of the measurements workbook.
# `delimiter` is a CSV-reader option that read_excel does not take (current
# pandas raises TypeError on it), so it is dropped. `decimal=","` is kept so
# text cells written with a decimal comma are still parsed as numbers
# (read_excel accepts `decimal` in pandas >= 1.4).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_excel("/Users/FeatureForest/Documents/Spektra list gatunków Fritillaria.xlsx", sheet_name='DANE KWIATÓW', decimal=",")

In [None]:
# Preview the first rows of the freshly loaded sheet.
data.head()

In [None]:
# Returns the pollinator group code for a species name
def pollinator(name):
    """Map a species name to its pollinator group code.

    Returns "PAS" for 'F. eduardii' / 'F. imperialis', "HUM" for
    'F. recurva' / 'F. gentneri', and "INS" for any other value.
    """
    if name in ('F. eduardii', 'F. imperialis'):
        return "PAS"
    if name in ("F. recurva", "F. gentneri"):
        return "HUM"
    return "INS"

In [None]:
# Preview the first rows of `data` again (same call as the earlier preview cell).
data.head()

In [None]:
# Column dtypes and non-null counts of the raw sheet.
data.info()

In [None]:
# Distinct values of the `Gatunek` column (the names later passed to pollinator()).
data.Gatunek.unique()

In [None]:
#data["płatek"] = data.płatek.astype(float)


In [None]:
# Number of non-null `płatek` values recorded for each `Gatunek`.
data.groupby('Gatunek', as_index=False).agg({'płatek': 'count'})

In [None]:
# One row per `Gatunek`, holding the mean `Średnica` and the mean `płatek`.
df1 = data.groupby('Gatunek', as_index=False).agg({'Średnica': 'mean', 'płatek': 'mean'})


In [None]:
# Keep only the rows where every column has a value.
df1 = df1.dropna()

In [None]:
# Tag every row with its pollinator group code ("PAS" / "HUM" / "INS").
df1['Pollinator'] = df1['Gatunek'].map(pollinator)

In [None]:
# Show the per-Gatunek means together with the new Pollinator column.
df1

In [None]:
# Dtypes and non-null counts of the aggregated frame.
df1.info()

In [None]:
# Mean Średnica values split by pollinator group.
# df1 has one row per Gatunek and pollinator() maps only two names each to
# "PAS" and "HUM", so those two series hold at most two values.
ins_srednica = df1.loc[df1['Pollinator'] == "INS", 'Średnica']
pas_srednica = df1.loc[df1['Pollinator'] == "PAS", 'Średnica']
hum_srednica = df1.loc[df1['Pollinator'] == "HUM", 'Średnica']

### Testing normality of Średnica (INS group)

In [None]:
# Normality checks on the Średnica values of the "INS" group (ins_srednica).
# The p-values are collected in a float Series; the explicit dtype avoids the
# object-dtype default (and the warning on older pandas) of a bare pd.Series().
pVals = pd.Series(dtype=float)

# The scipy normaltest is based on D'Agostino and Pearson's test that
# combines skew and kurtosis to produce an omnibus test of normality.
_, pVals['Omnibus'] = stats.normaltest(ins_srednica)

# Shapiro-Wilk test
_, pVals['Shapiro-Wilk'] = stats.shapiro(ins_srednica)

# Lilliefors test, using the correctly spelled statsmodels function
# (the old `lillifors` alias is not available in newer releases).
_, pVals['Lilliefors'] = lilliefors(ins_srednica)

# Original Kolmogorov-Smirnov test against the standard normal distribution,
# run on the data standardised with the sample mean and sample std (ddof=1).
standardised = (ins_srednica - np.mean(ins_srednica)) / np.std(ins_srednica, ddof=1)
_, pVals['Kolmogorov-Smirnov'] = stats.kstest(standardised, 'norm')

print('p-values for all {0} data points: ----------------'.format(len(ins_srednica)))
print(pVals)

# Only the omnibus test decides the printed conclusion.
if pVals['Omnibus'] > 0.05:
    print('Data are normally distributed')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import (pairwise_tukeyhsd, MultiComparison)
from statsmodels.stats.libqsturng import psturng

In [None]:
# Box plot of Średnica, one box per pollinator group.
df1.boxplot(column='Średnica', by='Pollinator')

In [None]:
# Check homogeneity of variances of Średnica across the three pollinator
# groups with Levene's test before running the ANOVA.
W, p = stats.levene(ins_srednica, pas_srednica, hum_srednica)
if not p < 0.05:
    print('OK', p)
else:
    print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

In [None]:
# One-way ANOVA of Średnica between the INS, HUM and PAS groups.
f, p = stats.f_oneway(ins_srednica, hum_srednica, pas_srednica)

print('One-way ANOVA', '=============', sep='\n')
print('F value:', f)
print('P value:', p, '\n')

In [None]:
# Then set up the multiple-comparison object: Średnica values grouped by Pollinator.
multiComp = MultiComparison(df1['Średnica'], df1['Pollinator'])

In [None]:
# Tukey HSD pairwise comparison table for the current multiComp object.
print(multiComp.tukeyhsd().summary())

In [None]:
# Mean płatek values split by pollinator group (same split as for Średnica).
ins_płatek = df1.loc[df1['Pollinator'] == "INS", 'płatek']
pas_płatek = df1.loc[df1['Pollinator'] == "PAS", 'płatek']
hum_płatek = df1.loc[df1['Pollinator'] == "HUM", 'płatek']

In [None]:
# Check homogeneity of variances of płatek across the three pollinator
# groups with Levene's test before running the ANOVA.
W, p = stats.levene(ins_płatek, pas_płatek, hum_płatek)
if not p < 0.05:
    print('OK', p)
else:
    print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

In [None]:
# One-way ANOVA of płatek between the INS, HUM and PAS groups.
f, p = stats.f_oneway(ins_płatek, hum_płatek, pas_płatek)

print('One-way ANOVA', '=============', sep='\n')
print('F value:', f)
print('P value:', p, '\n')

In [None]:
# Then set up the multiple-comparison object: płatek values grouped by Pollinator.
# Note: this rebinds `multiComp`, replacing the Średnica comparison object.
multiComp = MultiComparison(df1['płatek'], df1['Pollinator'])

In [None]:
# Tukey HSD pairwise comparison table for the current multiComp object.
print(multiComp.tukeyhsd().summary())

In [None]:
# Rows of df1 whose pollinator group is "PAS".
df1[df1.Pollinator=='PAS']

In [None]:
# Box plot of płatek, one box per pollinator group, on a 12x8 figure.
df1.boxplot(column='płatek', by='Pollinator', figsize=(12, 8))

In [None]:
# Load the 'grubość płatka' sheet of the same workbook.
# `delimiter` is a CSV-reader option that read_excel does not take (current
# pandas raises TypeError on it), so it is dropped. `decimal=","` is kept so
# text cells written with a decimal comma are still parsed as numbers
# (read_excel accepts `decimal` in pandas >= 1.4).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
grub = pd.read_excel("/Users/FeatureForest/Documents/Spektra list gatunków Fritillaria.xlsx", sheet_name='grubość płatka', decimal=",")

In [None]:
# Discard the columns at positions 1..22, keeping the first column and
# everything from position 23 onward.
# Caution: positional, so re-running this cell removes further columns.
grub = grub.drop(columns=grub.columns[1:23])

In [None]:
# Inspect which columns remain after the positional drop.
grub.info()

In [None]:
# Convert the index labels to strings and give the "μm" and "Unnamed: 25"
# columns the short names "grub" and "N".
grub = grub.rename(index=str, columns={"μm": "grub", "Unnamed: 25": "N"})

In [None]:
# Replace the index with a fresh 0..n-1 range; the old labels become the first column.
grub = grub.reset_index()

In [None]:
# Remove the first column of the frame.
# Caution: positional, so re-running this cell removes another column.
grub = grub.drop(columns=grub.columns[0])

In [None]:
# Preview the cleaned `grub` table.
grub.head()

In [None]:
# Open the second workbook once so its individual sheets can be parsed later.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
xls = pd.ExcelFile('/Users/FeatureForest/Documents/Fritillaria Warsaw.xlsx')

In [None]:
# All sheet names of the workbook, in workbook order.
names = xls.sheet_names

In [None]:
# List of analysed species (37 entries, order matters).
# Later cells pair entry k with the code "Sp<k+1>" and, position by position,
# with the parsed sheets, so do not reorder or deduplicate without checking them.
# 'F. liliaceae', 'F. thunbergii' and 'F. michailovskyi' each appear twice.
# NOTE(review): presumably the duplicates are separate samples of the same species — confirm.
species = ["F. imperialis","F. michailovskyi",'F. whittallii','F. tubiformis','F. gracilis',
           'F. eduardii','F. minima','F. thunbergii','F. eastwoodiae', "F. liliaceae", 
           "F. liliaceae", "F. kurdica",'F. pyrenaica', 'F. pontica', 'F. thunbergii',
           'F. uva vulpis', 'F. affinis','F. montana',  'F. aryiana', 'F. dasyphylla', 
           'F. verticillata','F. serpenticola', 'F. mutabilis', 'F. raddeana', 'F. graeca', 
           'F. caucasica', 'F. gibbosa', 'F. armena','F. stenanthera','F. ussuriensis',
           'F. kotschyana','F. meleagris', "F. michailovskyi", 'F. latakesis', 'F. gentneri',
          'F. recurva', 'F. sewerzowii']

In [None]:
import re

In [None]:
# Keep the sheet names that contain "ou" with at least one character before
# and at least one character after it (re.match anchors at the start).
pattern = ".+ou.+"
sheet_name_matcher = re.compile(pattern)
outside = [sheet_name for sheet_name in names if sheet_name_matcher.match(sheet_name)]

In [None]:
# Remove the sheets at these positions of `outside` (per the original note:
# reflectance in colours other than red). The highest position goes first so
# the remaining positions do not shift while deleting.
# Caution: positional, so re-running this cell removes further entries.
indexes = [1, 3, 15, 17, 20, 35, 39, 40]
for position in reversed(sorted(indexes)):
    outside.pop(position)

In [None]:
#outside

In [None]:
# Position of every selected sheet name within the workbook's sheet list ...
indexes_outside = [names.index(sheet_name) for sheet_name in outside]

# ... and the parsed DataFrame for each of those positions, in the same order.
sheets_outside = [xls.parse(sheet_position) for sheet_position in indexes_outside]

In [None]:
# Restrict every sheet to the UV & visible range: rows with 300 < nm < 700
# (both bounds exclusive).
sheets_outside[:] = [
    sheet[(sheet.nm > 300) & (sheet.nm < 700)]
    for sheet in sheets_outside
]

In [None]:
# Columns with reflectance values: every second column of the first sheet,
# starting from the second one.
# NOTE(review): the same labels are applied to every sheet later on — assumes
# all sheets share the first sheet's column names; confirm.
columns_ref = sheets_outside[0].columns[1::2]

In [None]:
# Add a `mean` column to every sheet: the row-wise mean of the reflectance
# columns divided by 1000 (relative value per wavelength row).
sheets_outside[:] = [
    sheet.assign(mean=sheet[columns_ref].mean(axis=1) / 1000)
    for sheet in sheets_outside
]

In [None]:
from functools import reduce

In [None]:
# Codes "Sp1" .. "Sp37", paired position by position with the species list.
numbers = ["Sp" + str(k) for k in range(1, 38)]
nms = list(zip(numbers, species))

# For the first nine selected sheet names, substitute every code with its
# species name, applying the (code, species) pairs in list order.
# NOTE(review): presumably the sheet names embed these SpN codes — confirm.
outside_species = [
    reduce(lambda text, pair: text.replace(*pair), nms, outside[i])
    for i in range(9)
]

In [None]:
# Remaining sheet names: only the pairs from "Sp10" onward are applied, so the
# one-digit codes ("Sp1" is a prefix of "Sp10" .. "Sp19", etc.) cannot replace
# part of a two-digit code.
outside_species.extend(
    reduce(lambda text, pair: text.replace(*pair), nms[9:], sheet_name)
    for sheet_name in outside[9:]
)

In [None]:
#sheets_outside[0]

In [None]:
#(outside_species)

In [None]:
# Number of species entries (must equal the number of parsed sheets for the
# DataFrame built from `species` and `srednia` further down).
len(species)

In [None]:
# One number per sheet: the average of its `mean` column over the kept wavelengths.
srednia = [sheet['mean'].mean() for sheet in sheets_outside]

In [None]:
# Number of per-sheet averages; compare with len(species) above.
len(srednia)

In [None]:
#srednia

In [None]:
# Pair every species entry with the average of the sheet at the same position.
# The two lists must have equal length, otherwise the constructor raises.
# NOTE(review): relies on `species` and `sheets_outside` sharing the same order — confirm.
reflektancja = pd.DataFrame(dict(Gatunek=species, srednia=srednia))

In [None]:
#reflektancja

In [None]:
# Show the full `grub` table before merging it.
grub

In [None]:
# Inner join: keep only species present both in `reflektancja` (Gatunek) and in `grub` (sp).
df = reflektancja.merge(grub, how='inner', left_on='Gatunek', right_on='sp')

In [None]:
# Show the merged reflectance / grub table.
df

In [None]:
# Correlation (pandas default: Pearson) between `srednia` and `grub` over all merged rows.
df['srednia'].corr(df['grub'])

In [None]:
# Returns the pollinator group code for a species name
# (same mapping as the `pollinator` defined near the top of the notebook).
def pollinator(name):
    """Return "PAS", "HUM" or "INS" depending on the species name.

    'F. eduardii' and 'F. imperialis' give "PAS"; 'F. recurva' and
    'F. gentneri' give "HUM"; anything else gives "INS".
    """
    is_pas = name == 'F. eduardii' or name == 'F. imperialis'
    is_hum = name == "F. recurva" or name == "F. gentneri"
    return "PAS" if is_pas else ("HUM" if is_hum else "INS")

In [None]:
# Tag every merged row with its pollinator group code.
df['Pollinator'] = df['Gatunek'].map(pollinator)

In [None]:
# Show the merged table with the new Pollinator column.
df

In [None]:
# Rows of the merged table that belong to the "INS" group.
ins = df.loc[df['Pollinator'] == "INS"]

In [None]:
# Correlation between `srednia` and `grub` within the "INS" group.
ins['srednia'].corr(ins['grub'])

In [None]:
# Rows of the merged table that belong to the "HUM" group.
hum = df.loc[df['Pollinator'] == "HUM"]

In [None]:
# Correlation between `srednia` and `grub` within the "HUM" group.
# pollinator() maps only two species names to "HUM", so unless a species has
# several rows this is computed on at most two points (result is ±1 or NaN).
hum['srednia'].corr(hum['grub'])

In [None]:
# Correlation between `srednia` and `grub` within the "PAS" group.
# pollinator() maps only two species names to "PAS", so unless a species has
# several rows this is computed on at most two points (result is ±1 or NaN).
pas = df.loc[df['Pollinator'] == "PAS"]
pas['srednia'].corr(pas['grub'])

In [None]:
# Inner join of the per-species means (df1) with `grub`.
# Note: this rebinds `df`, replacing the reflectance/grub merge built earlier.
df = df1.merge(grub, how='inner', left_on='Gatunek', right_on='sp')

In [None]:
# Species names on the df1 side of the join (to compare with grub.sp).
df1.Gatunek.unique()

In [None]:
# Species names on the grub side of the join (to compare with df1.Gatunek).
grub.sp.unique()

In [None]:
# Show the merged df1 / grub table.
df

In [None]:
# Hex colour for each pollinator group code, and a per-row `colors` column
# looked up from it (an unknown code raises KeyError).
col = dict(INS='#FFE125', PAS='#007800', HUM='#bf0000')
df["colors"] = df["Pollinator"].map(lambda code: col[code])

In [None]:
# Bubble chart of płatek against Średnica (log-scaled x axis): one scatter
# series per pollinator group, marker area = grub / 2, colour from df.colors.
labels = []
sns.set_style("white")
for key in col:
    labels.append(key)
    in_group = df.Pollinator == key
    plt.scatter(x=df.Średnica[in_group], y=df.płatek[in_group],
                s=np.array(df.grub[in_group]) / 2,
                c=df.colors[in_group], alpha=0.7, edgecolors='black')

plt.xscale('log')
plt.xlabel('Petal diameter (log scale)')
plt.ylabel('Petal length')
plt.title('Petal length vs petals diameter', fontsize=18)

lgnd = plt.legend(labels, title='Color by pollinator')
# Give the legend markers one fixed size instead of the data-driven sizes.
# Newer Matplotlib exposes the handles as `legend_handles`; the older
# `legendHandles` name is deprecated and later removed, so try the new
# name first and fall back for old installations.
handles = getattr(lgnd, "legend_handles", None)
if handles is None:
    handles = lgnd.legendHandles
for handle in handles[:len(col)]:
    handle.set_sizes([60])

# Hand-placed size key in data coordinates (drawn after the legend call, so
# the reference bubble is not listed in the legend).
# NOTE(review): the reference bubble uses s=100, which under the s = grub / 2
# scaling above corresponds to grub = 200, while its label reads "50 µg" and
# the caption says "density" — confirm the intended value and unit.
plt.text(11.8,40, "Size by density")
plt.scatter(14, 37, s=100, c = 'white', edgecolors='k')
plt.text(15.2, 36,  r'$50 \mu g$')
# Show the plot
plt.show()