In [None]:
from build_db import *
from classifications import *
#from skrebate import ReliefF
#from skrebate import SURFstar
#from skrebate import MultiSURFstar
from create_plots import *
import datetime
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load every spectral range with the same steps, then join the ranges on the
# sample 'index' so each row carries the feature columns of all ranges.
N_RANGES = 4              # input CSVs are numbered range_1 .. range_4
PLATEAU_TOLERANCE = 0.75  # forwarded to get_acquisitions_in_plateau
SPECTRA_TO_KEEP = 4       # forwarded to get_acquisitions_in_plateau

df_all = None
feature_union = set()
for i in range(1, N_RANGES + 1):
    path = fr'your_data_path_here_range_{i}.csv'
    df_multiple = get_acquisitions_in_plateau(
        path,
        range_idx=i,
        tolerance=PLATEAU_TOLERANCE,
        spectra_to_keep=SPECTRA_TO_KEEP,
    )
    # Every column except the bookkeeping ones is treated as a spectral feature.
    features = list(df_multiple.columns.drop(['acq', 'index']))
    # Same assignment form for every range (previously range 1 used df[...] and
    # ranges 2-4 used df.loc[:, ...], which differ in how dtypes are handled).
    df_multiple[features] = df_multiple[features].apply(normalize_spectrum_sum, axis=1)
    # Tag the acquisition column with its range so the columns stay distinct
    # after merging; rename returns a new frame instead of mutating in place.
    df_filtered = apply_filters(df_multiple, features).rename(columns={'acq': f'acq_{i}'})
    feature_union.update(features)
    if df_all is None:
        df_all = df_filtered
    else:
        df_all = df_all.merge(df_filtered, on='index', suffixes=('', '_bis')).reset_index(drop=True)

# Feature names are numeric strings/values; order them numerically.
# NOTE: `features` is intentionally left holding the last range's feature list,
# because a later plotting cell reads it.
features_all = sorted(feature_union, key=float)

In [None]:
# Display only the per-range acquisition columns (names containing 'acq').
acq_columns = [column for column in df_all.columns if 'acq' in column]
df_all[acq_columns]

In [None]:
# Load the labels table from CSV via the project helper get_labels.
# NOTE(review): the meaning of range_=2 is not visible in this notebook —
# confirm against get_labels and document why 2 is the right value.
labels_path = r'your_labels_here.csv'
df_labels = get_labels(labels_path,range_=2)

In [None]:
# Attach the labels to the spectra (inner join on 'index'), then drop the
# samples whose 'index' is listed as an outlier.
outliers = ["your_index_outliers"]
df_merged = df_all.merge(df_labels, on='index').reset_index(drop=True)
is_outlier = df_merged['index'].isin(outliers)
df_merged = df_merged[~is_outlier]

In [None]:
# Number of distinct 'index' values in df_merged, shown as a 1-tuple.
df_merged['index'].unique().shape

In [None]:
# Plot the spectra averaged per 'index' (mean over numeric columns only).
# 'Covid' is passed as the third argument — presumably the grouping/colour
# column; verify against plot_multiple_spectra.
plot_multiple_spectra(df_merged.groupby('index').mean(numeric_only=True).reset_index(),features_all,'Covid',title='Spectra normalized and filtered')

In [None]:
# Re-apply normalize_spectrum_sum row-wise across the combined feature set,
# keeping the row index of df_merged.
renormalized = df_merged.loc[:, features_all].apply(normalize_spectrum_sum, axis=1)
df_filtered = pd.DataFrame(renormalized, columns=features_all, index=df_merged.index)

# Names of the feature columns that contain at least one non-zero value.
has_signal = (df_filtered != 0).any(axis=0)
new_features = df_filtered.loc[:, has_signal].columns

# Copy over every non-feature column (labels, 'index', acquisition ids, ...).
other_columns = df_merged.columns.drop(df_filtered.columns)
df_filtered[other_columns] = df_merged[other_columns]

In [None]:
# Same per-'index' mean plot as for df_merged, but on the re-normalised frame.
# NOTE(review): the title is identical to the df_merged plot; consider a
# distinct title so the two figures can be told apart.
plot_multiple_spectra(df_filtered.groupby('index').mean(numeric_only=True).reset_index(),features_all,'Covid',title='Spectra normalized and filtered')

In [None]:
# Number of distinct 'index' values in df_filtered, shown as a 1-tuple.
df_filtered['index'].unique().shape

In [None]:
# (rows, columns) of the re-normalised frame, including non-feature columns.
df_filtered.shape

In [None]:
# 2-D t-SNE projection of the samples, coloured by 'Covid', after robust scaling.
# NOTE(review): `features` is the leftover feature list of the last range
# loaded, not `features_all`, and the frame is df_merged rather than
# df_filtered — confirm this is the intended input.
tsne_embedder = TSNE(random_state=0, n_components=2)
robust_prep = Pipeline([
    ('prep', RobustScaler()),
])
plot_transformer(
    df_merged,
    features,
    transformer=tsne_embedder,
    index='index',
    color='Covid',
    additional_text='',
    pre_processer=robust_prep,
)

In [None]:
# Size of the combined feature set across all ranges.
len(features_all)

In [None]:
# Keep only the features that VarianceThreshold retains with its default settings.
vr = VarianceThreshold()
vr.fit(df_filtered[features_all])
new_features = list(vr.get_feature_names_out())
print(len(new_features))

# 'index' is carried along in both X and y — presumably so train_test can
# group acquisitions of the same sample; verify against train_test.
X = df_filtered[new_features + ['index']]
y = df_filtered[['Covid', 'index']]

# Baseline classifiers with fixed hyper-parameters.
clf1 = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
clf2 = RandomForestClassifier(random_state=1)
clf3 = LogisticRegression()
clf4 = GradientBoostingClassifier(
    min_samples_leaf=2,
    n_estimators=500,
    max_depth=3,
    learning_rate=0.1,
    random_state=1,
)
clf5 = SVC(gamma='auto', C=1, probability=True, random_state=1)
# Soft-voting ensemble over the five classifiers above.
eclf = VotingClassifier(
    estimators=[('knn', clf1), ('rf', clf2), ('gnb', clf3), ('gbc', clf4), ('svc', clf5)],
    voting='soft',
)

# Every model shares the same preprocessing: robust scaling, then a
# 20-component PCA, then the classifier.
base_models = {
    'KNN': clf1,
    'Random Forest': clf2,
    'Logistic Regression': clf3,
    'Gradient Boosting': clf4,
    'SVC': clf5,
    'Ensemble': eclf,
}
clf_dict = {
    label: Pipeline([
        ('std', RobustScaler()),
        ('pca', PCA(20)),
        ('clf', estimator),
    ])
    for label, estimator in base_models.items()
}


In [None]:
import contextlib

# Everything printed or logged during the run is written to these two files
# instead of the notebook output.
output_filename = 'PERFORMANCES 1734 FEATURES'
std_out_path = f'./{output_filename}.out'
std_err_path = f'./{output_filename}.err'

# One flat `with` instead of four nested ones; the context managers are entered
# in the same order (stderr file, stderr redirect, stdout file, stdout redirect).
with open(std_err_path, 'w') as ferr, contextlib.redirect_stderr(ferr), \
        open(std_out_path, 'w') as fout, contextlib.redirect_stdout(fout):
    results, list_df = train_test(
        clf_dict,
        X,
        y,
        new_features,
        return_train_score=False,
        cv=StratifiedGroupKFold(n_splits=10, random_state=1, shuffle=True),
    )
    print_scores(results)

In [None]:
# Print the baseline scores in the notebook too; the training cell redirected
# its own print_scores output to the .out file.
print_scores(results)

In [None]:
import os
from joblib import dump, load

# Create the output folder first: dump() and open() below raise
# FileNotFoundError when ./MODELS does not exist (e.g. on a fresh checkout).
os.makedirs('./MODELS', exist_ok=True)

# Refit every baseline pipeline on all available rows and persist it.
for key, model in clf_dict.items():
    model.fit(X[new_features], y['Covid'])
    dump(model, f'./MODELS/{key}_all_patients.joblib')

# Store the feature names the models were trained on, each followed by ';'
# (same file content as before, written in a single call).
with open('./MODELS/USED_FEATURES.txt', 'w') as fout:
    fout.write(''.join(f'{item};' for item in new_features))

In [None]:
# Read back the ';'-terminated feature names saved next to the models.
with open('./MODELS/USED_FEATURES.txt', 'r') as fin:
    raw_features = fin.read()
# Drop the trailing separator(s) before splitting so no empty name is produced.
used_features = raw_features.rstrip(';').split(';')

In [None]:
# Search spaces sampled by RandomizedSearchCV, one per base classifier.
param_dist1 = {'n_neighbors': randint(1, 10), 'metric': ['manhattan', 'minkowski']}
param_dist2 = {
    'n_estimators': randint(10, 1000),
    'max_features': ['sqrt', 'log2'],
    'max_depth': randint(2, 10),
    'min_samples_leaf': [1, 2, 3, 10],
}
# LogisticRegression's default 'lbfgs' solver only accepts penalty 'l2' or None,
# and 'elasticnet' additionally requires l1_ratio. Sampling all four penalties
# from one flat dict made the 'l1'/'elasticnet' candidates fail to fit; the
# search scores them as NaN and the warning is hidden by the global warnings
# filter. A list of dicts keeps every sampled candidate valid: one dict is
# picked uniformly, then its parameters are sampled.
param_dist3 = [
    {'C': uniform(loc=0, scale=4), 'penalty': ['l2', None]},
    {'C': uniform(loc=0, scale=4), 'penalty': ['l1'], 'solver': ['saga']},
    {
        'C': uniform(loc=0, scale=4),
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': uniform(loc=0, scale=1),
    },
]
param_dist4 = {
    'n_estimators': randint(10, 1000),
    'max_depth': randint(2, 10),
    'learning_rate': uniform(0.01, 0.5),
    'min_samples_leaf': [1, 2, 3, 10],
}
param_dist5 = {'C': uniform(loc=0, scale=4), 'gamma': ['scale', 'auto'], 'kernel': ['rbf']}

# Fresh, un-tuned estimator instances used as the starting point of each search.
clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier(random_state=1)
clf3 = LogisticRegression()
clf4 = GradientBoostingClassifier(min_samples_leaf=2, random_state=1)
# random_state added so the probability calibration is seeded, matching the
# SVC used for the baseline models.
clf5 = SVC(probability=True, random_state=1)
eclf = VotingClassifier(
    estimators=[('knn', clf1), ('rf', clf2), ('gnb', clf3), ('gbc', clf4), ('svc', clf5)],
    voting='soft',
)

# Classifier name -> (estimator, search space).
classifiers = {
    'KNN CV': (clf1, param_dist1),
    'Random Forest CV': (clf2, param_dist2),
    'Logistic Regression CV': (clf3, param_dist3),
    'Gradient Boosting CV': (clf4, param_dist4),
    'Support Vector Machine CV': (clf5, param_dist5),
}

# Wrap each estimator in a randomized search and place the search at the end
# of the shared preprocessing pipeline (robust scaling -> 20-component PCA).
# NOTE(review): the inner cv=5 split is not group-aware, unlike the outer
# StratifiedGroupKFold used for evaluation — confirm this is acceptable.
clf_dict_CV = {}
for name, (clf, param_dist) in classifiers.items():
    rs = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        n_iter=50,
        cv=5,
        random_state=1,
        n_jobs=-1,
    )
    clf_dict_CV[name] = Pipeline([
        ('std', RobustScaler()),
        ('pca', PCA(20)),
        ('clf', rs),
    ])

# NOTE(review): the ensemble is built from the un-tuned clf1..clf5 instances,
# so 'Ensemble CV' does not use the hyper-parameters found by the searches
# above — confirm this is intended.
clf_dict_CV['Ensemble CV'] = Pipeline([
    ('std', RobustScaler()),
    ('pca', PCA(20)),
    ('clf', eclf),
])

In [None]:
import contextlib
import os

# Write the baseline scores to a text report. The folder is created first
# because open(..., 'w') raises FileNotFoundError when ./RESULTS is missing.
filename ='./RESULTS/BASIC_MODELS.txt'
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'w') as fout, contextlib.redirect_stdout(fout):
    print_scores(results)

In [None]:
import contextlib
import os

# Evaluate the tuned pipelines with the same grouped 10-fold split as the
# baseline run, sending everything printed to the report file. The folder is
# created first because open(..., 'w') raises FileNotFoundError when
# ./RESULTS is missing.
filename ='./RESULTS/CV_MODELS.txt'
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'w') as fout, contextlib.redirect_stdout(fout):
    results_2, list_df_2 = train_test(
        clf_dict_CV,
        X,
        y,
        new_features,
        return_train_score=False,
        cv=StratifiedGroupKFold(n_splits=10, random_state=1, shuffle=True),
    )
    print_scores(results_2)

In [None]:
import os
from joblib import dump, load

# Create the output folder first: dump() raises FileNotFoundError when
# ./MODELS does not exist.
os.makedirs('./MODELS', exist_ok=True)

# Refit the *tuned* pipelines on all rows and persist them. This loop used to
# iterate clf_dict, which re-saved the baseline models under the "_cv_" file
# names and never fitted or stored the RandomizedSearchCV pipelines.
for key, model in clf_dict_CV.items():
    model.fit(X[new_features], y['Covid'])
    dump(model, f'./MODELS/{key}_cv_all_patients.joblib')