# Projeto 1 - Modelo de Classificação

### Projeto realizado por :

#### 
* Cosmin Trandafir - 57101
* Martim Baptista - 56323
* João Serafim - 56376
* Martim Paraíba - 56273
***

#### Contador de horas

* Cosmin Trandafir - 4h
* Martim Baptista - 
* João Serafim - 3h
* Martim Paraíba - 
***

Questões:
- Devemos apagar colunas que possuem demasiados valores a zero?
- Devemos normalizar primeiro ou imputar primeiro?
- Devemos testar e mostrar os testes de todos os hiperparâmetros ou apenas os principais?

- Devemos testar com IterativeImputer?

### Neste projeto vamos usar o dataset: ***biodegradable_a.csv***

In [None]:
import pandas as pd

# Load the biodegradable dataset from a CSV file expected in the working directory.

bio_df = pd.read_csv("biodegradable_a.csv")
# Bare expression: as the last statement of a notebook cell it displays the DataFrame.
bio_df

## Generic functions and Imports

In [None]:
import numpy as np
# enable_iterative_imputer must be imported BEFORE IterativeImputer: the estimator is
# experimental and scikit-learn raises ImportError unless it has been enabled first.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer
from sklearn.preprocessing import LabelEncoder

In [None]:
def classification_scores(y_test, y_pred):
    """Print precision, recall, F1, MCC and the confusion matrix for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    # Each scalar metric is printed with the same "<caption>: <value>" layout,
    # so drive the output from a small (caption, metric function) table.
    scalar_metrics = (
        ("The Precision is", precision_score),
        ("The Recall is", recall_score),
        ("The F1 score is", f1_score),
        ("The Matthews correlation coefficient is", matthews_corrcoef),
    )
    for caption, metric in scalar_metrics:
        print("%s: %7.4f" % (caption, metric(y_test, y_pred)))

    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

def evaluate_model(imputer_tuple, scaler_tuple, classifier, X_train, X_test, y_train, y_test=None):
    """Impute, scale and fit one imputer/scaler/classifier combination, then score it.

    The imputer and the scaler are fitted on the training features only and then
    applied to both splits. The classifier is fitted on the processed training
    data. The reported score is the mean 10-fold cross-validation score of the
    classifier computed on the processed test split.

    Args:
        imputer_tuple: ``(display_name, imputer)`` pair; the imputer must expose
            ``fit`` and ``transform``.
        scaler_tuple: ``(display_name, scaler)`` pair; the scaler must expose
            ``fit`` and ``transform``.
        classifier: Estimator exposing ``fit`` and accepted by ``cross_val_score``.
        X_train: Training features (may contain missing values).
        X_test: Test features (may contain missing values).
        y_train: Training labels.
        y_test: Test labels. When omitted, the module-level ``y_test`` is used,
            which is what call sites written before this parameter existed rely on.

    Returns:
        The mean cross-validation score.
    """
    if y_test is None:
        # Backward compatibility: older call sites do not pass the test labels and
        # depend on a notebook-level ``y_test`` variable being defined.
        y_test = globals()["y_test"]

    # Impute missing values using statistics learned from the training split only.
    imputer = imputer_tuple[1]
    imputer.fit(X_train)
    X_train_imputed = imputer.transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Scale both splits with parameters learned from the training split only.
    scaler = scaler_tuple[1]
    scaler.fit(X_train_imputed)
    X_train_scaled = scaler.transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    # Fit the classifier that was passed in, so the caller gets it back trained.
    classifier.fit(X_train_scaled, y_train)

    # NOTE(review): cross_val_score refits clones of the classifier on folds of the
    # *test* split, so the fit above does not influence this score. Confirm this is
    # the intended protocol; cross-validating on the training split is the usual choice.
    cv_scores = cross_val_score(classifier, X_test_scaled, y_test, cv=10)
    mean_cv_score = cv_scores.mean()

    print("Imputer: {} \nScaler: {}\nCrossValidationScore: {:.6f}\n".format(imputer_tuple[0], scaler_tuple[0], mean_cv_score))
    return mean_cv_score


## Initialization

In [None]:
# Split the table into the feature columns (all but the last) and the class column (last).
X = bio_df.iloc[:, :-1]
y = bio_df.iloc[:, -1]

# Encode the string class labels as integers so the later cells can work with a
# numeric target. LabelEncoder is imported by name in the imports cell; the
# previous `preprocessing.LabelEncoder()` referenced a module that was never imported.
le = LabelEncoder()
df_encoded_classes = le.fit_transform(y)

## Data Imputation


***
### Simple Imputer

In [None]:
# Hold out a third of the rows for testing; labels are the integer-encoded classes.
X_train, X_test, y_train, y_test = train_test_split(X, df_encoded_classes, test_size=0.33)

# Candidate imputers as (display name, instance) pairs.
imputers = [
    ("SimpleImputer = mean", SimpleImputer(strategy='mean')),
    ("SimpleImputer = median", SimpleImputer(strategy='median')),
]
imputers += [
    ("KNNImputer {}".format(k), KNNImputer(n_neighbors=k))
    for k in (3, 5, 9, 11)
]
imputers.append(("IterativeImputer", IterativeImputer()))

# Candidate scalers as (display name, instance) pairs.
scalers = [
    ("MinMaxScaler", MinMaxScaler()),
    ("StandarScaler", StandardScaler()),
    ("Normalizer", Normalizer()),
    ("PowerTransformer", PowerTransformer()),
]

# Label every imputer/scaler pairing; `combos[i]` names the pairing scored in `scores[i]`.
combos = []
for imputer_entry in imputers:
    for scaler_entry in scalers:
        combos.append(imputer_entry[0] + " + " + scaler_entry[0])

# Score every pairing with a fresh logistic regression, in the same order as `combos`.
scores = []
for imputer_entry in imputers:
    for scaler_entry in scalers:
        pairing_score = evaluate_model(
            imputer_entry,
            scaler_entry,
            LogisticRegression(max_iter=10000),
            X_train,
            X_test,
            y_train,
        )
        scores.append(pairing_score)




In [None]:
# Bar chart of the score obtained by every imputer + scaler combination.
fig, ax = plt.subplots(figsize=(20, 13))
ax.bar(combos, scores)
ax.set_title('Model Evaluation Metrics')
ax.set_ylim([0.8, 1.0])
ax.grid(axis='y')
# Rotate the tick labels the categorical axis already created, rather than
# overwriting them with set_xticklabels (which expects fixed tick positions).
ax.tick_params(axis='x', labelrotation=90)

# Mark the best-scoring combination with a dashed horizontal line.
max_score_idx = np.argmax(scores)
highest_name, highest_score = combos[max_score_idx], scores[max_score_idx]
# Put the legend text on the line itself: `ax.legend([text])` assigns labels by
# artist order, which can attach the text to a bar instead of this line.
ax.axhline(
    y=highest_score,
    color='r',
    linestyle='--',
    label=f'Max score: {highest_name}: {highest_score:.4f}',
)

ax.legend()
plt.show()


In [None]:
# Compare the scalers again, this time with the imputer fixed to KNN with 5 neighbours.
scalers = [
    ("MinMaxScaler", MinMaxScaler()),
    ("StandarScaler", StandardScaler()),
    ("Normalizer", Normalizer()),
    ("PowerTransformer", PowerTransformer()),
]

for scaler_entry in scalers:
    # A fresh imputer and a fresh classifier are created for every run.
    knn_entry = ("KNNImputer 5", KNNImputer(n_neighbors=5))
    evaluate_model(
        knn_entry,
        scaler_entry,
        LogisticRegression(max_iter=10000),
        X_train,
        X_test,
        y_train,
    )

## Data Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PowerTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Median imputation for now. TODO: switch to the best imputer found above.
simple_imputer = SimpleImputer(strategy="median")
df_imputed_simple = simple_imputer.fit_transform(X)
# Note: this split uses the original (string) class labels in `y`.
X_train, X_test, y_train, y_test = train_test_split(
    df_imputed_simple, y, test_size=0.33
)
X_train.shape, X_test.shape  # value is discarded here (not the last statement of the cell)

# Pair each scaler instance with its display name, which is the class name.
scalers = [
    (scaler_cls(), scaler_cls.__name__)
    for scaler_cls in (MinMaxScaler, StandardScaler, Normalizer, PowerTransformer)
]

for scaler, name in scalers:
    # Fit the scaler on the training split only, then apply it to both splits.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = LogisticRegression(max_iter=10000)
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    # 'RB' is treated as the positive class for precision and recall.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='RB')
    recall = recall_score(y_test, y_pred, pos_label='RB')
    cm = confusion_matrix(y_test, y_pred)

    print(
        f"Results for: {name}",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"Confusion Matrix:\n{cm}\n",
        sep="\n",
    )

## Feature Selection

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import explained_variance_score, mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, KFold
def _top_correlated_features(X, y, k):
    """Return the indices of the ``k`` columns of ``X`` most correlated with ``y``.

    Columns are ranked by the absolute value of their Pearson correlation with
    ``y``, so strongly negative correlations rank as high as strongly positive
    ones. Columns whose correlation is undefined (NaN, e.g. constant columns)
    are ranked last. Ties keep their original column order.
    """
    features = np.asarray(X, dtype=float)
    target = np.asarray(y, dtype=float).reshape(-1, 1)
    # Row 0 of the correlation matrix belongs to the target; entries 1.. are its
    # correlation with each feature column.
    corr_with_y = np.corrcoef(np.hstack((target, features)).T)[0, 1:]
    strength = np.nan_to_num(np.abs(corr_with_y), nan=0.0)
    return np.argsort(-strength, kind="stable")[:k].tolist()


def model_testing(X_train, X_test, y_train, y_test, feature_selection=None, top_k=5):
    """Fit a decision-tree and a linear regressor and print their explained variance.

    Args:
        X_train: Training features, 2-D array-like.
        X_test: Test features, 2-D array-like with the same columns as ``X_train``.
        y_train: Training target.
        y_test: Test target.
        feature_selection: ``'correlation'`` keeps only the ``top_k`` columns most
            correlated (in absolute value) with ``y_train`` and prints their
            indices; any other value uses all columns.
        top_k: Number of columns kept when ``feature_selection='correlation'``.

    Returns:
        None. Results are printed.
    """
    if feature_selection == 'correlation':
        top_features = _top_correlated_features(X_train, y_train, top_k)
        print(f"Top {top_k} by correlation:", top_features)
        # The selection is learned from the training split and applied to both splits.
        X_train = np.asarray(X_train)[:, top_features]
        X_test = np.asarray(X_test)[:, top_features]

    # train models
    dtr = DecisionTreeRegressor(max_depth=5)
    dtr.fit(X_train, y_train)

    lmr = LinearRegression()
    lmr.fit(X_train, y_train)

    # evaluate models
    dt_preds = dtr.predict(X_test)
    lr_preds = lmr.predict(X_test)

    print("RVE DTs: %7.4f" % explained_variance_score(y_test, dt_preds))
    print("RVE LRs: %7.4f" % explained_variance_score(y_test, lr_preds))

### Analysis of correlation

In [None]:
# Fresh split of the median-imputed features against the integer-encoded classes.
X_train, X_test, y_train, y_test = train_test_split(
    df_imputed_simple, df_encoded_classes, test_size=0.33
)

# Run the same evaluation twice: once on every column, once on the five columns
# picked by correlation with the target.
scenarios = (
    ("------Using all variables------", None),
    ("------Using the top5------", 'correlation'),
)
for banner, selection in scenarios:
    print(banner)
    model_testing(X_train, X_test, y_train, y_test, feature_selection=selection)


### Stepwise - Forward and Backward

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

N, M = X_train.shape

# Estimators that drive the sequential selection.
lmr = LinearRegression()
dtr = DecisionTreeRegressor(max_depth=3)

# One entry per run: (heading printed before the result or None,
#                     estimator, extra SequentialFeatureSelector options).
# The linear-regression run passes no direction and so uses the selector's default.
# The backward decision-tree run takes a bit longer.
stepwise_runs = (
    (None, lmr, {}),
    ("Decision tree: Forward", dtr, {"direction": "forward"}),
    ("Decision tree: Backward", dtr, {"direction": "backward"}),
)

for heading, estimator, sfs_options in stepwise_runs:
    sfs = SequentialFeatureSelector(estimator, n_features_to_select=5, **sfs_options)
    sfs.fit(X_train, y_train)

    # Translate the boolean support mask into column indices.
    features = sfs.get_support()
    Features_selected = np.arange(M)[features]
    if heading is not None:
        print(heading)
    print("The features selected are columns: ", Features_selected)

    # Keep only the selected columns and evaluate the regressors on them.
    nX_train = sfs.transform(X_train)
    nX_test = sfs.transform(X_test)

    model_testing(nX_train, nX_test, y_train, y_test)