# Практична робота №4
### Студентки групи МІТ-31 (підгрупа 1)
### Борук Дарини Ігорівни

##### 1st task

In [48]:
# Task 1: binary classification with a grid-searched logistic regression.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Load the dataset and separate the feature columns from the 'Target' column.
data = pd.read_csv('classification_dataset.csv')
X = data.drop(columns='Target')
y = data['Target']

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# Candidate hyperparameter values explored by the grid search.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # regularization parameter
    'max_iter': [100, 200, 300, 400],  # maximum number of solver iterations
}

# Search the grid with cv=5 on the training split only.
model = LogisticRegression()
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict labels for the held-out test rows with the best estimator.
y_pred = best_model.predict(X_test)

# Score the test predictions; AUC-ROC uses the second column of
# predict_proba rather than the hard labels.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the chosen parameters followed by every metric, two decimals each.
print("Best Parameters:", best_params)
print("\nPerformance evaluation:")
for template, value in (
    ("* Accuracy: {:.2f}", accuracy),
    ("* Recall: {:.2f}", recall),
    ("* F1-Score: {:.2f}", f1),
    ("* AUC-ROC: {:.2f}", roc_auc),
):
    print(template.format(value))
print("* Confusion Matrix:")
print(conf_matrix)


Best Parameters: {'C': 1, 'max_iter': 100}

Performance evaluation:
* Accuracy: 0.85
* Recall: 0.93
* F1-Score: 0.81
* AUC-ROC: 0.94
* Confusion Matrix:
[[21  5]
 [ 1 13]]


##### 2nd task

In [49]:
# Task 2: multiclass classification with a grid-searched random forest.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Load the dataset and separate the feature columns from the 'Target' column.
data = pd.read_csv('multiclass_dataset.csv')
X = data.drop(columns='Target')
y = data['Target']

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# Candidate hyperparameter values explored by the grid search.
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [50, 100, 150],
}

# Search the grid with cv=5 on the training split only; the forest is seeded
# so repeated runs build the same trees.
model = RandomForestClassifier(random_state=13)
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict labels for the held-out test rows with the best estimator.
y_pred = best_model.predict(X_test)

# Score the test predictions; recall and F1 are requested with
# average="weighted" because the target has more than two classes.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the chosen parameters followed by every metric, two decimals each.
print("Best Parameters:", best_params)
print("\nPerformance evaluation:")
for template, value in (
    ("* Accuracy: {:.2f}", accuracy),
    ("* Recall: {:.2f}", recall),
    ("* F1-Score: {:.2f}", f1),
):
    print(template.format(value))
print("* Confusion Matrix:")
print(conf_matrix)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Performance evaluation:
* Accuracy: 0.72
* Recall: 0.72
* F1-Score: 0.72
* Confusion Matrix:
[[9 0 2 0]
 [2 9 0 0]
 [3 0 6 1]
 [0 3 0 5]]


##### 3rd task

In [50]:
# Task 3: multilabel classification with one grid-searched random forest per
# label (wrapped in MultiOutputClassifier).
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, multilabel_confusion_matrix
# GridSearchCV is imported here so this cell runs on its own instead of
# relying on an earlier cell having imported it.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

# Load the dataset and pick the feature columns and the four label columns.
data = pd.read_csv('multilabel_dataset.csv')
X = data[['Feature1', 'Feature2', 'Feature3']]
y = data[['Label1', 'Label2', 'Label3', 'Label4']]

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# The 'estimator__' prefix routes each parameter to the RandomForestClassifier
# wrapped inside MultiOutputClassifier.
param_grid = {
    'estimator__max_depth': [10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__n_estimators': [50, 100, 150]
}

# Search the grid with cv=5 on the training split only.
model = MultiOutputClassifier(RandomForestClassifier(random_state=13))
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict all labels for the held-out test rows with the best estimator.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# average=None makes precision/recall/F1 return one score for EACH label
# instead of a single aggregated number.
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

conf_matrix = multilabel_confusion_matrix(y_test, y_pred)

# Round to hundredths. Converting to a built-in float first keeps the printed
# lists as plain numbers; NumPy scalars inside a list are shown as
# "np.float64(...)" by NumPy 2 and later.
precision = [round(float(p), 2) for p in precision]
recall = [round(float(r), 2) for r in recall]
f1 = [round(float(f), 2) for f in f1]


print("Best Parameters:", best_params)
print("\nPerformance evaluation:")
print("* Accuracy: {:.2f}".format(accuracy))
print("* Precision: ", precision)
print("* Recall: ", recall)
print("* F1-Score: ", f1)
# The loop variable is deliberately not called `confusion_matrix`, so it does
# not rebind the sklearn function of that name imported by the earlier cells.
for i, label_matrix in enumerate(conf_matrix, start=1):
    print(f"\nConfusion matrix for Label {i}:\n{label_matrix}\n")

Best Parameters: {'estimator__max_depth': 10, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 5, 'estimator__n_estimators': 50}

Performance evaluation:
* Accuracy: 0.42
* Precision:  [0.67, 0.77, 0.84, 0.59]
* Recall:  [0.5, 0.85, 0.76, 0.45]
* F1-Score:  [0.57, 0.81, 0.8, 0.51]

Confusion matrix for Label 1:
[[30  2]
 [ 4  4]]


Confusion matrix for Label 2:
[[15  5]
 [ 3 17]]


Confusion matrix for Label 3:
[[16  3]
 [ 5 16]]


Confusion matrix for Label 4:
[[11  7]
 [12 10]]

