In [2]:
from sklearn import feature_extraction
from sklearn.model_selection import cross_validate, GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import SelectKBest, chi2

import time

In [4]:
# Class labels. Used below as keys into the classification_report output,
# to build the per-class metric names, and as confusion-matrix display labels.
CLASSES = ['3.0', '4.0', '5.0']

# Data Visualisation

In [4]:
def scatter_vs_rating(df, feature):
    """
    Draws a scatter plot of the `feature` column of `df` (x-axis) against
    the `CLASS_LABEL` column of `df` (y-axis).
    The plot is only drawn, not shown: this function does not call `plt.show()`.
    """
    x_values = df[feature]
    y_values = df[CLASS_LABEL]
    plt.scatter(x_values, y_values)
    plt.xlabel(feature)
    plt.ylabel('rating')

In [5]:
def hist_plot(df, feature, bins=30):
    """
    Plots a histogram of the `feature` column of `df` and shows it.
    - bins: forwarded to `plt.hist` as its `bins` argument.
    """
    values = df[feature]
    plt.hist(values, bins=bins)
    plt.xlabel(feature)
    plt.ylabel('frequency')
    plt.show()

# Evaluation

In [6]:
# scoring for cross validation
def report_scoring(clf, X, y, include_avg=False):
    """
    Scores the fitted classifier `clf` on `(X, y)` and returns a flat dictionary of metrics.
    The dictionary holds '<label>_precision', '<label>_recall' and '<label>_f1-score'
    for every label in `CLASSES`, plus 'accuracy'.
    If `include_avg` is true, it also holds the 'macro_avg_*' and 'weighted_avg_*'
    precision, recall and f1-score.
    """
    metric_names = ('precision', 'recall', 'f1-score')
    # zero_division=0: metrics with a zero denominator are reported as 0
    full_report = classification_report(
        y, clf.predict(X), output_dict=True, zero_division=0
    )

    # Flatten the nested per-class report into '<label>_<metric>' entries
    scores = {
        f'{label}_{metric}': full_report[label][metric]
        for label in CLASSES
        for metric in metric_names
    }
    scores['accuracy'] = full_report['accuracy']

    if not include_avg:
        return scores

    for metric in metric_names:
        scores[f'macro_avg_{metric}'] = full_report['macro avg'][metric]
        scores[f'weighted_avg_{metric}'] = full_report['weighted avg'][metric]
    return scores

In [1]:
# Cross-validation used for hyperparameter tuning and model selection
def cross_val_report(clf, X, y, cv=10, print_full_results=True, print_confusion_matrix=True, predict=True):
    """
    Cross-validates the classifier on the given dataset.
    - cv: number of folds in a StratifiedKFold cross-validation.
    - print_full_results: whether to print the full (per-fold) cross-validation results.
    - print_confusion_matrix: whether to plot the confusion matrix of the
      cross-validated predictions.
    - predict: whether to return the cross-validated predictions. When false,
      an empty list is returned in their place.
    Returns a 2-tuple consisting of:
    1) a list for the cross-validated estimates for each input data point
       (empty when `predict` is false)
    2) a DataFrame for the aggregated cross-validation results (mean and
       sample standard deviation of every metric across folds).
    """

    start = time.time()
    # cross validation on TRAINING set
    result = cross_validate(clf, X, y, scoring=report_scoring, cv=cv)
    end = time.time()
    print(f"{get_model_name(clf)} prediction took {end - start} seconds.\n")

    # Convert evaluation results to a dataframe; cross_validate prefixes every
    # scorer key with 'test_', which is stripped again for the column names.
    metrics = ['test_' + c + '_' + m for c in CLASSES for m in ['precision', 'recall', 'f1-score']]
    metrics += ['test_accuracy']
    column_names = [metric[len('test_'):] for metric in metrics]
    results_df = pd.DataFrame.from_dict(result)[metrics].set_axis(column_names, axis=1)

    # Aggregate the results
    agg_results = pd.concat(
        [results_df.mean(axis=0), results_df.std(axis=0, ddof=1)], axis=1
    ).set_axis(['mean', 'std'], axis=1)

    if print_full_results:
        print(results_df)

    # The confusion matrix needs predictions as well, so compute them whenever
    # either flag asks for them. Previously predict=False combined with
    # print_confusion_matrix=True passed an empty list to confusion_matrix.
    cv_predictions = []
    if predict or print_confusion_matrix:
        cv_predictions = cross_val_predict(clf, X, y, cv=cv)

    if print_confusion_matrix:
        cm = confusion_matrix(y, cv_predictions)
        # NOTE(review): assumes the labels found in y / the predictions line up
        # one-to-one with CLASSES, in the same order -- confirm against the data.
        display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=CLASSES)
        display.plot()
        plt.show()

    y_pred = cv_predictions if predict else []
    return y_pred, agg_results

# Others

In [7]:
def get_model_name(model):
    """
    Returns the name of the model.

    The name is the first identifier that is immediately followed by "(" in
    `str(model)`, e.g. "SVC" for "SVC(C=1.0)". If the string form contains no
    such pattern, the class name of `model` is returned instead of raising
    an IndexError.
    """
    match = re.search(r'(\w+)\(', str(model))
    if match is None:
        # No "Name(" in the string form: fall back to the type's own name
        return type(model).__name__
    return match.group(1)

In [9]:
def tune_hyperparameter(clf, param_grid, X, y, scoring=report_scoring, cv=20, refit = False): 
    """
    Runs a grid search over `param_grid` for the classifier `clf` on `(X, y)`.
    - scoring, cv, refit: forwarded unchanged to `GridSearchCV`.
    The search runs with verbose=2 and error_score="raise".
    Returns a DataFrame with one row per parameter combination, restricted to
    the 'params' column and the columns starting with 'mean_test', 'std_test'
    or 'rank_test'.
    """
    searcher = GridSearchCV(
        clf,
        param_grid,
        scoring=scoring,
        cv=cv,
        verbose=2,
        refit=refit,
        error_score="raise",
    )
    searcher.fit(X, y)

    cv_results = pd.DataFrame.from_dict(searcher.cv_results_)

    # Keep only the parameter settings and the aggregated test scores
    wanted = re.compile('^((mean|std|rank)_test|params).*')
    kept_columns = [column for column in cv_results.columns if wanted.match(column)]
    return cv_results[kept_columns]

In [None]:
def chi2_select_features(X, y, alpha=0.01):
    """
    Returns the selected features for the dataset using a chi-squared statistical test.
    - alpha: significance level; a feature is selected when its p-value is
      strictly below `alpha`.
    Returns a list of feature names when the fitted selector exposes
    `feature_names_in_`; otherwise a list of positional column indices.
    """
    # k='all' keeps every feature: the selector is only used to get the p-values
    selector = SelectKBest(chi2, k='all')
    selector.fit(X, y)
    p_values = selector.pvalues_

    # The original read feature_names_in_ unconditionally and raised
    # AttributeError when the fitted selector did not have it.
    feature_names = getattr(selector, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = range(len(p_values))

    # statistical test at ALPHA significance level; NaN p-values compare as
    # False and are therefore never selected
    return [name for name, p_value in zip(feature_names, p_values) if p_value < alpha]