In [688]:
import pandas as pd
import yaml
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from dython import nominal


from sklearn.model_selection import train_test_split
# linear-regression
from sklearn.linear_model import LinearRegression

# decision dependencies
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import plot_tree

# random forest
from sklearn.ensemble import RandomForestRegressor

# Kmeans clustering
from sklearn.cluster import KMeans

# Kmodes
from kmodes.kprototypes import KPrototypes



from typing import List, Dict, Union, Any
sb.set()

# Exploratory Data Analysis Functions

## load_dataset()
- input: path to dataset folder
- returns: dataframe with datatypes formatted 
- description: Function will do the following tasks:
    - load the dataset (in this case Course_info.csv)
    - load the YAML file used to cast the features to the appropriate datatypes (in this case data_types.yaml)
- remarks: None

In [None]:
def load_dataset(folder_path: str) -> pd.DataFrame:
    """Load ``Course_info.csv`` and cast its columns to the configured dtypes.

    Args:
        folder_path: Directory that contains ``Course_info.csv`` and
            ``data_types.yaml``.

    Returns:
        The dataset, with every feature listed under the ``data_types`` key
        of the YAML file cast to its configured dtype.
    """
    dataset: pd.DataFrame = pd.read_csv(f"{folder_path}/Course_info.csv")

    # Open the YAML file in a context manager so the handle is always closed,
    # even if parsing fails.
    with open(f"{folder_path}/data_types.yaml") as config_file:
        config: Dict[str, Dict[str, str]] = yaml.safe_load(config_file)

    # assign the data types preset in the data_types.yaml
    for feature, datatype in config['data_types'].items():
        dataset[feature] = dataset[feature].astype(datatype)

    # for loop used to debug to ensure features are in the right data type
    # for column in dataset.columns:
    #     print(f"{column}: {dataset[column].dtype}")

    return dataset

## load_parameters()
- input: path of yaml file
- returns: dictionary of parameters loaded from parameters.yaml
- description: will load the following parameters (in dictionary format):
    - include_features
    - Clean up
    - Types of EDA
- remarks: the parameter file (parameters.yaml) should contain the entries listed above

In [None]:
def load_parameters(folder_path: str) -> Dict[str, Union[List, Dict[str, str]]]:
    """Load the analysis parameters from ``parameters.yaml``.

    Args:
        folder_path: Directory that contains ``parameters.yaml``.

    Returns:
        The parsed content of the YAML file.
    """
    # Open the file in a context manager so the handle is always closed.
    with open(f"{folder_path}/parameters.yaml") as parameters_file:
        parameters: Dict[str, Union[List, Dict[str, str]]] = yaml.safe_load(parameters_file)

    return parameters

## clean_up_dataset()
- input: 
    - Dataframe
    - parameters 
- returns: Dataframe
- description: Function does the following to the dataset
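    - remove the rows whose value is not greater than the per-feature constraint set under `clean_up` in the parameters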
    - drop row that contains NULL/NAN values
    - extract the interested features
- remarks: parameter input is dictionary data that loads from parameters.yaml file which contains all the configuration required for the clean up

In [689]:
def clean_up_dataset(
        dataframe: pd.DataFrame, 
        parameters: Dict[str, Union[List, Dict[str, str]]]
    ) -> pd.DataFrame:
    """Filter, de-null and project the dataset according to ``parameters``.

    Args:
        dataframe: Dataset to clean.
        parameters: Configuration dictionary. ``parameters['clean_up']`` maps
            a feature name to a threshold; only rows whose value is strictly
            greater than the threshold are kept.
            ``parameters['include_features']`` lists the columns to return.

    Returns:
        A new dataframe holding only the requested columns, without rows
        containing NULL/NaN values, indexed from 0.
    """
    # Threshold filtering: each constraint narrows the surviving rows further.
    filtered = dataframe
    for column_name, minimum in parameters['clean_up'].items():
        above_minimum = filtered[column_name] > minimum
        filtered = filtered[above_minimum]

    # Dropping missing values leaves the rows untouched when nothing is
    # missing, so no prior null check is needed.
    filtered = filtered.dropna()

    # Keep only the features of interest and renumber the rows.
    wanted_columns = parameters['include_features']
    selection = pd.DataFrame(filtered[wanted_columns])
    return selection.reset_index(drop=True)

## get_outlier_samples()
- inputs: Dataframe
- returns: boolean mask (same shape as the supplied data) that is True where a value is an outlier
- description: identify the outliers based on the supplied dataframe
- remarks: None

In [691]:
def get_outlier_samples(
        dataframe: Union[pd.Series, pd.DataFrame]
    ) -> Union[pd.Series, pd.DataFrame]:
    """Flag the values lying outside the 1.5 * IQR whiskers.

    The function is used both with a single column (a Series) and with a
    whole numerical dataframe, so the annotations cover both cases.

    Args:
        dataframe: Numerical Series or DataFrame to inspect. For a DataFrame
            the quartiles are computed per column.

    Returns:
        A boolean mask of the same type and shape as the input, True where a
        value is below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    q1 = dataframe.quantile(0.25)
    q3 = dataframe.quantile(0.75)
    interquartile_range = q3 - q1

    lower_whisker = q1 - 1.5 * interquartile_range
    upper_whisker = q3 + 1.5 * interquartile_range

    return (dataframe < lower_whisker) | (dataframe > upper_whisker)

## print_feature_outliers()
- inputs: Dataframe
- returns: None
- description: prints number of outliers for every numerical features/column
- remarks: None

In [692]:
def print_feature_outliers(dataframe: pd.DataFrame) -> None:
    """Print the number of outliers found in each int64/float64 column.

    Args:
        dataframe: Dataset to inspect; columns of any other dtype are ignored.
    """
    numeric_only = dataframe.select_dtypes(include=['int64', 'float64'])

    for name in numeric_only.columns:
        # The outlier mask is boolean, so summing it counts the True entries.
        flagged = get_outlier_samples(numeric_only[name])
        print(f"[{name}] total outliers: {sum(flagged)}")

## remove_outliers()
- inputs: Dataframe
- returns: Dataframe with outliers removed for every numerical feature/column 
- description: Function remove **UNION** outlier of the dataset. In other words remove the entire row containing outliers
- remarks: None

In [693]:
def remove_outliers(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Drop every row that holds an outlier in any int64/float64 column.

    Args:
        dataframe: Dataset to filter.

    Returns:
        A copy of ``dataframe`` without the rows flagged as an outlier in at
        least one numerical column (the union of outliers), indexed from 0.
    """
    numeric_only = dataframe.select_dtypes(include=['int64', 'float64'])

    # A row is rejected as soon as one of its numerical values is an outlier.
    outlier_mask = get_outlier_samples(numeric_only)
    row_has_outlier = outlier_mask.any(axis=1)

    kept_rows: pd.DataFrame = dataframe[~row_has_outlier]
    return kept_rows.reset_index(drop=True)

## generate_numerical_eda_visualization()
- inputs: 
    - dataframe
    - List of categorical features/columns which are numerical types
    - plot title
- returns: None
- description: generates box, histo and violin plot for every numerical features (column) of the dataset
- remarks: None

In [None]:
def generate_numerical_eda_visualization(
        dataframe: pd.DataFrame,
        plot_title: str
    ) -> None:
    """Draw a box, histogram and violin plot for every numerical column.

    One row of three plots is produced per int64/float64 column. Nothing is
    drawn when the dataframe has no such column.

    Args:
        dataframe: Dataset to visualise.
        plot_title: Title shown above the middle plot of the first row.
    """
    numerical_dataframe = dataframe.select_dtypes(include=['int64', 'float64'])
    total_features = len(numerical_dataframe.columns)

    # plt.subplots rejects a grid with zero rows, so there is nothing to draw.
    if total_features == 0:
        return

    # squeeze=False keeps `axes` two-dimensional even for a single feature;
    # otherwise the [row, column] indexing below fails with one row.
    figure, axes = plt.subplots(
        total_features,
        3,
        figsize=(24, 4.8 * total_features),
        squeeze=False
    )

    # figure.suptitle(plot_title, fontsize=20)
    axes[0, 1].set_title(plot_title, fontsize=25)

    for row, column in enumerate(numerical_dataframe.columns):
        sb.boxplot(data=numerical_dataframe[column], orient='h', ax=axes[row, 0])
        sb.histplot(data=numerical_dataframe[column], ax=axes[row, 1])
        sb.violinplot(data=numerical_dataframe[column], orient='h', ax=axes[row, 2])

## generate_categorical_count_visualization()
- input: 
    - Dataframe
    - parameters 
- returns: None
- description: generates a count plot for each selected categorical feature (column) of the dataset
- remarks: the parameters input is the dictionary loaded from the parameters.yaml file, which contains all the configuration required for the categorical count visualization

In [694]:
def generate_categorical_count_visualization(
        dataframe: pd.DataFrame,
        parameters: Dict[str, Union[List, Dict[str, str]]]
    ) -> None:
    """Draw one count plot per configured categorical feature.

    Args:
        dataframe: Dataset to visualise.
        parameters: Configuration dictionary;
            ``parameters['categorical_count_visualization']`` lists the
            columns to plot.
    """
    selected_columns: List[str] = parameters['categorical_count_visualization']
    plot_data = pd.DataFrame(dataframe[selected_columns])

    for name in plot_data.columns:
        # The plot height follows the number of entries reported by
        # value_counts, so every bar gets room.
        level_count = len(dataframe[name].value_counts())
        sb.catplot(
            y=name,
            data=plot_data,
            kind="count",
            height=level_count
        )
        plt.title(f"Total count for all types available in '{name}' feature", fontsize=20)

## generate_numerical_vs_categorical_eda_visualization()
- inputs: 
    - dataframe
    - parameters
    - plot title
- returns: None
- description: generates, for every numerical/categorical feature pair set in the parameters, a boxplot of the numerical feature grouped by the categorical feature
- remarks: parameter input is dictionary data that loads from parameters.yaml file which contains all the configuration required to generate eda visualization

In [695]:
def generate_numerical_vs_categorical_eda_visualization(
        dataframe: pd.DataFrame,
        parameters: Dict[str, Union[List, Dict[str, str]]],
        plot_title: str
    ) -> None:
    """Draw one boxplot per configured numerical/categorical feature pair.

    Each plot shows the numerical feature on the y axis, grouped by the
    categorical feature on the x axis, with the categories ordered by
    ascending median. Nothing is drawn when no pair is configured.

    Args:
        dataframe: Dataset to visualise.
        parameters: Configuration dictionary;
            ``parameters['numerical_vs_categorical_eda_visualization']`` maps
            a numerical feature name to a categorical feature name.
        plot_title: Accepted for interface compatibility; the body does not
            use it, every plot gets its own generated title instead.
    """
    visualization_parameters: Dict[str, str] = parameters['numerical_vs_categorical_eda_visualization']

    total_features = len(visualization_parameters)

    # plt.subplots rejects a grid with zero rows, so there is nothing to draw.
    if total_features == 0:
        return

    # squeeze=False keeps `axes` an array even for a single pair; without it
    # plt.subplots returns a bare Axes that cannot be indexed.
    # constrained_layout is not requested: the tight_layout call below
    # replaces that layout engine anyway and only triggered a warning.
    figure, axes = plt.subplots(
        total_features,
        1,
        figsize=(20, 10 * total_features),
        squeeze=False
    )
    figure.tight_layout(pad=10.0)

    pairs = visualization_parameters.items()
    for axis, (numerical_feature, categorical_feature) in zip(axes[:, 0], pairs):
        axis.set_title(
            f"{categorical_feature} boxplot based on {numerical_feature}",
            fontdict={'fontsize': 25, 'fontweight': 'medium'}
        )

        sb_plot = sb.boxplot(
            y=numerical_feature,
            x=categorical_feature,
            data=dataframe,
            # order the categories by ascending median of the numerical feature
            order=dataframe.groupby(categorical_feature)[numerical_feature].median().sort_values().index,
            ax=axis
        )
        sb_plot.set_xticklabels(sb_plot.get_xticklabels(), rotation=40, ha='right')

## generate_numerical_heatmap()
- inputs: 
    - dataframe
    - plot title
- returns: None
- description: generates heatmap for every numerical features (column) of the dataset
- remarks: None

In [696]:
def generate_numerical_heatmap(
        dataframe: pd.DataFrame,
        plot_title: str
    ) -> None:
    """Show a correlation heatmap of the int64/float64 columns.

    Args:
        dataframe: Dataset to visualise.
        plot_title: Title of the plot.
    """
    numeric_only = dataframe.select_dtypes(include=['int64', 'float64'])

    plt.figure(figsize=(13, 13))
    plt.title(plot_title, fontsize=20)

    # The colour scale is fixed to the full correlation range [-1, 1].
    heatmap_options = {
        "vmin": -1,
        "vmax": 1,
        "linewidths": 1,
        "annot": True,
        "fmt": ".2f",
        "annot_kws": {"size": 18},
        "cmap": "RdBu",
    }
    correlation_matrix = numeric_only.corr()
    sb.heatmap(correlation_matrix, **heatmap_options)
    plt.show()

## generate_numerical_categorical_heatmap()
- inputs: dataframe
- returns: None
- description: generates a heatmap for the numerical and categorical features (columns) of the dataset.
 The heatmap is produced by calculating the correlation/strength-of-association of the features in the dataset, with both categorical and continuous features, using: 
    - Pearson's R for continuous-continuous cases 
    - Correlation Ratio for categorical-continuous cases 
    - Cramer's V or Theil's U for categorical-categorical cases
- remarks: More info on the library checkout [dython](http://shakedzy.xyz/dython/modules/nominal/)

In [None]:
def generate_numerical_categorical_heatmap(
    dataframe: pd.DataFrame
) -> None:
    """Plot the association heatmap of all features with dython.

    Args:
        dataframe: Dataset holding numerical and/or categorical features.
    """
    association_options = {
        "figsize": (13, 10),
        "title": "Correlation/Strength-of-association of features",
    }
    nominal.associations(dataset=dataframe, **association_options)

    # An earlier, disabled variant created a (20, 10) figure with
    # plt.subplots, set the title on its axes and passed them through `ax=`.
    

## generate_categorical_based_on_numerical_feature_heatmap NEED TO RENAME TO SOMETHING ELSE
- inputs: 
    - dataframe
    - y_axis: numerical feature
    - x_axis: categorical feature
    - numerical_feature: the numerical feature to associate with different type of x and y axis
    - plot title
- returns: None
- description: prints the 20 (y_axis, x_axis) pairs with the highest mean of the numerical feature and generates a heatmap of that mean for every (y_axis, x_axis) pair of the dataset
- remarks: None

In [697]:
def generate_categorical_based_on_numerical_feature_heatmap(
       dataframe: pd.DataFrame,
       y_axis: str,
       x_axis: str,
       numerical_feature: str,
       plot_title: str
    ) -> None:
    """Plot the mean of a numerical feature for every (y_axis, x_axis) pair.

    The 20 pairs with the highest mean are printed first, then the means are
    drawn as a heatmap whose size follows the number of distinct values on
    each axis.

    Args:
        dataframe: Dataset to visualise.
        y_axis: Column whose values form the heatmap rows.
        x_axis: Column whose values form the heatmap columns.
        numerical_feature: Column averaged within every (y_axis, x_axis) group.
        plot_title: Title of the figure.
    """
    # Select the column before aggregating: averaging the whole frame first
    # is wasteful and raises on pandas >= 2 when a non-numeric column exists.
    total_means = dataframe.groupby([y_axis, x_axis])[numerical_feature].mean()

    # Display the TOP numerical feature in each pair type -- top 20
    print(total_means.reset_index().sort_values(numerical_feature, ascending=False).head(20).round(2))

    x_axis_total_types = len(dataframe[x_axis].value_counts())
    y_axis_total_types = len(dataframe[y_axis].value_counts())
    figure = plt.figure(figsize=(x_axis_total_types, y_axis_total_types))
    figure.suptitle(plot_title)
    figure.tight_layout()
    sb_plot = sb.heatmap(total_means.unstack(), linewidths = 1,
           annot = True, fmt = ".0f", annot_kws = {"size": 18}, cmap = "BuGn")
    sb_plot.set_xticklabels(sb_plot.get_xticklabels(), rotation=90, ha='right')

## generate_decision_tree
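- inputs: 
    - dataset: dataframe
    - parameters: dictionary loaded from the parameters.yaml file; its `multivariate_decision_tree` entry supplies the `predictors` and `response` features
- returns: None
- description: extracts the predictor and response features, aggregates them per instructor (mean values and number of courses), bins the number of courses into size classes, clusters the extracted features with K-Prototypes and writes the intermediate datasets to CSV files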
- remarks: None

In [None]:
def _aggregate_per_instructor(extracted_dataset: pd.DataFrame) -> pd.DataFrame:
    """Return per-instructor feature means plus a 'Number of courses' column."""
    # numeric_only=True keeps the aggregation working on pandas >= 2, where
    # averaging a non-numeric column raises instead of dropping the column.
    per_instructor = extracted_dataset.groupby(['instructor_name'], as_index=False).mean(numeric_only=True)
    course_counts = extracted_dataset.groupby(['instructor_name']).size().reset_index(name='Number of courses')
    per_instructor['Number of courses'] = course_counts['Number of courses']
    return per_instructor


def _project_to_2d(numerical_values: np.ndarray) -> np.ndarray:
    """Project rows onto their first two principal components (PCA via SVD)."""
    values = np.asarray(numerical_values, dtype=float)
    projection = np.zeros((values.shape[0], 2))
    if values.shape[0] == 0 or values.shape[1] == 0:
        return projection

    # Standardise so that no feature dominates because of its scale; constant
    # columns keep a divisor of 1 to avoid dividing by zero.
    centered = values - values.mean(axis=0)
    spread = centered.std(axis=0)
    spread[spread == 0] = 1.0
    standardized = centered / spread

    _, _, components = np.linalg.svd(standardized, full_matrices=False)
    scores = standardized @ components[:2].T
    # With a single numerical column only one component exists; the second
    # coordinate then stays at zero.
    projection[:, :scores.shape[1]] = scores
    return projection


def generate_decision_tree(
        dataset: pd.DataFrame, 
        parameters: Dict[str, Union[List[str], Dict[str, Any]]]
    ) -> None:
    """Aggregate courses per instructor and cluster them with K-Prototypes.

    Despite its name the function does not fit a decision tree and returns
    nothing: the linear-regression and decision-tree experiments were
    disabled in the original code (see the notes in the body).

    Side effects: prints dataframe summaries and clustering output, writes
    ``extracted_dataset.csv``, ``add_class_dataset.csv`` and ``k-proto.csv``
    to the working directory, and draws a scatter plot of the clusters.

    Args:
        dataset: Dataset to analyse. The extracted features must include an
            ``instructor_name`` column.
        parameters: Configuration dictionary;
            ``parameters['multivariate_decision_tree']`` provides the
            ``predictors`` list and the ``response`` feature name.
    """
    dataset.info()

    decision_tree_parameters: Dict[str, Any] = parameters['multivariate_decision_tree']
    # Build a new list rather than appending to the configured one, so the
    # caller's parameters are never modified (not even when a later step fails).
    include_features: List[str] = [
        *decision_tree_parameters['predictors'],
        decision_tree_parameters['response'],
    ]
    extracted_dataset = dataset[include_features].copy()
    extracted_dataset.to_csv('extracted_dataset.csv')
    extracted_dataset.info()

    new_dataset = _aggregate_per_instructor(extracted_dataset)
    new_dataset.info()

    # create classes based on the range of the number of courses
    labels = ['small', 'medium', 'large', 'x-large', 'super']
    bins = [0, 10, 50, 100, 150, np.inf]
    new_dataset['Class'] = pd.cut(new_dataset['Number of courses'], bins=bins, labels=labels)
    new_dataset.to_csv('add_class_dataset.csv')

    # NOTE: a linear regression on 'Number of courses' and a decision tree on
    # 'Class' were tried here and disabled. According to the original notes
    # the decision tree fails because the instructor name has too many
    # distinct values, and plot_tree raised "IndexError: list index out of
    # range".

    ## KMODES
    # NOTE(review): hard-coded column position; presumably the response
    # (instructor_name) placed after five predictors -- confirm against
    # parameters.yaml.
    categorical_features_idx = [5]
    mark_array = extracted_dataset.values
    print(f"{mark_array}")
    kproto = KPrototypes(n_clusters=10, n_init=5, verbose=2).fit(mark_array, categorical=categorical_features_idx)
    print(kproto.cluster_centroids_)

    clusters = kproto.predict(mark_array, categorical=categorical_features_idx)
    extracted_dataset['Cluster'] = list(clusters)
    print(extracted_dataset.groupby('Cluster').size())
    extracted_dataset.to_csv('k-proto.csv')

    # The original plotted an undefined `embedding` (NameError). Use a 2-D PCA
    # projection of the numerical columns so the clusters can be displayed.
    numerical_values = np.delete(mark_array, categorical_features_idx, axis=1)
    embedding = _project_to_2d(numerical_values)

    fig, ax = plt.subplots()
    fig.set_size_inches((20, 10))
    scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=clusters, cmap='tab20b', alpha=1.0)

    # produce a legend with the unique colors from the scatter
    legend1 = ax.legend(*scatter.legend_elements(num=15),
                        loc="lower left", title="Classes")
    ax.add_artist(legend1)