In [11]:
#Cell_1_1:imports


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

import matplotlib.pyplot as plt







##### **PART1**
Update Dataset and save new file

In [None]:
#Cell_1_2:unit test-part1


def df_equal(df1, df2):
    """
    :param df1: pandas dataframe object
    :param df2: pandas dataframe object
    :return: True if all elements equel, False otherwise(bool)
    """
    return (df1 == df2).to_numpy().all()


def check_for_null_and_nan(df):
    """
    :param df: pandas dataframe object
    :return: if there any None of NaN values
    """
    return not (df.isnull().to_numpy().any() or
                df.isna().to_numpy().any())


def test_updated_df(old, new):
    """
    Assert if for old data unchanged and for no null/nan values
    :param old:pandas DataFrame object before change 
    :param new:pandas DataFrame object after change 
    :return: None
    """
    assert len(old) == len(new)
    assert df_equal(new[old.columns], old)
    assert check_for_null_and_nan(new)


def test_new_hotels_file(path, hotels_data):
    """
    Asserts no data loss on file save
    :param path: file relative path (string)
    :param hotels_data: 
    :return: None
    """
    assert (pd.read_csv(path) == hotels_data).to_numpy().all()


def test_update_function_values(update_func):
    """
    Asserts correctness of the update function on dummy data
    :param update_func: function for data update
    :return: None
    """
    dummy_data = {
        "Snapshot Date": ["7/20/2015 0:00"],
        "Checkin Date": ["7/21/2015 0:00"],
        "Original Price": [1000],
        "Discount Price": [800]
    }

    result = update_func(pd.DataFrame(dummy_data))

    assert (result["DayDiff"] == 1).all()
    assert (result["WeekDay"] == 3).all()
    assert (result["DiscountDiff"] == 200).all()
    assert (result["DiscountPerc"] == 20.0000).all()


def run_test1(path, update_func,updated_hotels):
    """
    Run unit test on data
    :param path: relative path to csv file
    :param update_func:function for data update
    :return: 
    """

    test_update_function_values(update_func)

    

    test_updated_df(hotels, updated_hotels)

    test_new_hotels_file(path, updated_hotels)





In [None]:
#Cell_1_3:update and save anew file

def update_hotel_data(df):
    #day diff saved as int
    df["DayDiff"] = (pd.to_datetime(df["Checkin Date"]) - pd.to_datetime(df["Snapshot Date"])).dt.days

    #save the weekday as int(1-7)
    df["WeekDay"] = (pd.to_datetime(df["Checkin Date"]).dt.dayofweek + 2) % 7

    #save the discaount diff 
    df["DiscountDiff"] = df["Original Price"] - df["Discount Price"]

    #save the discount percentage
    df["DiscountPerc"] = np.around(100 * df["DiscountDiff"] / df["Original Price"], 4)

    return df


#update and save
hotels = pd.read_csv("hotels_data.csv")
updated_hotels = update_hotel_data(hotels)
updated_hotels.to_csv("./hotels_data_changed.csv", index=False)


run_test1("./hotels_data_changed.csv",update_hotel_data,updated_hotels)




##### **PART 2**
Naive Bayes and Descision Tree

In [None]:
#Cell_2_1:unit test for data preparation

#test helper function
def test_data_preparation(prepare_train):
    """
    The functiom assert correct data transformation on dummy daya
    :param1:data_trandform a function
    :return:None
    """
    dummy = pd.DataFrame({
        "Snapshot Date": ["7/20/2015 0:00", "8/20/2015 0:00", "8/20/2015 0:00"],
        "Checkin Date": ["7/21/2015 0:00", "8/21/2015 0:00", "8/21/2015 0:00"],
        "Hotel Name": ["Hotel1", "Hotel2", "Hotel2"],
        "DayDiff": [1, 1, 1],
        "WeekDay": [3, 4, 4],
        "DiscountPerc": [10, 20, 5],

    })

    transformd = pd.DataFrame({
        "Snapshot Date": [0, 31],
        "Checkin Date": [0, 31],
        "Hotel Name": [0, 1],
        "DayDiff": [1, 1],
        "WeekDay": [3, 4],
        "DiscountPerc": [10, 20]

    })

    #use function from cell2

    assert df_equal(prepare_train(dummy), transformd)


In [None]:
#Cell_2_2:data preparation fro training

def get_max_groupd_by(df, group_cols, max_col):
    """ 
    The functon group dataframe by given columns and get maximum  
    of a given additional column per group.
    :param df: pandas DataFrame object
    :param group_cols: column names (list of strings )
    :param max_col: column name (string)
    :return:Grouped pandas DataFrame object
    """
    return df.loc[df.groupby(group_cols)[max_col].idxmax()]


def transform_date(col):
    """ 
    Convert date value foro integer as days passed from first date.
    Used with df.apply().
    :param col: column name (str) representing string date values
    :return:days passed from minimal date (int).
    """
    new_col =pd.to_datetime(col)
    return (new_col - min(new_col)).dt.days


def col_to_labels(df, col_name):
    """
    Get integer labels from column
    :param df: pandas DataFrame object
    :param col_name:the name of the column (int)
    :return:list of integer labels (list of int)
    """
    le = preprocessing.LabelEncoder()
    col = df[col_name]
    le.fit(col)
    return le.transform(col)


#prepare data for training
def prepare_train_data(df):
    #get the rows with max discount percentage per group
    df = get_max_groupd_by(df, ["Snapshot Date", "Checkin Date", "DayDiff", "Hotel Name", "WeekDay"], "DiscountPerc")

    #convert names to integer labels
    df["Hotel Name"] = col_to_labels(df, "Hotel Name")

    #update date values to integers
    df[["Snapshot Date", "Checkin Date"]] = df[["Snapshot Date", "Checkin Date"]].apply(transform_date)

    return df


#test preparation function and save
test_data_preparation(prepare_train_data)



In [None]:
#Cell_2_3:define metrics and prepare data

def check_accuracy(clf, x_train, y_train, x_test, y_test):
    """
    Fit the training data into a classifier and print number of mislabeled 
    examples and accuracy rate. 
    :param clf: classifier
    :param x_train:training examples,array of shape (n_examples,n_features)
    :param y_train:training labels,array of shape (n_examples)
    :param x_test:test examples,array of shape (n_examples,n_features)
    :param y_test:test labels,array of shape (n_examples)
    :return:None (print accuracy)
    """
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(f"Accuracy for {clf} classifier:")
    print(f"Number of mislabeled points out of a total {x_test.shape[0]} points : {(y_test != y_pred).sum()}")
    print(f"Accuracy={round((1 - (y_test != y_pred).sum() / x_test.shape[0]) * 100, 2)}%")


def plot_multiclass_roc(clf, x_train, y_train, x_test, y_test, classes=(1, 2, 3, 4)):
    """
    plot a roc curve for every class
    :param clf: classifier
    :param x_train:training examples,array of shape (n_examples,n_features)
    :param y_train:training labels,array of shape (n_examples)
    :param x_test:test examples,array of shape (n_examples,n_features)
    :param y_test:test labels,array of shape (n_examples)
    :param classes:array
    :return:None (create a plot)
    """
    #init roc comp dict
    n_classes = len(classes)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    #predict probabilities using the classifier
    y_pred = clf.fit(x_train, y_train).predict_proba(x_test)
    # use binary labels (no multiclass support)
    y_test = label_binarize(y_test, classes=classes)

    for i in range(n_classes):
        fpr[i], tpr[i], _ = np.nan_to_num(roc_curve(y_test[:, i], y_pred[:, i]), 0)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    lw = 2

    # aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # average and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(
        fpr["micro"],
        tpr["micro"],
        label="micro-average ROC curve (area = {0:0.2f})".format(roc_auc["micro"]),
        color="deeppink",
        linestyle=":",
        linewidth=4,
    )

    plt.plot(
        fpr["macro"],
        tpr["macro"],
        label="macro-average ROC curve (area = {0:0.2f})".format(roc_auc["macro"]),
        color="navy",
        linestyle=":",
        linewidth=4,
    )

    colors = cycle(["aqua", "darkorange", "cornflowerblue"])
    for i, color in zip(range(n_classes), colors):
        plt.plot(
            fpr[i],
            tpr[i],
            color=color,
            lw=lw,
            label="ROC curve DiscountCode: {0} (area = {1:0.2f})".format(i + 1, roc_auc[i]),
        )

    plt.plot([0, 1], [0, 1], "k--", lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("FP Rate")
    plt.ylabel("TP Rate")
    plt.title(f"Multiclass ROC plot for {clf}")
    plt.legend(loc="lower right")
    plt.show()


hotels = pd.read_csv("./hotels_data_changed.csv")

#prepare data for learning
hotels = prepare_train_data(hotels)

#get 
X = hotels[["Snapshot Date", "Checkin Date", "DayDiff", "Hotel Name", "WeekDay"]].to_numpy()

y = hotels["Discount Code"].to_numpy()

#create training data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
#Cell_2_4:run and test GaussianNB
nbgc = GaussianNB()
check_accuracy(nbgc, x_train, y_train, x_test, y_test)
plot_multiclass_roc(nbgc, x_train, y_train, x_test, y_test)


In [None]:
#Cell_2_5:run and test CategoricalNB
nbcc = CategoricalNB()
check_accuracy(nbcc, x_train, y_train, x_test, y_test)
plot_multiclass_roc(nbcc, x_train, y_train, x_test, y_test)

In [None]:
#Cell_2_6:run and test Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state=0, criterion='gini')
check_accuracy(dtc, x_train, y_train, x_test, y_test)
plot_multiclass_roc(dtc, x_train, y_train, x_test, y_test)


#### conclusions-part 2

The data fitted into 3 classifiers: 2 types of Naive Bayes Classifier
and one Decision Tree classifier.

*The data was transformed allowing string(dates and name) variables to be fitted.
Few alternative transformation was considered (like day-month only date variable to capture periodicity)
 with no increase in performance.


**Naive Bayes classifier** perform poorly on the data set.
The **Gaussian NB classifier** perform only slightly better than a random selection with around 30% accuracy on the data set.
The ROC plot shows  AUC of 0.55, again very close to 1/2 of a random choice,the result shown here:

![Naive Bayes classifier](images/nbgc.png)


Naive Bayes failure may not be surprise as it assumes variable independence which may not accuse here (pricing policy may depend on the hotel,week-day derived from checkin date), also most of the variables don't correspond to gaussian(normal) distribution (Checkin Date do show some correspondence)
Distributions shown here:

![Hotel Name Dist](images/hotel_name_dist.png)![Snapshot Date Dist](images/snapshot_date_dist.png)![Checkin Date Dist](images/checkin_date_dist.png)![Weekday Dist](images/week_day_dist.png)![Day Diff Dist](images/day_diff_dist.png)


**Categorical NB classifier** perform slightly better around 43% accuracy and with AUC of around 0.7 as shown here:

![Categorical NB classifier](images/nbcc.png)

The categorical classifier fit slightly better as the categorical distribution (generalized Bernoulli) may fit some of the variables better.

**Decision Tree Classifier** perform better at 78% accuracy and AUC of 0.85 which may be suitable for some applications:

![Decision Tree Classifier](images/dtc.png)

Decision Tree perform better arguebly because it can learn variable dependancies in contrast to NB.



#### **PART 3**
Hierarchical Clustering

In [None]:
#Cell_3_1:create hotels -prices dataframe

def get_first_groupd_by(df, group_cols, first_col):
    """ 
    The functon group dataframe by given columns and get the 
    first entry  of a given additional column per group.
    :param df: pandas DataFrame object
    :param group_cols: column names (list of strings )
    :param first_col: column name (string)
    :return:
    """
    return df.loc[df.groupby(group_cols)[first_col].first()]


def get_most_frequent_values(df, col_name, n):
    """ 
    The function find the n most frequent values by column
    :param df: pandas DataFrame object
    :param col_name: The name of the column (string)
    :n: number of elements (int)
    :return:
    """
    return df[col_name].value_counts().index.to_numpy()[:n]


df = pd.read_csv("hotels_data.csv")

#find 150 most frequent hotels
#sorting ensures alphabetic order
hotel_names = np.sort(get_most_frequent_values(df, "Hotel Name", 150))

#update df by hotels and chekin dates
df = df.loc[df["Hotel Name"].isin(hotel_names)]

assert len(df["Hotel Name"].unique()) == 150

#find 40 most frequent checkin dates and update
checkin_dates = get_most_frequent_values(df, "Checkin Date", 40)
df = df.loc[df["Checkin Date"].isin(checkin_dates)]

assert (len(df["Checkin Date"].unique())) == 40

#get only the relevant columns
df = df[["Hotel Name", "Checkin Date", "Discount Code", "Discount Price"]]

#get max value of discount price for consistency and group by relevant values
#maximum value chosen as a consistency point, random value like first may work as well.
df = get_max_groupd_by(df, ["Hotel Name", "Checkin Date", "Discount Code"], "Discount Price")



#normalize for [0,100] range
df["Discount Price"] = 100 * df["Discount Price"] / df["Discount Price"].max()

#ensures hotel alphabetic order
df.sort_values(["Hotel Name"], ascending=True)

#create a dict for full range of possible values
hotels_dict = {"Hotel Name": hotel_names}
for i in range(1, 5):
    for date in checkin_dates:
        #mark every checkin date-key with discount code
        hotels_dict[f"{date}-{i}"] = []

    #fill the dictionary values iterating original df rows
for row in df.iterrows():
    name, date, code, price = row[1]
    hotels_dict[f"{date}-{code}"].append(price)

#creating hotels df with missing values as NaN
hotels = pd.DataFrame({key: pd.Series(value) for key, value in hotels_dict.items()})

#fill the missing values with -1
hotels = hotels.fillna(-1)



In [None]:
#Cell_3_2:run clustering

import scipy.cluster.hierarchy as sch
from matplotlib import pyplot as plt

X = hotels.drop(["Hotel Name"], axis=1)




# finding the optimal number of clusters using dendrogram
dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))

plt.title('Hotels Dendrogram')  # title of the dendrogram
plt.xlabel('Hotels')  # label of the x-axis
plt.ylabel('Euclidean distances')  # label of the y-axis
plt.show()  # show the dendrogram

#### conclusions-part 3

The dendrogram shown here:
![Decision Tree Classifier](images/dendogram.png)

Ward method(minimum variance) used.
From the dendrogram we can assert the number of clusters:
Using horizontal cut we can assert 2 bigger clusters (distant more then a half way from each other) and maybe 2-3 evident subclasters for each of the bigger.

It may correspond to number of start or other hotel clusters (luxury vs regular hotels).



