In [None]:
# read the hotel reservation file
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Load the reservations table; the path is relative, so the CSV is looked up
# in the kernel's current working directory.
RESERVATIONS_CSV = 'Hotel Reservations.csv'
hotel = pd.read_csv(RESERVATIONS_CSV)


In [None]:
# Check the columns and data types of the loaded frame via DataFrame.info(),
# which prints its summary directly instead of returning a value.
hotel.info() 

In [None]:
# Central tendency + mode frequency for categorical and numerical variables
def central_tendency(df, col, categorical=False):
    """Print central-tendency measures for one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    col : label
        Name of the column to summarise.
    categorical : bool, default False
        When True, mean and median are skipped and only the mode and its
        relative frequency are printed.

    Returns
    -------
    None
        All results are written to stdout.
    """
    series = df[col]
    if not categorical:
        print('Mean: ', series.mean())
        print('Median: ', series.median())

    modes = series.mode()
    # mode() comes back empty for an empty or all-null column; indexing its
    # first element would raise, so report the situation instead of crashing.
    if modes.empty:
        print('Mode: ', [])
        print('Frequency:  n/a (no non-null values)')
        return

    # A column can have several modes; show all of them as a plain list.
    print('Mode: ', modes.tolist())

    # .loc makes the lookup explicitly label-based (the label is the mode
    # value itself), whatever the dtype of the column is.
    top_count = series.value_counts().loc[modes.iloc[0]]
    # Share of the first mode relative to ALL rows, nulls included.
    print('Frequency: ', top_count / len(series) * 100, '%')


# Summarise every column except the first one (the booking id); object-dtype
# columns are treated as categorical.
for col in hotel.columns[1:]:
    is_categorical = hotel[col].dtype == 'object'
    if is_categorical:
        central_tendency(hotel, col, categorical=True)
    else:
        central_tendency(hotel, col)
    print("---------------------")

In [None]:
def categorical_to_numeric(column):
    """Encode a column as integer category codes.

    Parameters
    ----------
    column : pandas.Series
        Column to encode; it is converted to the ``category`` dtype first.

    Returns
    -------
    pandas.Series
        Integer codes aligned with the input index and carrying the input's
        name. Missing values are encoded as -1.
    """
    codes = column.astype('category').cat.codes
    # The codes Series does not reliably carry the source name; re-attach it so
    # callers that read `.name` (titles, printed headers) do not get None.
    codes.name = column.name
    return codes

# Dispersion of the numeric variables
def dispersion(df, col):
    """Print spread and shape statistics for one Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by the function; kept so existing calls stay valid.
    col : pandas.Series
        Numeric data to describe; its ``name`` is used as the header.

    Returns
    -------
    None
        All results are written to stdout.
    """
    print("Dispersion for column: ", col.name)
    print('Range: ', col.max() - col.min())

    # Reading guide for the two shape measures:
    #   skewness == 0 -> symmetric, > 0 -> longer right tail, < 0 -> longer left tail
    #   kurt() is excess kurtosis (0 for a normal distribution):
    #   > 0 -> heavier tails / more outliers, < 0 -> lighter tails / fewer outliers
    measures = (
        ('Variance: ', col.var),
        ('Standard Deviation: ', col.std),
        ('Skewness: ', col.skew),
        ('Kurtosis: ', col.kurt),
    )
    for label, compute in measures:
        print(label, compute())
    

# Dispersion for every column except the first one (the booking id);
# object-dtype columns are replaced by their integer category codes first.
for col in hotel.columns[1:]:
    series = hotel[col]
    if series.dtype == 'object':
        series = categorical_to_numeric(series)
    dispersion(hotel, series)
    print("---------------------")

    

In [None]:
# Charts (box plot)
def box_plot(col):
    """Draw a box plot of ``col``, titled with the Series name, and show it."""
    _, axes = plt.subplots()
    axes.boxplot(col)
    axes.set_title(col.name)
    plt.show()

# One box plot per column, skipping the first one (the booking id);
# object-dtype columns are plotted through their integer category codes.
for col in hotel.columns[1:]:
    series = hotel[col]
    box_plot(categorical_to_numeric(series) if series.dtype == 'object' else series)

In [None]:
# Histogram
def histogram(col):
    """Draw a histogram of ``col``, titled with the Series name, and show it."""
    _, axes = plt.subplots()
    axes.hist(col)
    axes.set_title(col.name)
    plt.show()

# Histograms for the non-object columns only; the first column (booking id)
# is skipped.
for col in hotel.columns[1:]:
    if hotel[col].dtype == 'object':
        continue
    histogram(hotel[col])

In [None]:
# Density plot
def density_plot(col):
    """Draw a kernel-density plot of ``col``, titled with the Series name, and show it."""
    axes = col.plot.density()
    axes.set_title(col.name)
    plt.show()

# Density plots for the non-object columns only; the first column
# (booking id) is skipped.
for col in hotel.columns[1:]:
    series = hotel[col]
    if series.dtype != 'object':
        density_plot(series)

In [None]:
# Bivariate analysis

In [None]:
# Correlation computation (bivariate)
def corelation(col1, col2, name1, name2):
    """Print the correlation between two Series.

    ``col1.corr(col2)`` is called with its default settings; ``name1`` and
    ``name2`` are only used as labels in the printed line.
    """
    coefficient = col1.corr(col2)
    parts = ('Correlation between ', name1, ' and ', name2, ' is: ', coefficient)
    print(*parts)

# Correlation for every unordered pair of columns, skipping the first column
# (booking id). Object-dtype columns are compared through their category codes.
labels = list(hotel.columns[1:])
for position, first_name in enumerate(labels[:-1]):
    for second_name in labels[position + 1:]:
        first_column = hotel[first_name]
        second_column = hotel[second_name]

        if first_column.dtype == 'object':
            first_column = categorical_to_numeric(first_column)
        if second_column.dtype == 'object':
            second_column = categorical_to_numeric(second_column)

        corelation(first_column, second_column, first_name, second_name)



OBS: corelație pozitivă (mare) între:

- market_segment și avg_price_per_room
- request_quest și no_of_previous_bookings

In [None]:
# TODO: independence test (not implemented yet)

In [None]:
# Tests that compare several populations (test of means)
# Kruskal-Wallis test
def kruskal_wallis_test(col1, col2, name1, name2):
    """Print the p-value of the Kruskal-Wallis H-test for two samples.

    Parameters
    ----------
    col1, col2 : array-like
        The two samples to compare.
    name1, name2 : str
        Labels used in the printed line.

    Returns
    -------
    None
        The p-value is written to stdout.
    """
    # stats.kruskal is the rank-based Kruskal-Wallis test; the previous call to
    # stats.f_oneway ran a one-way ANOVA while labelling it Kruskal-Wallis.
    p_value = stats.kruskal(col1, col2).pvalue
    print('Kruskal-Wallis test between ', name1, ' and ', name2, ' is: ', p_value)


def t_test(col1, col2, name1, name2):
    """Print the p-value of ``stats.ttest_ind`` (default settings) for two samples.

    ``name1`` and ``name2`` are only used as labels in the printed line.
    """
    outcome = stats.ttest_ind(col1, col2)
    parts = ('T-test between ', name1, ' and ', name2, ' is: ', outcome.pvalue)
    print(*parts)


# T-test for every unordered pair of columns, skipping the first column
# (booking id). Object-dtype columns are compared through their category codes.
# Only t_test is run here; kruskal_wallis_test is defined but not called.
column_labels = list(hotel.columns[1:])
for offset, first_name in enumerate(column_labels[:-1]):
    for second_name in column_labels[offset + 1:]:
        first_column, second_column = hotel[first_name], hotel[second_name]

        if first_column.dtype == 'object':
            first_column = categorical_to_numeric(first_column)
        if second_column.dtype == 'object':
            second_column = categorical_to_numeric(second_column)

        t_test(first_column, second_column, first_name, second_name)

In [None]:
# Scatter plot over combinations of 2 variables (numeric attributes)
def scatter_plot(col1, col2, name1, name2):
    """Show a scatter plot of ``col1`` (x) against ``col2`` (y).

    ``name1`` and ``name2`` label the x and y axes and form the title.
    """
    _, axes = plt.subplots()
    axes.scatter(col1, col2)
    axes.set(title=name1 + ' vs ' + name2, xlabel=name1, ylabel=name2)
    plt.show()

    
# One scatter plot per unordered pair of columns, skipping the first column
# (booking id). Object-dtype columns are plotted through their category codes.
plot_labels = list(hotel.columns[1:])
for start, first_name in enumerate(plot_labels[:-1], start=1):
    for second_name in plot_labels[start:]:
        first_column = hotel[first_name]
        if first_column.dtype == 'object':
            first_column = categorical_to_numeric(first_column)

        second_column = hotel[second_name]
        if second_column.dtype == 'object':
            second_column = categorical_to_numeric(second_column)

        scatter_plot(first_column, second_column, first_name, second_name)

In [None]:
# 3D chart (numeric attributes)
def scatter_3d_plot(col1, col2, col3, name1, name2, name3):
    """Show a 3D scatter plot of three columns.

    ``col1``, ``col2`` and ``col3`` go on the x, y and z axes; the three names
    are used as the matching axis labels.
    """
    axes = plt.figure().add_subplot(111, projection='3d')
    axes.scatter(col1, col2, col3)
    axes.set(xlabel=name1, ylabel=name2, zlabel=name3)
    plt.show()

# Walk every unordered triple of columns, skipping the first column (booking id).
# The plot call stays disabled, as in the original cell: enabling it opens one
# figure per triple. Each column is fetched and encoded once per loop level
# instead of being copied and re-encoded inside the innermost loop.
n_columns = len(hotel.columns)
for i in range(1, n_columns - 1):
    first_name = hotel.columns[i]
    first_column = hotel[first_name]
    if first_column.dtype == 'object':
        first_column = categorical_to_numeric(first_column)

    for j in range(i + 1, n_columns):
        second_name = hotel.columns[j]
        second_column = hotel[second_name]
        if second_column.dtype == 'object':
            second_column = categorical_to_numeric(second_column)

        for k in range(j + 1, n_columns):
            third_name = hotel.columns[k]
            third_column = hotel[third_name]
            if third_column.dtype == 'object':
                third_column = categorical_to_numeric(third_column)

            # scatter_3d_plot(first_column, second_column, third_column, first_name, second_name, third_name)