### **import libraries**

In [16]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix,log_loss
from sklearn.decomposition import IncrementalPCA
from sklearn.tree import DecisionTreeClassifier
from xgboost  import XGBClassifier
from sklearn.ensemble import StackingClassifier


# **Predefined methods**

for clean code

In [17]:
def plot_box(df,columns_names, size=15):
    """
    This method will be called to plot a box plot for each column to detect the outliers.
    df: dataframe
    columns_names: numric columns to plot (list)
    size: number of plot box figures (int)
    """
    plt.figure(figsize = [10,20])
    for i in range(1,size + 1):
        plt.subplot(5,3,i)
        sns.boxplot(data = df[columns_names].iloc[:,i-1], color= '#9546A2')
        plt.title(columns_names[i-1], fontsize = 20)
    plt.savefig('Boxpolt_aft_outliers.png')

In [18]:
'''def clean_outliers(df,columns_names,method=0):

  Method for clean outliers in given data

  arguments:
  df: dataframe that contain the data (dataframe)
  columns_names: numrical columns to clean (list)

  method: which method to use:
  0: drop outliers
  1: replace outliers with mean
  2: replace outliers with mode
  3: replace outliers with median
  4: replace outliers with min(for lower outliers ) max(for upper outliers)
  deafult=0
  (int)
  
  return dataframe after cleaning outliers

  for key in df[columns_names].keys():
    mean=df[key].mean()
    median=df[key].median()
    mode=df[key].mode()

    q25 = df[key].quantile(0.25)
    q75 = df[key].quantile(0.75)
    intr_qr = q75 - q25
    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    
    if method==1:
      df.loc[(df[key] < min), key] = np.nan
      df.loc[(df[key] > max), key]= np.nan
      df.fillna(mean,inplace=True)
    elif method==2:
      df.loc[(df[key] < min), key] = np.nan
      df.loc[(df[key] > max), key]= np.nan
      df.fillna(mode,inplace=True)
    elif method==3:
      df.loc[(df[key] < min), key] = np.nan
      df.loc[(df[key] > max), key]= np.nan
      df.fillna(median,inplace=True)

    elif method==4:
      df.loc[(df[key] < min), key] = np.nan
      df.fillna(min,inplace=True)
      df.loc[(df[key] > max), key]= np.nan
      df.fillna(max,inplace=True)
    else:
      df.loc[(df[key] < min), key] = np.nan
      df.loc[(df[key] > max), key]= np.nan
      df.dropna(inplace=True)
      df.reset_index(drop=True,inplace=True)

    
  return df
'''

In [18]:
def data_preparation(df,numric_colums,cat_columns,tf_columns,scaler,test=False):
  '''
  Method for prepare the data for the  the ML model

  arguments:
  df: dataframe that contain the data  (dataframe)
  numric_colums: numrical columns to prepare (list)
  cat_columns: categorical columns to prepare (list)
  tf_columns:  true false columns to prepare (list)
  scaler: scaler used for scaling ()
  test: true for test data preparation False for train data preparation default False  (boolean)

  return dataframe prepared for ML model
  '''
  df=pd.get_dummies(df,cat_columns)#convert categorical data to 01
  df[tf_columns]=df[tf_columns].astype(int)#convert True false to 1 0
  if(test==False):
    df[numric_colums]=scaler.fit_transform(df[numric_colums])
    
  else:
    df[numric_colums]=scaler.transform(df[numric_colums])#scaling the data
  
  return df

  


In [19]:
def get_labels(df,label_colum):
  '''
  Method for convert our label to  
  0:unsatisfied
  1:neutral
  2:satisfied
  3:very satisfied

  arguments:
  df: dataframe that contain the labels to convert  (dataframe)
  label_colum: the name of label colum    (str)

  return the new labels
  '''
  label=pd.DataFrame() 
  temp_arr=np.zeros(len(df[label_colum]))
  i=0
  for item in df[label_colum]:
    if(item<60):
      temp_arr[i]=0
    elif (item<75):
      temp_arr[i]=1
    elif (item<83):
      temp_arr[i]=2
    else:
      temp_arr[i]=3
  i+=1
  label[label_colum]=temp_arr.tolist()
  return label

In [20]:
def get_true_label(y_pred):
  '''
  Method for convert model predictions from number to the real label

  arguments:
  y_pred: predtions of out model (ndarray)

  return list with name of the labels

  '''
  true_label=[]
  for i in range(len(y_pred)):
    if y_pred[i]==0:
      true_label.append('unsatisfied')
    elif y_pred[i]==1:
      true_label.append('neutral')
    elif y_pred[i]==2:
      true_label.append('satisfied')
    else:
      true_label.append('very satisfied')
  return true_label



# **Read data**

In [22]:
airbnb=pd.read_csv('airbnb.csv')

split the data to hide the test data

In [23]:
airbnb_train,airbnb_test=train_test_split(airbnb,test_size=0.2, random_state=42)


reset index to avoid problems

In [24]:
airbnb_train.reset_index(drop=True,inplace=True)
airbnb_test.reset_index(drop=True,inplace=True)

Split colums

In [25]:
numric_colums=['Price','Person Capacity','Cleanliness Rating','Bedrooms','City Center (km)',
                      'Metro Distance (km)','Attraction Index','Normalised Attraction Index','Restraunt Index','Normalised Restraunt Index']
cat_colums=['City','Day','Room Type']
tf_colmus=['Shared Room','Private Room','Superhost','Multiple Rooms','Business']

# **Data exploration**

In [None]:
airbnb_train.head()

In [None]:
airbnb_train.info()

In [None]:
airbnb_train.describe()

In [None]:
airbnb_train.hist(bins=50, figsize=(15,10))
plt.show()

In [None]:
corr_matrix=airbnb_train.corr()
corr_matrix['Guest Satisfaction'].sort_values(ascending=False)

In [None]:
plot_box(airbnb_train[numric_colums],numric_colums,len(numric_colums))

# **Feature engineering**(not used)

In [32]:
#some feature engineering with high corr features for train and test data
'''airbnb_train['Square Cleanliness Rating']=airbnb_train['Cleanliness Rating']**2
airbnb_train['Log Cleanliness Rating']=np.log(airbnb_train['Cleanliness Rating'])
airbnb_train['Cleanliness for Business']=airbnb_train['Cleanliness Rating']*airbnb_train['Business']
airbnb_train['Cleanliness for bedrooms']=airbnb_train['Cleanliness Rating']*airbnb_train['Bedrooms']
airbnb_train['bedrooms per Cleanliness']=airbnb_train['Bedrooms']/airbnb_train['Cleanliness Rating'] 

airbnb_test['Square Cleanliness Rating']=airbnb_test['Cleanliness Rating']**2
airbnb_test['Log Cleanliness Rating']=np.log(airbnb_test['Cleanliness Rating'])
airbnb_test['Cleanliness for Business']=airbnb_test['Cleanliness Rating']*airbnb_test['Business']
airbnb_test['Cleanliness for bedrooms']=airbnb_test['Cleanliness Rating']*airbnb_test['Bedrooms']
airbnb_test['bedrooms per Cleanliness']=airbnb_test['Bedrooms']/airbnb_test['Cleanliness Rating'] '''



update numreic colums after feature engineering (not used)

In [33]:
''''numric_colums=['Price','Person Capacity','Cleanliness Rating','Bedrooms','City Center (km)',
                      'Metro Distance (km)','Attraction Index','Normalised Attraction Index','Restraunt Index','Normalised Restraunt Index','Square Cleanliness Rating','Log Cleanliness Rating'
                      ,'Cleanliness for Business','Cleanliness for bedrooms','bedrooms per Cleanliness']'''

# **Data cleaning**(not used)

In [33]:
'''airbnb_train=clean_outliers(airbnb_train,numric_colums,method=0)
print('finish')'''


finish


In [None]:
airbnb_train

# **Data preparation**

convert the  label

In [36]:
y_train=get_labels(airbnb_train,'Guest Satisfaction')
y_test=get_labels(airbnb_test,'Guest Satisfaction')


Drop the label

In [37]:
X_train=airbnb_train.drop('Guest Satisfaction',axis=1)
X_test=airbnb_test.drop('Guest Satisfaction',axis=1)

In [None]:
X_train

In [39]:
scaler=StandardScaler()
X_train=data_preparation(X_train,numric_colums,cat_colums,tf_colmus,scaler,False)
X_test=data_preparation(X_test,numric_colums,cat_colums,tf_colmus,scaler,True)

# **Model selection**

In [40]:
tree=DecisionTreeClassifier()
tree.fit(X_train,y_train)
y_pred=tree.predict(X_test)
y_pred_train=tree.predict(X_train)

print('test accuracy',accuracy_score(y_test,y_pred))
print('train accuracy',accuracy_score(y_train,y_pred_train))



test accuracy 0.9998801390387151
train accuracy 1.0
