<a href="https://colab.research.google.com/github/Ethan-sev/Airbnb_Score_Model/blob/main/Air_Learning_Clean_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
!pip install --upgrade tensorflow



Connect to google drive

In [102]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Read Cleaned CSV file

In [137]:
file_path = '/content/drive/MyDrive/Airbnb_Score_Model/Resources/airbnb_v3_encoded.csv'
df = pd.read_csv(file_path)

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   log_price                      74111 non-null  float64
 1   accommodates                   74111 non-null  float64
 2   bathrooms                      74111 non-null  float64
 3   host_response_rate             74111 non-null  float64
 4   number_of_reviews              74111 non-null  float64
 5   bedrooms                       74111 non-null  float64
 6   beds                           74111 non-null  float64
 7   amenities                      74111 non-null  int64  
 8   cleaning_fee                   74111 non-null  int64  
 9   property_type_Bed & Breakfast  74111 non-null  int64  
 10  property_type_Boutique hotel   74111 non-null  int64  
 11  property_type_Bungalow         74111 non-null  int64  
 12  property_type_Cabin            74111 non-null 

Adust Bin Size to desired preference

In [90]:
# # Define new bin edges
# new_bins = [0, 60, 80, 90, 100]

# # Define labels for the bins
# bin_labels = [0, 1, 2, 3]

# # Create a new column with the adjusted rating categories
# df['rating_category'] = pd.cut(df['rating_category'], bins=new_bins, labels=bin_labels, include_lowest=True)

Target and Features

In [141]:
y = df['rating_category']  # Target variable
X = df.drop(columns=['rating_category'])

In [142]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Random Forest, First Trial

In [143]:
# Initialize the model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42
)


In [144]:
model.fit(X_train, y_train)

In [145]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Detailed report
print(classification_report(y_test, y_pred))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       143
           1       0.00      0.00      0.00        72
           2       0.00      0.00      0.00       578
           3       0.95      1.00      0.97     14030

    accuracy                           0.95     14823
   macro avg       0.24      0.25      0.24     14823
weighted avg       0.90      0.95      0.92     14823

[[    0     0     0   143]
 [    0     0     0    72]
 [    0     0     0   578]
 [    0     0     0 14030]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Looking at Overfitting and Underfitting

In [146]:
# Evaluate on training data
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print("Training Accuracy:", train_accuracy)

# Evaluate on test data
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)

Training Accuracy: 0.9478983942787748
Test Accuracy: 0.9465020576131687


# **XG Boost Trial**


In [147]:
!pip install xgboost



In [148]:
import xgboost as xgb

In [149]:
# Initialize the XGBoost model
model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

# Train the model
model.fit(X_train, y_train)

In [150]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9465020576131687
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       143
           1       0.00      0.00      0.00        72
           2       0.50      0.00      0.01       578
           3       0.95      1.00      0.97     14030

    accuracy                           0.95     14823
   macro avg       0.36      0.25      0.24     14823
weighted avg       0.92      0.95      0.92     14823



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Neural Network

In [151]:
from tensorflow.keras import backend as K

# Clear session to reset model state
K.clear_session()

In [152]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

In [153]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [154]:
# Initialize the model
model = Sequential()

# Add input layer and first hidden layer
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))

# Add more hidden layers
model.add(Dense(32, activation='relu'))

# Add dropout layer
model.add(Dropout(0.5))

# Add output layer
model.add(Dense(4, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [155]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [156]:
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_data=(X_test_scaled, y_test), verbose=1)

Epoch 1/50
[1m1853/1853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9151 - loss: 0.3649 - val_accuracy: 0.9465 - val_loss: 0.2461
Epoch 2/50
[1m1853/1853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9488 - loss: 0.2402 - val_accuracy: 0.9465 - val_loss: 0.2377
Epoch 3/50
[1m1853/1853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9466 - loss: 0.2425 - val_accuracy: 0.9465 - val_loss: 0.2347
Epoch 4/50
[1m1853/1853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9482 - loss: 0.2325 - val_accuracy: 0.9465 - val_loss: 0.2332
Epoch 5/50
[1m1853/1853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9469 - loss: 0.2331 - val_accuracy: 0.9465 - val_loss: 0.2333
Epoch 6/50
[1m1853/1853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9476 - loss: 0.2290 - val_accuracy: 0.9465 - val_loss: 0.2323
Epoch 7/50
[1m1

In [122]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print("Test accuracy:", test_accuracy)

# Make predictions
y_pred = model.predict(X_test_scaled)
y_pred_classes = y_pred.argmax(axis=1)
# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred_classes))

Test accuracy: 0.9465020298957825
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       143
           1       0.00      0.00      0.00        72
           2       0.00      0.00      0.00       578
           3       0.95      1.00      0.97     14030

    accuracy                           0.95     14823
   macro avg       0.24      0.25      0.24     14823
weighted avg       0.90      0.95      0.92     14823



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Testing with different weights on each class

Making it an int to we can adust weights

In [159]:
y_train = np.array(y_train, dtype=int)
y_test = np.array(y_test, dtype=int)

Adust Weight, In order to help better recognize lower rated places

In [160]:
class_weights = {
    0: 20.0,
    1: 20.0,
    2: 20.0,
    3: 1.0
}

In [161]:
# Train the model with class weights
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test), verbose=1, class_weight=class_weights)

Epoch 1/10
[1m1845/1853[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8364 - loss: 2.2712

ValueError: Cannot take the length of shape with unknown rank.

In [62]:
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

Test loss: 0.0
Test accuracy: 1.0


In [63]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test_scaled)

# Convert probabilities to class labels
y_pred_classes = np.argmax(y_pred_prob, axis=1)

[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [64]:
# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred_classes))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     14823

    accuracy                           1.00     14823
   macro avg       1.00      1.00      1.00     14823
weighted avg       1.00      1.00      1.00     14823



In [26]:
print("Unique classes in y_train:", np.unique(y_train))
print("Unique classes in y_test:", np.unique(y_test))

Unique classes in y_train: [0 1 2 3]
Unique classes in y_test: [0 1 2 3]


In [28]:
print("Unique classes in y_train:", np.unique(y_train))
print("Unique classes in y_test:", np.unique(y_test))

Unique classes in y_train: [0 1 2 3]
Unique classes in y_test: [0 1 2 3]


In [30]:
print("Class weights being used", class_weights)

Class weights being used: {0: 10.0, 1: 10.0, 2: 5.0, 3: 1.0}


In [74]:
model = Sequential()
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))  # Ensure this matches your number of classes

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with class weights
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_data=(X_test_scaled, y_test), verbose=1, class_weight=class_weights)

Epoch 1/50
[1m1853/1853[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9574 - loss: 2.9731 - val_accuracy: 1.0000 - val_loss: 1.6519e-07
Epoch 2/50
[1m 911/1853[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m2s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0115

KeyboardInterrupt: 