In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.ensemble import IsolationForest, RandomForestRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from skopt import BayesSearchCV

In [None]:
# Read the training CSV into the DataFrame that every later cell works on.
data = pd.read_csv(
    filepath_or_buffer="/Users/elenalickel/Desktop/Machine Learning /Airbnb_NYC_2019.csv",
)

In [None]:
# Handle missing values.
# The text columns are filled first: a blanket fillna(0) applied before them
# would replace missing names with the number 0, which str() then turns into
# the literal token '0' instead of an empty string.
for text_column in ('name', 'host_name'):
    # Replace NaN with an empty string and coerce any non-string value to str.
    data[text_column] = data[text_column].fillna('').astype(str)

# Every remaining missing value is treated as 0 (adjust as necessary for your
# analysis). Reassigning instead of inplace=True keeps the step explicit.
data = data.fillna(0)

In [None]:
# Preview the first rows with the notebook's rich table display rather than
# printing the whole frame as plain text.
data.head()

In [None]:
# Exploratory analysis
print(data.columns)

# Pairwise correlations between the numeric columns only.
numeric_data = data.select_dtypes(include=[np.number])
correlation_matrix = numeric_data.corr()

# Explicit, larger axes so the per-cell annotations stay legible.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation matrix: numeric features")
plt.show()

# Convert categorical variables using one-hot encoding (first level dropped
# so the dummy columns are not perfectly collinear).
categorical_data = pd.get_dummies(data[['neighbourhood_group', 'room_type']], drop_first=True)
numeric_data_with_categorical = pd.concat([numeric_data, categorical_data], axis=1)

# Updated correlation matrix with categorical data
correlation_matrix = numeric_data_with_categorical.corr()
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation matrix: numeric and one-hot encoded features")
plt.show()

In [None]:
# Text mining on 'name' and 'host_name' using TfidfVectorizer
imputer = SimpleImputer(strategy='constant', fill_value=' ')

# Fit and transform, then flatten the (n, 1) result back to 1D for each column.
data['name'] = imputer.fit_transform(data[['name']]).ravel()
data['host_name'] = imputer.fit_transform(data[['host_name']]).ravel()

# One vectorizer per column: refitting a single shared instance would discard
# the vocabulary learned from 'name' as soon as 'host_name' is fitted.
name_tfidf_vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
host_name_tfidf_vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
name_features = name_tfidf_vectorizer.fit_transform(data['name']).toarray()
host_name_features = host_name_tfidf_vectorizer.fit_transform(data['host_name']).toarray()

# Backward-compatible alias: previously this name ended up fitted on 'host_name'.
tfidf_vectorizer = host_name_tfidf_vectorizer

# Combine these features with existing numeric features. The frame mixes
# numeric and one-hot columns, so force a float matrix instead of relying on
# .values (which can yield an object array for mixed dtypes).
combined_features = np.hstack((
    numeric_data_with_categorical.to_numpy(dtype=float),
    name_features,
    host_name_features,
))

# Fit LDA to the name features only; seeded so the topics are reproducible.
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda_features = lda.fit_transform(name_features)

# Show a small sample of the document-topic matrix rather than the full array.
print(lda_features[:5])

In [None]:
# Automated outlier detection on 'price'; seeded so the same rows are dropped
# on every fresh run.
# NOTE: `data` is filtered in place of itself, so re-running this cell without
# reloading trims a further ~1% of rows each time.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(data[['price']])
data = data[outliers != -1]

# Impute missing values across all numeric features (target excluded).
# np.number matches the dtype selection used by the validation cell.
data_no_price = data.drop('price', axis=1)  # Exclude the 'price' column
imputer = KNNImputer(n_neighbors=5)  # Impute missing values based on 5 nearest neighbors
numeric_data_imputed = imputer.fit_transform(data_no_price.select_dtypes(include=[np.number]))

# Apply PCA for dimensionality reduction (5 principal components).
# PCA is variance-based, so the features are standardised first; otherwise the
# columns with the largest raw magnitudes dominate the components.
# `pca` keeps the same fit_transform / transform interface; the underlying PCA
# step is available as pca.named_steps['pca'].
pca = make_pipeline(StandardScaler(), PCA(n_components=5, random_state=42))
principal_components = pca.fit_transform(numeric_data_imputed)

In [None]:
# Model training: XGBoost, random forest and support vector regression on the
# principal components, with a fixed 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    principal_components, data['price'], test_size=0.2, random_state=42
)

# Individual model setup. The stochastic learners are seeded so the reported
# errors are reproducible after Restart & Run All.
xgb = XGBRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
svm = make_pipeline(StandardScaler(), SVR())  # SVR is scale-sensitive

# Train each model separately
for regressor in (xgb, rf, svm):
    regressor.fit(X_train, y_train)

# Predictions for each model
predictions_xgb = xgb.predict(X_test)
predictions_rf = rf.predict(X_test)
predictions_svm = svm.predict(X_test)

# Calculate and print MAE for each model
mae_xgb = mean_absolute_error(y_test, predictions_xgb)
mae_rf = mean_absolute_error(y_test, predictions_rf)
mae_svm = mean_absolute_error(y_test, predictions_svm)

print(f'XGBRegressor Mean Absolute Error: {mae_xgb}')
print(f'RandomForestRegressor Mean Absolute Error: {mae_rf}')
print(f'Support Vector Machine Mean Absolute Error: {mae_svm}')

In [None]:
# NN
from scikeras.wrappers import KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_nn_model(n_units=128, dropout_rate=0.2):
    """Build and compile a small feed-forward regression network.

    Args:
        n_units: Width of the first hidden layer; the second hidden layer
            uses ``int(n_units / 2)`` units.
        dropout_rate: Dropout rate applied after the first hidden layer.

    Returns:
        A compiled ``Sequential`` model with a single linear output unit,
        trained with mean squared error and reporting mean absolute error.

    Note:
        The input width is read from the global ``X_train`` when the function
        is called, so ``X_train`` must already exist.
    """
    n_features = X_train.shape[1]

    network = Sequential()
    network.add(Dense(n_units, activation='relu', input_shape=(n_features,)))
    network.add(Dropout(dropout_rate))
    network.add(Dense(int(n_units / 2), activation='relu'))
    network.add(Dense(1))

    network.compile(
        optimizer='adam',
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return network


# scikit-learn compatible wrapper around the builder above.
nn_model = KerasRegressor(
    model=build_nn_model,
    epochs=100,
    batch_size=32,
)


In [None]:
# Voting ensemble over the three classical regressors plus the neural network.
# (VotingRegressor is already imported in the notebook's import cell.)
ensemble_members = [
    ('xgb', xgb),
    ('rf', rf),
    ('svm', svm),
    ('nn', nn_model),  # the wrapped Keras model takes part like any other estimator
]
voting_regressor = VotingRegressor(estimators=ensemble_members)

# Fit the ensemble on the training split
voting_regressor.fit(X_train, y_train)

# Score the held-out split with mean absolute error
predictions = voting_regressor.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f'Combined Mean Absolute Error: {mae}')


In [None]:
# Prepare data for model training - Voting without NN.
# Same arguments and seed as the earlier split, so the partition is identical.
X_train, X_test, y_train, y_test = train_test_split(
    principal_components, data['price'], test_size=0.2, random_state=42
)

# Ensemble models and voting regressor setup. Fresh, unfitted estimators are
# created here; the stochastic ones are seeded for reproducible results.
xgb = XGBRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
svm = make_pipeline(StandardScaler(), SVR())
voting_regressor = VotingRegressor(estimators=[('xgb', xgb), ('rf', rf), ('svm', svm)])

# Train and predict
voting_regressor.fit(X_train, y_train)
predictions = voting_regressor.predict(X_test)

# Model evaluation
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')

In [None]:
# Define search spaces for Bayesian optimization.
# Keys follow scikit-learn's nested-parameter syntax: <estimator name>__<param>.
search_spaces = {
    'rf__n_estimators': (10, 200),
    'rf__max_depth': (5, 50),
    'xgb__n_estimators': (10, 200),
    'xgb__learning_rate': (0.01, 0.5),
    'svm__svr__C': (0.1, 1000),
    'svm__svr__gamma': (0.001, 1.0),
    # Arguments of build_nn_model are not constructor parameters of the
    # KerasRegressor wrapper, so they must be routed with the `model__` prefix;
    # the un-prefixed names are rejected by set_params.
    'nn__model__n_units': (50, 200),
    'nn__model__dropout_rate': (0.1, 0.5),
}

# Voting regressor including the neural network. BayesSearchCV clones this
# estimator for every candidate, so this instance itself is never fitted; use
# bayes_search.best_estimator_ (or bayes_search.predict) afterwards.
voting_regressor = VotingRegressor(
    estimators=[
        ('xgb', xgb),
        ('rf', rf),
        ('svm', svm),
        ('nn', nn_model),
    ]
)

# Bayesian optimization. Scored with (negated) MAE to match the metric reported
# throughout the notebook, and seeded so the search is reproducible.
bayes_search = BayesSearchCV(
    estimator=voting_regressor,
    search_spaces=search_spaces,
    n_iter=32,
    cv=3,
    scoring='neg_mean_absolute_error',
    random_state=42,
)
bayes_search.fit(X_train, y_train)


Validation

In [None]:
# Validation
# Load the validation dataset
validation_data = pd.read_csv('/Users/elenalickel/Desktop/Machine Learning /Airbnb_NYC_2019_eval_no_price.csv')

# Text columns first: a blanket fillna(0) applied before them would turn
# missing names into the string '0' instead of an empty string.
for text_column in ('name', 'host_name'):
    validation_data[text_column] = validation_data[text_column].fillna('').astype(str)

# Handle missing values for the remaining (non-text) features
validation_data = validation_data.fillna(0)

# Initialize and fit SimpleImputer on the training data
name_imputer = SimpleImputer(strategy='constant', fill_value='Missing Name')
host_name_imputer = SimpleImputer(strategy='constant', fill_value='Missing Host')
name_imputer.fit(data[['name']])
host_name_imputer.fit(data[['host_name']])

# Transform validation data using the already fitted imputers
validation_data['name'] = name_imputer.transform(validation_data[['name']]).ravel()
validation_data['host_name'] = host_name_imputer.transform(validation_data[['host_name']]).ravel()

# Initialize TfidfVectorizer and fit on the training data
name_tfidf_vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
host_name_tfidf_vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
name_tfidf_vectorizer.fit(data['name'].fillna('').apply(str))
host_name_tfidf_vectorizer.fit(data['host_name'].fillna('').apply(str))

# Transform text features in the validation dataset using the fitted vectorizers.
# NOTE: these matrices are not fed to the model below, which predicts from the
# principal components of the numeric features only.
name_features_val = name_tfidf_vectorizer.transform(validation_data['name']).toarray()
host_name_features_val = host_name_tfidf_vectorizer.transform(validation_data['host_name']).toarray()

# Numeric training features without the target, and the same columns (in the
# same order) taken from the validation data.
numeric_features_train = data.select_dtypes(include=[np.number]).drop(columns=['price'], errors='ignore')
numeric_features_val = validation_data[numeric_features_train.columns]

# Fit the numeric imputer on the training features, then apply it to validation
numeric_imputer = KNNImputer(n_neighbors=5)
numeric_imputer.fit(numeric_features_train)
numeric_data_imputed_val = numeric_imputer.transform(numeric_features_val)

# Project with the PCA that was already fitted on the training data
validation_principal_components = pca.transform(numeric_data_imputed_val)

# Predict with the tuned ensemble. `voting_regressor` was re-created for the
# Bayesian search and never fitted itself (the search fits clones), so calling
# predict on it would fail; the refitted best estimator is the trained model.
final_model = bayes_search.best_estimator_
validation_predictions = final_model.predict(validation_principal_components)
print("Validation Predictions:")
print(validation_predictions)


In [None]:
# Store the model output on the validation frame under a 'price' column.
validation_data = validation_data.assign(price=validation_predictions)


In [None]:
# Show the final column layout before exporting.
print(validation_data.columns)

# Destination of the exported predictions file
file_path = '/Users/elenalickel/Desktop/Machine Learning /Airbnb_NYC_2019_price_predictions.csv'

# Write all columns to CSV, leaving out the DataFrame's row index
validation_data.to_csv(path_or_buf=file_path, index=False)
