Key libraries for this project include:

Pandas for data manipulation.
Scikit-learn for machine learning models and preprocessing tools.
Numpy for numerical operations.
Imbalanced-learn's SMOTE for handling imbalanced datasets.

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LogisticRegressionCV
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [2]:
#Reading the Data
df = pd.read_csv('Detail_listings_cleaned.csv', encoding='latin1')

In [3]:
#Creating a new dataset for amenities as it needs to be cleaned and transformed into columns
amenities = df[['amenities']]

In [4]:
df['city'].isna().sum()

np.int64(10)

In [5]:
#Checking how many classes would be created
len(set(df['city']))

349

In [6]:
#Remapping Cities to actual cities not suburbs
city_mapping = {
    'Hollywood': 'Los Angeles',
    'Los Angeles': 'Los Angeles',
    'Porter Ranch': 'Los Angeles',
    'Venice': 'Los Angeles',
    'Acton': 'Los Angeles',
    'Agoura Hills': 'Agoura Hills',
    'Agua Dulce': 'Agua Dulce',
    'Alhambra': 'Alhambra',
    'Altadena': 'Altadena',
    'Arcadia': 'Arcadia',
    'Artesia': 'Artesia',
    'Avalon': 'Avalon',
    'Azusa': 'Azusa',
    'Baldwin Park': 'Baldwin Park',
    'Bel Air': 'Los Angeles',
    'Bell': 'Bell',
    'Bell Gardens': 'Bell Gardens',
    'Bellflower': 'Bellflower',
    'Belmont Shore': 'Long Beach',
    'Beverly Grove': 'Los Angeles',
    'Beverly Hills': 'Beverly Hills',
    'Bradbury': 'Bradbury',
    'Buena Park': 'Buena Park',
    'Burbank': 'Burbank',
    'Calabasas': 'Calabasas',
    'Canoga Park': 'Los Angeles',
    'Canyon Country': 'Santa Clarita',
    'Carson': 'Carson',
    'Castaic': 'Castaic',
    'Cerritos': 'Cerritos',
    'Chatsworth': 'Los Angeles',
    'City Of Commerce': 'Commerce',
    'Claremont': 'Claremont',
    'Compton': 'Compton',
    'Corona': 'Corona',
    'Coronado': 'Coronado',
    'Covina': 'Covina',
    'Culver City': 'Culver City',
    'Cypress': 'Cypress',
    'Del Aire': 'Del Aire',
    'Diamond Bar': 'Diamond Bar',
    'Downey': 'Downey',
    'Downtown Los Angeles': 'Los Angeles',
    'Duarte': 'Duarte',
    'ENCINO': 'Los Angeles',
    'East Los Angeles': 'Los Angeles',
    'Echo Park': 'Los Angeles',
    'El Monte': 'El Monte',
    'El Segundo': 'El Segundo',
    'Encino': 'Los Angeles',
    'GRANADA HILLS': 'Los Angeles',
    'Gardena': 'Gardena',
    'Glendale': 'Glendale',
    'Glendora': 'Glendora',
    'Granada Hills': 'Los Angeles',
    'HARBOR CITY': 'Los Angeles',
    'Hacienda Heights': 'Hacienda Heights',
    'Hawaiian Gardens': 'Hawaiian Gardens',
    'Hawthorne': 'Hawthorne',
    'Hermosa Beach': 'Hermosa Beach',
    'Hollywood Hills': 'Los Angeles',
    'Huntington Park': 'Huntington Park',
    'Inglewood': 'Inglewood',
    'Irvine': 'Irvine',
    'Irwindale': 'Irwindale',
    'LA': 'Los Angeles',
    'LAWNDALE/REDONDO BEACH AREA': 'Lawndale or Redondo Beach',
    'LONG BEACH': 'Long Beach',
    'La Crescenta-Montrose': 'La Crescenta-Montrose',
    'La Habra': 'La Habra',
    'La Habra Heights': 'La Habra Heights',
    'La Mirada': 'La Mirada',
    'La Puente': 'La Puente',
    'La Verne': 'La Verne',
    'Ladera Heights': 'Ladera Heights',
    'Lake Balboa': 'Los Angeles',
    'Lake Hughes': 'Los Angeles',
    'Lake Los Angeles': 'Los Angeles',
    'Lakeview Terrace': 'Los Angeles',
    'Lakewood': 'Lakewood',
    'Lancaster': 'Lancaster',
    'Lawndale': 'Lawndale',
    'Lennox': 'Lennox',
    'Littlerock': 'Littlerock',
    'Lomita': 'Lomita',
    'Los Angeles County': 'Los Angeles',
    'Lynwood': 'Lynwood',
    'MONTROSE': 'Montrose',
    'Malibu': 'Malibu',
    'Malibu Beach': 'Malibu',
    'Manhattan Beach': 'Manhattan Beach',
    'Mar Vista': 'Los Angeles',
    'Marina Del Rey': 'Marina Del Rey',
    'Monrovia': 'Monrovia',
    'Montebello': 'Montebello',
    'Monterey Hill. Los Angeles': 'Los Angeles',
    'Monterey Park': 'Monterey Park',
    'Montrose': 'Montrose',
    'N Hollywood': 'Los Angeles',
    'N. HOLLYWOOD': 'Los Angeles',
    'NORTH HOLLYWOOD': 'Los Angeles',
    'Newhall': 'Newhall',
    'No Hollywood': 'Los Angeles',
    'North Hills': 'Los Angeles',
    'North Hollywood/Burbank': 'Los Angeles',
    'Northridge': 'Los Angeles',
    'Norwalk': 'Norwalk',
    'Oak Park': 'Oak Park',
    'PACIFIC PALISADES': 'Pacific Palisades',
    'Pacific Palisades': 'Pacific Palisades',
    'Pacific Plsds': 'Pacific Palisades',
    'Palmdale': 'Palmdale',
    'Palos Verdes Estates': 'Palos Verdes Estates',
    'Palos Verdes Peninsula': 'Palos Verdes Peninsula',
    'Paramount': 'Paramount',
    'Pasadena': 'Pasadena',
    'Phillips Ranch': 'Phillips Ranch',
    'Pico Rivera': 'Pico Rivera',
    'Playa Del Rey': 'Los Angeles',
    'Playa Vista': 'Los Angeles',
    'Pomona': 'Pomona',
    'Porter Ranch': 'Los Angeles',
    'Quartz Hill': 'Quartz Hill',
    'Rancho Palos Verdes': 'Rancho Palos Verdes',
    'Redondo Beach': 'Redondo Beach',
    'Reseda': 'Los Angeles',
    'Rolling Hills': 'Rolling Hills',
    'Rolling Hills Estates': 'Rolling Hills Estates',
    'Rosemead': 'Rosemead',
    'Rowland Heights': 'Rowland Heights',
    'San Dimas': 'San Dimas',
    'San Fernando': 'San Fernando',
    'San Fernando Valley': 'Los Angeles',
    'San Gabriel': 'San Gabriel',
    'San Marino': 'San Marino',
    'San Pedro': 'Los Angeles',
    'Santa Barbara': 'Santa Barbara',
    'Santa Clarita': 'Santa Clarita',
    'Santa Fe Springs': 'Santa Fe Springs',
    'Santa Monica': 'Santa Monica',
    'Select Cities': 'Unspecified',
    'Shadow Hills': 'Los Angeles',
    'Sherman Oaks': 'Los Angeles',
    'Sherwood Forest': 'Los Angeles',
    'Sierra Madre': 'Sierra Madre',
    'Signal Hill': 'Signal Hill',
    'Silver Lake': 'Los Angeles',
    'South El Monte': 'South El Monte',
    'South Gate': 'South Gate',
    'South Pasadena': 'South Pasadena',
    'Stevenson Ranch': 'Stevenson Ranch',
    'Studio City': 'Los Angeles',
    'Sun Valley': 'Los Angeles',
    'Sunland': 'Los Angeles',
    'Sylmar': 'Los Angeles',
    'TARZANA': 'Los Angeles',
    'Temple City': 'Temple City',
    'Thousand Oaks': 'Thousand Oaks',
    'Toluca Lake': 'Los Angeles',
    'Toluca Terrace': 'Los Angeles',
    'Topanga': 'Topanga',
    'Topanga Canyon': 'Topanga',
    'Torrance': 'Torrance',
    'Tujunga': 'Los Angeles',
    'Universal City': 'Los Angeles',
    'Upland': 'Upland',
    'VALLEY VILLAGE': 'Los Angeles',
    'VENICE': 'Los Angeles',
    'VENICE BEACH': 'Los Angeles',
    'Val Verde': 'Val Verde',
    'Valencia': 'Valencia',
    'Valley Glen': 'Los Angeles',
    'Valley Village': 'Los Angeles',
    'Valyermo': 'Valyermo',
    'Van Nuys': 'Los Angeles',
    'Venice Beach': 'Los Angeles',
    'View Park': 'Los Angeles',
    'View Park-Windsor Hills': 'Los Angeles',
    'WEST HOLLYWOOD HILLS': 'Los Angeles',
    'Walnut': 'Walnut',
    'West Covina': 'West Covina',
    'West Hills': 'Los Angeles',
    'West Hollywood': 'West Hollywood',
    'West Los Angeles': 'Los Angeles',
    'Westchester': 'Westchester',
    'Westlake Village': 'Westlake Village',
    'Westminster': 'Westminster',
    'Whittier': 'Whittier',
    'Wilmington': 'Wilmington',
    'Windsor Hills': 'Los Angeles',
    'Winnetka': 'Winnetka',
    'Woodland Hills': 'Los Angeles',
    'Wrightwood': 'Wrightwood'
}


In [7]:
#Replacing target variable with new target with less classes
df['mapped_city'] = df['city'].map(city_mapping)


In [8]:
len(set(df['mapped_city']))

120

## Cleaning the data

In [9]:
#Copy amenities column
amenities_copy = amenities.copy()
amenities_copy['amenities'] = amenities_copy['amenities'].astype(str)
amenities_copy['amenities'] = amenities_copy['amenities'].str.replace('{', '').str.replace('}', '').str.replace('"', '').str.split(',')

#Get column Names for each Amenity
unique_amenities = set()

for index, row in amenities_copy.iterrows():
    amenities_list = row['amenities']
    for amenity in amenities_list:
        if amenity != '' and 'translation missing' not in amenity:
            unique_amenities.add(amenity)

rows = []

# Create Dummy Values per Amenity
for index, row in amenities_copy.iterrows():
    row_data = {}
    amenities_list = row['amenities']
    for amenity in unique_amenities:
        if amenity in amenities_list:
            row_data[amenity] = 1
        else:
            row_data[amenity] = 0
    rows.append(row_data)

amenities_df = pd.DataFrame(rows)


One-hot-encoding categorical variables

In [10]:
#One hot encoding the categorical variables
one_hot_encoded_data = pd.get_dummies(df, columns = ['property_type','room_type','bed_type','cancellation_policy'])


Cleaning numeric data

In [11]:
#Function for cleaning the numeric data
def clean_price_column(df, column_name):
    #Removes the $ sign and the , and converting it to a integer from string
    df[column_name] = df[column_name].str.replace('$', '').str.replace(',', '').str.strip()
    df[column_name] = df[column_name].astype(float).astype(int)
    
    return df

In [12]:
clean_price_column(one_hot_encoded_data, 'price')

Unnamed: 0,id,host_response_rate,host_total_listings_count,street,city,zipcode,accommodates,bathrooms,bedrooms,beds,...,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,16228948,25%,1.0,"Acton, CA 93510, United States",Acton,93510,10,7.0,5.0,5.0,...,False,False,False,False,True,False,False,True,False,False
1,8909380,100%,1.0,"Palmdale, CA 93550, United States",Palmdale,93550,2,1.0,1.0,1.0,...,False,False,False,False,True,True,False,False,False,False
2,14078522,100%,1.0,"Acton, CA 93510, United States",Acton,93510,2,1.0,1.0,1.0,...,False,False,False,False,True,True,False,False,False,False
3,13006928,100%,1.0,"Acton, CA 93510, United States",Acton,93510,6,1.0,1.0,3.0,...,False,False,False,False,True,True,False,False,False,False
4,7898757,,1.0,"West Adams, Los Angeles, CA 90007, United States",Los Angeles,90007,1,1.0,1.0,1.0,...,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31184,18445825,100%,1.0,"La Habra, La Habra, CA 90631, United States",La Habra,90631,2,1.0,1.0,1.0,...,False,False,False,False,True,True,False,False,False,False
31185,18192212,100%,4.0,"Malibu, Malibu, CA 90265, United States",Malibu,90265,2,1.0,1.0,1.0,...,False,False,False,False,True,False,False,True,False,False
31186,17339165,75%,1.0,"Malibu, Malibu, CA 90265, United States",Malibu,90265,10,5.0,5.0,5.0,...,False,False,False,False,True,True,False,False,False,False
31187,17827299,100%,1.0,"La Habra, CA 90631, United States",La Habra,90631,4,1.0,2.0,2.0,...,False,False,False,False,True,False,True,False,False,False


In [13]:
#Merging the Encoded data with the Encoded amenities data
merged_df = pd.concat([one_hot_encoded_data, amenities_df],axis=1)


In [14]:
#Dropping NAs from target variable as there were only 2 they are negligible 
merged_df = merged_df.dropna(subset=['city'])

In [15]:
#Looking at the distribution of classes
class_counts = merged_df['mapped_city'].value_counts()
print(class_counts)

mapped_city
Los Angeles       20266
Santa Monica        990
West Hollywood      933
Pasadena            587
Beverly Hills       489
                  ...  
Newhall               1
Westchester           1
Corona                1
Unspecified           1
Wilmington            1
Name: count, Length: 119, dtype: int64


In [16]:
sorted_class_counts = class_counts.sort_values(ascending=False)
#Selecting the top 30 classes to model
top_30_classes = sorted_class_counts.head(30)
top_30_classes

mapped_city
Los Angeles         20266
Santa Monica          990
West Hollywood        933
Pasadena              587
Beverly Hills         489
Malibu                435
Rowland Heights       343
Glendale              315
Redondo Beach         312
Burbank               263
Culver City           255
Arcadia               238
Topanga               193
Manhattan Beach       190
Inglewood             178
Torrance              161
Altadena              159
Alhambra              150
San Gabriel           145
Hermosa Beach         136
Monterey Park         126
Walnut                125
Gardena                93
Santa Clarita          93
Pomona                 91
Temple City            90
El Segundo             88
Hacienda Heights       85
Hawthorne              82
Whittier               77
Name: count, dtype: int64

In [17]:
top_30_classes = top_30_classes.index

merged_30 = merged_df[merged_df['mapped_city'].isin(top_30_classes)]
merged_30

Unnamed: 0,id,host_response_rate,host_total_listings_count,street,city,zipcode,accommodates,bathrooms,bedrooms,beds,...,Pets live on this property,Wheelchair accessible,Smartlock,Wide hallway clearance,Heating,Game console,Essentials,Babysitter recommendations,Hot tub,Free parking on street
0,16228948,25%,1.0,"Acton, CA 93510, United States",Acton,93510,10,7.0,5.0,5.0,...,0,0,0,0,1,0,1,0,1,0
2,14078522,100%,1.0,"Acton, CA 93510, United States",Acton,93510,2,1.0,1.0,1.0,...,1,1,0,0,1,0,1,0,0,0
3,13006928,100%,1.0,"Acton, CA 93510, United States",Acton,93510,6,1.0,1.0,3.0,...,1,0,0,0,1,0,1,0,1,0
4,7898757,,1.0,"West Adams, Los Angeles, CA 90007, United States",Los Angeles,90007,1,1.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
5,11254337,100%,1.0,"West Adams, Los Angeles, CA 90007, United States",Los Angeles,90007,1,1.0,1.0,1.0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31181,18162696,93%,1.0,"Malibu, Malibu, CA 90265, United States",Malibu,90265,4,1.0,1.0,2.0,...,0,1,0,0,1,0,1,0,0,0
31182,18506757,,1.0,"Malibu, Malibu, CA 90265, United States",Malibu,90265,6,2.0,3.0,3.0,...,0,0,0,0,1,0,1,0,0,0
31185,18192212,100%,4.0,"Malibu, Malibu, CA 90265, United States",Malibu,90265,2,1.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
31186,17339165,75%,1.0,"Malibu, Malibu, CA 90265, United States",Malibu,90265,10,5.0,5.0,5.0,...,0,0,0,0,1,0,1,0,0,0


In [18]:
#Creating Target Variable
Y = merged_30['mapped_city']
columns_to_drop = ['mapped_city','amenities','id','host_response_rate','street','zipcode', 'city','weekly_price','monthly_price','security_deposit','cleaning_fee','extra_people','reviews_per_month']
#Creating predictor variables
X = merged_30.drop(columns_to_drop, axis=1) 

In [19]:
#Filling NAs for X
X = X.fillna(0)

In [20]:
#Encoding the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(Y)

In [21]:
#Standardizing the variavbles
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

In [22]:
#Splitting into test and train
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.4, random_state=42)

In [23]:
#Selecting the best variables using Lasso
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

lasso = Lasso(alpha=0.01)  

lasso.fit(X_train, y_train)

selected_features = lasso.coef_ != 0
print("Number of selected features:", sum(selected_features))

X_train[:, selected_features]


Number of selected features: 99


array([[ 0.71785711, -0.57747661, -0.45392158, ..., -0.07454237,
        -0.45615856, -0.03608178],
       [ 0.35571264,  0.23284026, -0.45392158, ..., -0.07454237,
        -0.45615856, -0.03608178],
       [ 0.03883623, -0.57747661, -0.45392158, ..., -0.07454237,
        -0.45615856, -0.03608178],
       ...,
       [-0.18750407,  0.23284026, -0.45392158, ..., -0.07454237,
        -0.45615856, -0.03608178],
       [-0.05169989, -0.57747661, -0.45392158, ..., -0.07454237,
        -0.45615856, -0.03608178],
       [-0.18750407, -0.57747661, -0.45392158, ..., -0.07454237,
        -0.45615856, -0.03608178]])

In [24]:
selected_feature_names = X.columns[selected_features]
print("Selected feature names:")
print(selected_feature_names)

Selected feature names:
Index(['host_total_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'guests_included', 'minimum_nights',
       'number_of_reviews', 'review_scores_accuracy', 'review_scores_location',
       'property_type_Apartment', 'property_type_Bed & Breakfast',
       'property_type_Boat', 'property_type_Boutique hotel',
       'property_type_Cabin', 'property_type_Camper/RV',
       'property_type_Castle', 'property_type_Cave', 'property_type_Chalet',
       'property_type_Condominium', 'property_type_Dorm',
       'property_type_Guest suite', 'property_type_Guesthouse',
       'property_type_Hostel', 'property_type_House', 'property_type_Hut',
       'property_type_Loft', 'property_type_Other',
       'property_type_Serviced apartment', 'property_type_Tipi',
       'property_type_Townhouse', 'property_type_Treehouse',
       'property_type_Vacation home', 'property_type_Villa',
       'property_type_Yurt', 'room_type_Entire home/apt',
  

In [25]:
#Training first SVM Model
svm_model = SVC(kernel='linear',C=0.1,gamma=0.4)  
svm_model.fit(X_train[:, selected_features], y_train)

In [26]:
#Evaluating the model
y_test_pred = svm_model.predict(X_test[:, selected_features])

In [27]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        60
           1       0.00      0.00      0.00        69
           2       0.33      0.02      0.04        84
           3       0.00      0.00      0.00       190
           4       0.00      0.00      0.00       108
           5       0.00      0.00      0.00       111
           6       0.00      0.00      0.00        42
           7       0.00      0.00      0.00        33
           8       0.00      0.00      0.00       129
           9       0.00      0.00      0.00        37
          10       0.00      0.00      0.00        33
          11       0.00      0.00      0.00        61
          12       0.53      0.14      0.22        70
          13       0.73      1.00      0.85      8122
          14       0.67      0.01      0.02       179
          15       0.00      0.00      0.00        75
          16       0.00      0.00      0.00        56
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
sorted_class_counts = class_counts.sort_values(ascending=False)
#Selecting the top 10 classes to model
top_10_classes = sorted_class_counts.head(10)
top_10_classes
top_10_classes = top_10_classes.index

merged_10 = merged_df[merged_df['mapped_city'].isin(top_10_classes)]
merged_10
#Creating Target Variable
Y = merged_10['mapped_city']
columns_to_drop = ['mapped_city','amenities','id','host_response_rate','street','zipcode', 'city','weekly_price','monthly_price','security_deposit','cleaning_fee','extra_people','reviews_per_month']
#Creating predictor variables
X = merged_10.drop(columns_to_drop, axis=1) 
#Filling NAs for 
X = X.fillna(0)
#Encoding the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(Y)
#Standardizing the variavbles
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)
#Splitting into test and train
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.4, random_state=42)

In [29]:
#Using SMOTE to balance out the dataset
from collections import Counter

class_counts = Counter(y_train)

majority_class_count = max(class_counts.values())

desired_count = int(1 * majority_class_count)

sampling_strategy = {cls: desired_count for cls in class_counts}

sampling_strategy[max(class_counts, key=class_counts.get)] = majority_class_count


smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [30]:
#Selecting best variables from SMOTE dataset
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

lasso = Lasso(alpha=0.1) 

lasso.fit(X_train_resampled, y_train_resampled)

selected_features = lasso.coef_ != 0
print("Number of selected features:", sum(selected_features))

X_train_resampled[:,selected_features]


Number of selected features: 26


array([[-0.4543429 , -0.20868223, -0.05709024, ...,  0.3817923 ,
        -2.47873477, -0.07487453],
       [-0.4543429 , -0.2534731 , -0.05709024, ...,  0.3817923 ,
         0.40343163, -0.07487453],
       [-0.4543429 , -0.13029819, -0.05709024, ...,  0.3817923 ,
         0.40343163, -0.07487453],
       ...,
       [-0.24189597,  0.04453325, -0.05709024, ...,  0.3817923 ,
         0.40343163, -0.07487453],
       [-0.4543429 , -0.23702109, -0.05709024, ..., -0.54448272,
         0.40343163, -0.07487453],
       [ 0.71737117, -0.14022974, -0.05709024, ...,  0.3817923 ,
         0.40343163, -0.07487453]])

In [31]:
selected_feature_names = X.columns[selected_features]
print("Selected feature names:")
print(selected_feature_names)

Selected feature names:
Index(['bathrooms', 'price', 'property_type_Camper/RV',
       'property_type_Guesthouse', 'property_type_House',
       'property_type_Villa', 'room_type_Entire home/apt',
       'room_type_Private room', 'Shampoo', 'Internet', 'Kitchen',
       'Carbon monoxide detector', 'Dog(s)', 'Laptop friendly workspace',
       'Washer', 'Air conditioning', 'Buzzer/wireless intercom',
       'Private living room', 'Hair dryer', 'Other pet(s)', 'Doorman',
       'First aid kit', 'Pets live on this property', 'Heating', 'Essentials',
       'Babysitter recommendations'],
      dtype='object')


In [32]:
#Checking the size of the training data
X_train_resampled[:,selected_features].size

3160820

In [33]:
#SMOTE SVM Model
svm_model = SVC(kernel='linear',C=0.1)  
svm_model.fit(X_train_resampled[:, selected_features], y_train_resampled)

In [34]:
#Evaluating the model
y_test_pred = svm_model.predict(X_test[:, selected_features])
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.06      0.31      0.11       176
           1       0.02      0.17      0.03       106
           2       0.03      0.26      0.06       137
           3       0.91      0.06      0.11      8109
           4       0.16      0.39      0.23       168
           5       0.05      0.15      0.07       240
           6       0.04      0.40      0.07       117
           7       0.08      0.52      0.14       141
           8       0.11      0.33      0.16       416
           9       0.07      0.40      0.12       364

    accuracy                           0.11      9974
   macro avg       0.15      0.30      0.11      9974
weighted avg       0.75      0.11      0.11      9974



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create Decision Tree classifier
clf = DecisionTreeClassifier()

# Initialize Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit Grid Search to training data
grid_search.fit(X_train_resampled[:, selected_features], y_train_resampled)

# Get best parameters and best score
best_params = grid_search.best_params_

print("Best parameters:", best_params)
print("Best accuracy:", best_score)

In [69]:
# Best parameters found during grid search
best_params = {
    'criterion': 'entropy',
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2
}

# Create Decision Tree classifier with best parameters
best_clf = DecisionTreeClassifier(**best_params)

# Train the classifier on the training data
best_clf.fit(X_train_resampled[:, selected_features], y_train_resampled)

In [71]:
# Use the trained classifier to make predictions on the test data
y_pred = best_clf.predict(X_test[:, selected_features])

# Calculate accuracy
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.08      0.14      0.10       176
           1       0.06      0.08      0.06       106
           2       0.07      0.09      0.08       137
           3       0.84      0.77      0.80      8109
           4       0.17      0.22      0.19       168
           5       0.07      0.11      0.08       240
           6       0.08      0.11      0.09       117
           7       0.30      0.35      0.33       141
           8       0.15      0.19      0.16       416
           9       0.08      0.10      0.09       364

    accuracy                           0.65      9974
   macro avg       0.19      0.21      0.20      9974
weighted avg       0.71      0.65      0.68      9974



In [72]:
unique_classes, class_counts = np.unique(y_train_resampled, return_counts=True)

#Count the occurrences of each class
print("Class Counts:")
for cls, count in zip(unique_classes, class_counts):
    print(f"Class {cls}: {count} instances")


Class Counts:
Class 0: 12157 instances
Class 1: 12157 instances
Class 2: 12157 instances
Class 3: 12157 instances
Class 4: 12157 instances
Class 5: 12157 instances
Class 6: 12157 instances
Class 7: 12157 instances
Class 8: 12157 instances
Class 9: 12157 instances


In [28]:
# Looking at the distribution of classes
class_counts = merged_df['mapped_city'].value_counts()
sorted_class_counts = class_counts.sort_values(ascending=False)

# Select the top N classes
N = 10
top_N_classes = sorted_class_counts.head(N + 1).index

# Remove the first class
top_N_classes = top_N_classes[1:]

# Remove rows corresponding to the top N classes from filtered_df
filtered_df = merged_df[~merged_df['mapped_city'].isin(top_N_classes)]

# Filter merged_df to include only the top 10 classes
merged_10 = merged_df[merged_df['mapped_city'].isin(top_N_classes)]
top_N_classes

Index(['Santa Monica', 'West Hollywood', 'Pasadena', 'Beverly Hills', 'Malibu',
       'Rowland Heights', 'Glendale', 'Redondo Beach', 'Burbank',
       'Culver City'],
      dtype='object')

In [29]:
#Creating Target Variable
Y = merged_10['mapped_city']
columns_to_drop = ['mapped_city','amenities','id','host_response_rate','street','zipcode', 'city','weekly_price','monthly_price','security_deposit','cleaning_fee','extra_people','reviews_per_month']
#Creating predictor variables
X = merged_10.drop(columns_to_drop, axis=1) 
#Filling NAs for X
X = X.fillna(0)
#Encoding the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(Y)
#Standardizing the variavbles
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)
#Splitting into test and train
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.4, random_state=42)

In [30]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

lasso = Lasso(alpha=0.1) 

lasso.fit(X_train, y_train)

selected_features = lasso.coef_ != 0
print("Number of selected features:", sum(selected_features))

X_train[:,selected_features]


Number of selected features: 30


array([[-0.46264012, -0.20603395, -0.33770486, ..., -0.33476121,
         0.35359379, -0.44873918],
       [-0.46264012, -0.10484011, -0.33770486, ..., -0.33476121,
         0.35359379, -0.44873918],
       [ 1.05644905,  0.03768642,  0.02432142, ..., -0.33476121,
         0.35359379, -0.44873918],
       ...,
       [-0.46264012, -0.24736664, -0.33770486, ...,  2.98720389,
         0.35359379, -0.44873918],
       [-0.46264012, -0.23311399, -0.09635401, ..., -0.33476121,
         0.35359379, -0.44873918],
       [-0.46264012, -0.24736664, -0.33770486, ..., -0.33476121,
        -2.82810393, -0.44873918]])

In [31]:
selected_feature_names = X.columns[selected_features]
print("Selected feature names:")
print(selected_feature_names)

Selected feature names:
Index(['bathrooms', 'price', 'minimum_nights', 'number_of_reviews',
       'property_type_Apartment', 'property_type_Camper/RV',
       'property_type_Guesthouse', 'property_type_House',
       'room_type_Shared room', 'bed_type_Futon', 'bed_type_Real Bed',
       'cancellation_policy_strict', 'Elevator in building',
       'Indoor fireplace', 'Air conditioning', 'Free parking on street',
       'Family/kid friendly', 'Fire extinguisher', 'Self Check-In', 'Dog(s)',
       'Essentials', 'TV', 'Other pet(s)', 'Pets live on this property',
       'Hair dryer', 'Hot tub', 'Laptop friendly workspace',
       'Buzzer/wireless intercom', 'Kitchen', 'Safety card'],
      dtype='object')


In [30]:
#SVM Model for Top 10 without LA
svm_model = SVC(kernel='linear',C=0.5)  
svm_model.fit(X_train[:, selected_features], y_train)

In [37]:
y_test_pred = svm_model.predict(X_test[:, selected_features])
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.33      0.22      0.26       208
           1       0.25      0.01      0.02       108
           2       0.10      0.02      0.04        89
           3       0.27      0.13      0.17       131
           4       0.53      0.35      0.42       179
           5       0.27      0.37      0.32       230
           6       0.37      0.14      0.21       118
           7       0.42      0.64      0.51       129
           8       0.43      0.48      0.45       384
           9       0.39      0.62      0.48       393

    accuracy                           0.38      1969
   macro avg       0.33      0.30      0.29      1969
weighted avg       0.36      0.38      0.35      1969



In [32]:
unique_classes, class_counts = np.unique(y_train, return_counts=True)

#Count the occurrences of each class
print("Class Counts:")
for cls, count in zip(unique_classes, class_counts):
    print(f"Class {cls}: {count} instances")


Class Counts:
Class 0: 281 instances
Class 1: 155 instances
Class 2: 166 instances
Class 3: 184 instances
Class 4: 256 instances
Class 5: 357 instances
Class 6: 194 instances
Class 7: 214 instances
Class 8: 606 instances
Class 9: 540 instances


In [35]:
from sklearn.tree import DecisionTreeClassifier
best_params_gini = {
    'criterion': 'gini',
    'max_depth': 10,
    'min_samples_leaf': 4,
    'min_samples_split': 10
}

# Create Decision Tree classifier with best parameters
best_clf_gini = DecisionTreeClassifier(**best_params_gini)

# Train the classifier on the training data
best_clf_gini.fit(X_train[:, selected_features], y_train)


In [36]:
from sklearn.metrics import accuracy_score
y_pred = best_clf_gini.predict(X_test[:, selected_features])

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.25      0.19      0.22       208
           1       0.17      0.07      0.10       108
           2       0.15      0.09      0.11        89
           3       0.31      0.11      0.16       131
           4       0.45      0.39      0.42       179
           5       0.22      0.27      0.24       230
           6       0.41      0.27      0.32       118
           7       0.56      0.67      0.61       129
           8       0.41      0.49      0.45       384
           9       0.35      0.48      0.40       393

    accuracy                           0.35      1969
   macro avg       0.33      0.30      0.30      1969
weighted avg       0.34      0.35      0.34      1969



In [104]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create Decision Tree classifier
clf = DecisionTreeClassifier()

# Initialize Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit Grid Search to training data
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best accuracy:", best_score)

Best parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best accuracy: 0.38705440362499643


In [38]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Initialize Decision Tree classifier with best parameters
best_params_decision_tree  = {
    'criterion': 'gini',
    'max_depth': 10,
    'min_samples_leaf': 4,
    'min_samples_split': 10
}

decision_tree = DecisionTreeClassifier(**best_params_decision_tree)

# Initialize SVM classifier with best parameters
best_params_svm = {
    'C': 1.0,
    'kernel': 'linear'
}
svm_classifier = SVC(**best_params_svm)

# Create VotingClassifier with estimators (Decision Tree and SVM)
ensemble_clf = VotingClassifier(estimators=[('dt', decision_tree), ('svm', svm_classifier)], voting='hard')

# Train the ensemble classifier on the training data
ensemble_clf.fit(X_train[:, selected_features], y_train)


In [39]:
y_pred_ensemble = ensemble_clf.predict(X_test[:, selected_features])
print(classification_report(y_test, y_pred_ensemble))

              precision    recall  f1-score   support

           0       0.26      0.27      0.27       208
           1       0.19      0.08      0.12       108
           2       0.13      0.10      0.11        89
           3       0.26      0.16      0.20       131
           4       0.47      0.42      0.45       179
           5       0.23      0.38      0.29       230
           6       0.36      0.24      0.29       118
           7       0.52      0.59      0.55       129
           8       0.46      0.45      0.46       384
           9       0.40      0.42      0.41       393

    accuracy                           0.36      1969
   macro avg       0.33      0.31      0.31      1969
weighted avg       0.36      0.36      0.35      1969

