**Loading the libraries**

In [None]:
import os
import spacy
import pandas as pd
import numpy as np
import geopandas as gpd
import re
import math
import string
import unicodedata
import gensim
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from joblib import dump
from joblib import load
import contextily as ctx
import urllib.request

from scipy.spatial.distance import cdist

from shapely.geometry import Point

from sklearn.preprocessing import OneHotEncoder  # We don't use this but I point out where you *could*
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams, FreqDist

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.matutils import corpus2dense
from gensim.models import tfidfmodel
from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim.models import KeyedVectors
from gensim.models.ldamodel import LdaModel

from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS

# Import everything from textual/__init__.py,
# which provides a set of tools and functions we can use for NLP
from textual import *

**Loading the Data**

In [None]:
# Location of the Inside Airbnb London listings snapshot (2023-09-06).
host = 'http://data.insideairbnb.com'
path = 'united-kingdom/england/london/2023-09-06/data'
file = 'listings.csv.gz'
url  = f'{host}/{path}/{file}'

# Read the local copy when one exists; otherwise read the file remotely
# and cache it next to the notebook so later runs skip the download.
if os.path.exists(file):
    Airbnb_Listing = pd.read_csv(file, compression='gzip', low_memory=False)
else:
    Airbnb_Listing = pd.read_csv(url, compression='gzip', low_memory=False)
    # Save csv file.
    # index=False keeps the cached file column-for-column identical to the
    # remote one; writing the index would add an 'Unnamed: 0' column whenever
    # the cache is read back. gzip is stated explicitly rather than left to
    # be inferred from the '.gz' extension.
    Airbnb_Listing.to_csv(file, index=False, compression='gzip')

In [None]:
%%capture --no-stdout
# Prepare for necessary nltk packages
nltk.download('wordnet') # <-- These are done in a supporting tool, but in your own
nltk.download('averaged_perceptron_tagger') # application you'd need to import them
nltk.download('stopwords')
stopword_list = set(stopwords.words('english'))
print("nltk.download successful")

**Select useful columns**

In [None]:
# Columns kept for the analysis; every other column of the listing table is
# discarded. (The previous bare `Airbnb_Listing.columns` expression was removed:
# it was not the last expression in the cell, so it was never displayed and had
# no effect.)
remained_columns = ['id','description','neighbourhood_cleansed','latitude','longitude',
                    'room_type','amenities','price','number_of_reviews',
                    'review_scores_rating', 'review_scores_accuracy','review_scores_cleanliness', 
                    'review_scores_checkin','review_scores_communication', 'review_scores_location','review_scores_value',
                    'reviews_per_month']
# Raises KeyError if any of the listed columns is missing from the data.
Airbnb_Listing = Airbnb_Listing[remained_columns]

**Pre-processing data for normalisation**

In [None]:
# Fetch Jon Reades' helper module (used to normalise the 'description' and
# 'amenities' columns) and store it locally as the package `textual`.
# NOTE(review): the imports cell already executes `from textual import *`, so
# on a fresh checkout this download must have happened before that import can
# succeed -- confirm the intended cell order.
host  = 'https://orca.casa.ucl.ac.uk'
turl  = '/'.join([host, '~jreades/__textual__.py'])
tdirs = os.path.join('textual')
tpath = os.path.join(tdirs, '__init__.py')

if os.path.exists(tpath):
    # A local copy is already present; nothing to download.
    pass
else:
    # Create the package directory (if needed) and save the remote file as
    # textual/__init__.py.
    os.makedirs(tdirs, exist_ok=True)
    urllib.request.urlretrieve(turl, tpath)

*Drop NAs*

In [None]:
# Drop rows where either text column ('description' or 'amenities') is missing.
Airbnb_Listing = Airbnb_Listing.dropna(subset=['description','amenities'])
# Report the remaining row count. The message now names the actual frame; it
# previously said "gdf", which does not exist in this notebook section.
print(f"Now Airbnb_Listing has {Airbnb_Listing.shape[0]:,} rows.")

The code below only needs to be run once; run it again only if the normalised data needs updating.

In [None]:
"""
%%time 
# I get about 21 minutes 
Airbnb_Listing['description_norm'] = Airbnb_Listing['description'].apply(normalise_document, remove_digits=True)
"""

In [None]:
"""
%%time 
# Codes below should be operated only once, run it again only if undating needed

# I get about 21 minutes 
Airbnb_Listing['amenities_norm'] = Airbnb_Listing['amenities'].apply(normalise_document, remove_digits=True)
"""

**Pre-Processing finished and Saving csv file**

In [None]:
# The code below only needs to be run once; run it again only if an update is needed
# Airbnb_Listing.to_csv('./Data/Airbnb_Listing_norm.csv', index=True)