In [None]:
import re

import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, pipeline

In [None]:


# Hugging Face checkpoint used for sentiment classification.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Build the inference pipeline once; device=0 selects accelerator index 0.
classifier = pipeline(task="sentiment-analysis", model=model_name, device=0)

# Translate the raw labels emitted by the pipeline into readable names.
label_map = dict(
    LABEL_0='NEGATIVE',
    LABEL_1='NEUTRAL',
    LABEL_2='POSITIVE',
)

In [None]:
def remove_html(text):
    """Strip HTML-like tags from ``text`` and normalise its whitespace.

    Every ``<...>`` span is replaced by a single space, then runs of
    whitespace are collapsed to one space and the ends are trimmed.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining with one space collapses and trims at once.
    return ' '.join(without_tags.split())

def clean_review(text):
    """Return the review with HTML removed, or ``None`` for a missing value.

    Missing values (as judged by ``pd.isna``) are passed through as ``None``;
    everything else is handed to ``remove_html``.
    """
    return None if pd.isna(text) else remove_html(text)

def detect_language(text):
    """Detect the language of ``text`` with langdetect.

    Args:
        text: The string to analyse.

    Returns:
        Whatever ``langdetect.detect`` returns for the text (compared against
        ``'en'`` by the caller), or the string ``"error"`` when langdetect
        raises ``LangDetectException``.
    """
    # langdetect's algorithm is randomised: short or ambiguous texts can get a
    # different language on every run, which makes the English filter
    # non-reproducible. Pinning the factory seed (as the langdetect README
    # recommends) makes repeated runs give the same answer. The assignment is
    # a cheap class-attribute write, so doing it per call is harmless.
    DetectorFactory.seed = 0
    try:
        return detect(text)
    except LangDetectException:
        return "error"


def classify_sentiment(text):
    """Classify a single review and return its sentiment label.

    The text is cut to its first 514 characters, passed to the module-level
    ``classifier`` pipeline, and the label of the first prediction is
    translated through ``label_map``.

    Args:
        text: Review text to classify.

    Returns:
        The mapped label from ``label_map``, or ``"error"`` when ``text`` is
        not a string or when classification fails for any reason.
    """
    # Guard non-string input (None, NaN, numbers) up front. Previously such
    # input raised inside the try block AND again inside the except handler
    # (which slices ``text[:60]``), so the function crashed instead of
    # returning "error".
    if not isinstance(text, str):
        print(f"Error processing: {text!r}... -> expected str, got {type(text).__name__}")
        return "error"

    try:
        top_prediction = classifier(text[:514])[0]
        return label_map[top_prediction['label']]
    except Exception as e:
        # Deliberately broad: one bad review must not abort a long run.
        print(f"Error processing: {text[:60]}... -> {e}")
        return "error"

2. **Preprocessing**  
   Raw review data is loaded and cleaned:
   - Read the CSV file: `data/reviews.csv`.
   - Remove entries with missing comments.
   - Apply `clean_review` to each comment to strip HTML tags and collapse repeated whitespace.
   - Filter out reviews that are not written in English using `detect_language`.

In [None]:
# Load the raw reviews, discard rows without a comment, and clean the text.
df = (
    pd.read_csv("data/reviews.csv")
    .dropna(subset=['comments'])
    .assign(comments=lambda frame: frame['comments'].apply(clean_review))
)

# Keep only the reviews whose detected language is English.
df = df[df['comments'].apply(detect_language).eq('en')]

3. **Sentiment Analysis**  
   The cleaned and filtered English reviews are passed in batches to a Hugging Face transformer pipeline running on GPU to classify each review's sentiment.

In [None]:
# Cheap character-level cut applied before the texts reach the tokenizer.
texts = df['comments'].str[:514].tolist()
batch_size = 32
results = []

for i in tqdm(range(0, len(texts), batch_size), desc="Classifying"):
    batch = texts[i:i + batch_size]
    try:
        # Without an explicit batch_size the pipeline processes a list one
        # item at a time, so the manual chunking alone never produced batched
        # inference. Passing batch_size makes each chunk a real padded batch.
        # truncation/max_length bound the *token* count, which the character
        # cut above cannot guarantee.
        # NOTE(review): 512 is the usual RoBERTa-base input limit — confirm
        # against this checkpoint's configuration.
        batch_results = classifier(
            batch,
            batch_size=batch_size,
            truncation=True,
            max_length=512,
        )
        batch_labels = [label_map.get(result['label'], 'unknown') for result in batch_results]
    except Exception as e:
        # Best effort: mark the whole chunk as failed and keep going so that
        # `results` stays aligned with the rows of `df`.
        print(f"Error processing batch {i//batch_size}: {e}")
        batch_labels = ['error'] * len(batch)
    results.extend(batch_labels)

df['sentiment'] = results

4. **Saving Results**  
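   The reviews whose predicted label is not `NEUTRAL` are saved, together with their predicted sentiment labels, to `data/sentiment_reviews.csv`. The negative and the positive reviews are additionally written to `data/negative_reviews.csv` and `data/positive_reviews.csv` for further analysis.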

In [None]:
# Row selectors, computed once and reused for both the export and the counts.
is_negative = df['sentiment'] == 'NEGATIVE'
is_positive = df['sentiment'] == 'POSITIVE'
review_columns = ['comments', 'sentiment']

# NOTE(review): this keeps every row that is not NEUTRAL, which also includes
# rows labelled 'error' or 'unknown' — confirm that is intended.
df[df['sentiment'] != 'NEUTRAL'].to_csv('data/sentiment_reviews.csv', index=False)
df.loc[is_negative, review_columns].to_csv('data/negative_reviews.csv', index=False)
df.loc[is_positive, review_columns].to_csv('data/positive_reviews.csv', index=False)

print("Sentiment classification completed and saved to sentiment_reviews.csv")
print("Negative reviews saved to negative_reviews.csv size:", int(is_negative.sum()))
print("Positive reviews saved to positive_reviews.csv size:", int(is_positive.sum()))

In [52]:
# Load the gzip-compressed listings table. low_memory=False makes pandas parse
# the file in one pass instead of in chunks, which avoids mixed-type inference
# within a column.
df_listing = pd.read_csv("data/listings.csv.gz", low_memory=False)

In [48]:
# Quick look at the listings table: (rows, columns) followed by the column names.
print(df_listing.shape, df_listing.columns, sep="\n")

(5223, 75)
Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_av

In [49]:
# Number of missing values in each column of the listings table

missing_per_column = df_listing.isna().sum()
print(missing_per_column)

id                                                 0
listing_url                                        0
scrape_id                                          0
last_scraped                                       0
source                                             0
                                                ... 
calculated_host_listings_count                     0
calculated_host_listings_count_entire_homes        0
calculated_host_listings_count_private_rooms       0
calculated_host_listings_count_shared_rooms        0
reviews_per_month                               1063
Length: 75, dtype: int64


In [50]:
def parse_bathroom_count(text):
    """Convert a free-text bathroom description into a numeric count.

    Examples: ``"1.5 shared baths"`` -> ``1.5``, ``"Half-bath"`` -> ``0.5``,
    ``"0 baths"`` -> ``0.0``.

    Args:
        text: Bathroom description, or a missing value.

    Returns:
        ``0.5`` when the description mentions "half", otherwise the first
        number found in the text as a float, or ``np.nan`` when the value is
        missing or contains no number.
    """
    if pd.isna(text):
        return np.nan
    text = str(text).lower().strip()

    # Descriptions such as "Half-bath" or "Shared half-bath" carry no digits.
    if "half" in text:
        return 0.5

    # Require at least one digit. The previous pattern ``[\d\.]+`` could match
    # a lone "." (e.g. in "approx. 2 baths"), which made float() raise
    # ValueError instead of finding the number or falling back to NaN.
    match = re.search(r"(\d*\.?\d+)", text)
    if match:
        return float(match.group(1))
    return np.nan


In [None]:
# Clean listing

# Only use useful columns
columns_to_keep = [
    'price',
    'neighbourhood_cleansed',
    'room_type',
    'bedrooms',
    'bathrooms_text',
    'accommodates',
    'amenities',
    'minimum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'name',
    'description'
]
# Raw values before cleaning, for comparison with the output at the end.
print(df_listing["price"].unique())
print(df_listing["bathrooms_text"].unique())

df_listing = df_listing[columns_to_keep].copy()
# TODO: decide whether rows with missing important columns should be removed.

# Price: strip the "$" sign and thousands separators, then convert to float.
# A raw string is used because '\$' in a normal string literal is an invalid
# escape sequence (SyntaxWarning on Python 3.12+); inside a character class
# "$" is already literal, so no backslash is needed.
df_listing["price"] = df_listing["price"].replace(r'[$,]', '', regex=True).astype(float)
# Convert bathroom text to numerical
df_listing['bathrooms'] = df_listing['bathrooms_text'].apply(parse_bathroom_count)
print(df_listing.columns)
# Reassign instead of dropping in place so the step reads as a plain transformation.
df_listing = df_listing.drop(columns=["bathrooms_text"])

# Cleaned values.
print(df_listing["price"].unique())
print(df_listing["bathrooms"].unique())

['$944.00' '$414.00' '$1,320.00' ... '$484.00' '$1,820.00' '$20,000.00']
['1 shared bath' '1 bath' '2 baths' '1.5 baths' '2 shared baths'
 'Shared half-bath' '2.5 baths' '1.5 shared baths' nan '3 baths'
 '1 private bath' '3.5 baths' 'Half-bath' '2.5 shared baths'
 '0 shared baths' '3 shared baths' '4 shared baths' '4.5 baths' '0 baths'
 'Private half-bath' '4 baths' '7.5 shared baths' '9 shared baths'
 '12 baths' '6 shared baths' '6 baths' '5.5 baths']
9
Index(['price', 'neighbourhood_cleansed', 'room_type', 'bedrooms',
       'bathrooms_text', 'accommodates', 'amenities', 'minimum_nights',
       'number_of_reviews', 'review_scores_rating', 'name', 'description',
       'bathrooms'],
      dtype='object')
[  944.   414.  1320. ...   484.  1820. 20000.]
[ 1.   2.   1.5  0.5  2.5  nan  3.   3.5  0.   4.   4.5  7.5  9.  12.
  6.   5.5]
