In [1]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=6397480268ca91bb2bf0afdc01c24574b52253b8e8fb0ea5bd43519c9ac463b5
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [8]:
## Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
#from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, TrainingArguments, Trainer  
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import OneHotEncoder
import re  
import string  
from bs4 import BeautifulSoup  
from nltk.corpus import stopwords  
from langdetect import detect  
from sklearn.utils import resample
from tqdm import tqdm

import nltk  
## Download the NLTK stop-word corpus that stopwords.words('english') reads later in the notebook
nltk.download('stopwords')  

## Suppress all warnings
## NOTE(review): a blanket 'ignore' filter also hides deprecation warnings from
## the libraries used below — consider narrowing it to specific categories
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
## Directory that holds the per-city training CSVs (single place to change the location)
DATA_DIR = '/kaggle/input/airbnb-cities-reviews'

## Load the NYC training data
nyc_train = pd.read_csv(f'{DATA_DIR}/nyc_train.csv')

## Load the London training data
london_train = pd.read_csv(f'{DATA_DIR}/london_train.csv')

## Concat the cities
## ignore_index=True renumbers the rows 0..n-1; without it each frame keeps its
## own labels, so the combined frame would contain repeated row labels
train_set = pd.concat([nyc_train, london_train], axis=0, ignore_index=True)

## Display head
train_set.head(3)

Unnamed: 0,listing_id,listing_url,scrape_id,last_scraped,name,neighborhood_overview,host_id,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,...,calculated_host_listings_count,reviews_per_month,city,id,date,reviewer_id,reviewer_name,comments,polarity,polarity_class
0,3283409,https://www.airbnb.com/rooms/3283409,20240705150938,2024-07-05,Beautiful Brooklyn Brownstone,Bed-Stuy is a quiet neighborhood. Tree-lined s...,16593547,193,9,0,...,1,1.61,nyc,53603884,2015-11-10,44268857,Nancy,We had an amazing stay at Mary's and Josh's be...,0.9796,1.0
1,53031902,https://www.airbnb.com/rooms/53031902,20240705150938,2024-07-05,Urban Chic Riverview Corner Suite,"Vibrant neighborhood, with people having fun i...",206778021,37,1,0,...,12,1.13,nyc,775447743059830615,2022-12-05,427701377,Filipe,Thanks Vanessa!!,0.5399,1.0
2,611089534312850631,https://www.airbnb.com/rooms/611089534312850631,20240705150938,2024-07-05,Cozy and stylish ground floor guest suite,Morris park is the SAFEST neighborhood in the ...,19418202,115,25,0,...,1,4.41,nyc,763077599351696920,2022-11-18,38157718,Malvina,J'ai passé une excellent séjour chez Yulia et ...,0.5719,1.0


In [5]:
## Function to downsample the majority class and balance the data  
def balance_data(df, target_column):
    """
    Downsample the majority class to balance the dataset.

    The most frequent value of ``target_column`` is treated as the majority
    class. Every other row (including rows whose target is missing) is kept
    untouched as the minority group, and the majority class is randomly
    downsampled, without replacement and with a fixed seed, to the size of
    that group.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data
    target_column (str): The name of the target column

    Returns:
    pd.DataFrame: Balanced DataFrame (minority rows first, then the sampled
    majority rows). If the target column has no non-null values the input is
    returned as an unchanged copy.
    """
    ## Count the classes once; the result drives both masks below
    class_counts = df[target_column].value_counts()

    ## Nothing to balance (empty frame or only missing targets); idxmax() would raise here
    if class_counts.empty:
        return df.copy()

    ## Separate majority and minority classes
    is_majority = df[target_column] == class_counts.idxmax()
    majority_class = df[is_majority]
    minority_class = df[~is_majority]

    ## Never request more rows than the majority class holds: with more than two
    ## classes (or missing targets) the pooled remainder can be the larger group,
    ## and sampling without replacement would then raise
    n_samples = min(len(majority_class), len(minority_class))

    ## Downsample majority class
    majority_class_downsampled = resample(majority_class,
                                          replace=False,
                                          n_samples=n_samples,
                                          random_state=123)

    ## Combine minority class with downsampled majority class
    balanced_df = pd.concat([minority_class, majority_class_downsampled])
    return balanced_df

## Downsample the majority polarity class, then show how many rows each class keeps
balanced_train_set = balance_data(df=train_set, target_column='polarity_class')
print(
    f"POLARITY CLASS DISTRIBUTION: {balanced_train_set['polarity_class'].value_counts()}"
)

POLARITY CLASS DISTRIBUTION: polarity_class
0.0    94141
1.0    94141
Name: count, dtype: int64


In [6]:
## Function to preprocess Airbnb reviews  
def preprocess_reviews(reviews):
    """
    Preprocess Airbnb reviews by removing HTML tags, stop words (excluding important negators),
    non-English reviews, and punctuations.

    Negation words, including contractions such as "don't", are kept intact so
    the cleaned text preserves the polarity of the original sentence.

    Parameters:
    reviews (pd.Series): Series containing the reviews

    Returns:
    pd.Series: Cleaned, lower-cased reviews. Entries that are not strings, are
    not detected as English, or cannot be language-detected are None.
    """
    ## langdetect is randomised unless seeded; a fixed seed makes the English
    ## filter return the same rows on every run (function-scope import so the
    ## cell does not depend on a change to the imports cell)
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    stop_words = set(stopwords.words('english'))

    ## Remove negators from the stop words list to retain them in the text.
    ## Contractions use the plain ASCII apostrophe so they match the stop-word
    ## list and the normalised review text below.
    negators = {'not', 'no', 'nor', 'never', 'none', 'nothing', 'nowhere', 'neither',
                'hardly', 'scarcely', 'barely',
                "don't", "isn't", "wasn't", "shouldn't", "wouldn't", "couldn't", "won't", "can't"}
    stop_words = stop_words - negators

    ## All ASCII punctuation except the apostrophe (handled per token below).
    ## re.escape keeps characters such as backslash, ']' and '^' literal inside
    ## the character class, so every punctuation mark is actually matched.
    punctuation_without_apostrophe = string.punctuation.replace("'", "")
    non_apostrophe_punctuation = re.compile(f"[{re.escape(punctuation_without_apostrophe)}]")

    def clean_review(review):
        ## Non-text values (e.g. NaN for a missing comment) cannot be parsed or language-detected
        if not isinstance(review, str):
            return None

        ## Remove HTML tags
        review = BeautifulSoup(review, "html.parser").get_text()

        ## Detect if the review is in English
        try:
            if detect(review) != 'en':
                return None
        except Exception:
            ## Detection failed for this text; treat it like a non-English review
            return None

        ## Normalise typographic apostrophes and case so contractions compare equal to the negator list
        review = review.replace("\u2019", "'").lower()

        ## Remove punctuations and stop words
        kept_words = []
        for token in non_apostrophe_punctuation.sub(" ", review).split():
            bare_token = token.strip("'")
            if bare_token in negators:
                ## Keep negators (including contractions) as a single word
                kept_words.append(bare_token)
            else:
                ## Any other apostrophe acts as a separator, like the rest of the punctuation
                kept_words.extend(word for word in token.replace("'", " ").split()
                                  if word not in stop_words)
        return ' '.join(kept_words)

    ## Apply the cleaning function to all reviews
    cleaned_reviews = reviews.apply(clean_review)

    return cleaned_reviews

## Clean every comment and keep only the rows that survived cleaning
## (preprocess_reviews yields None for reviews it rejects, e.g. non-English ones)
balanced_train_set = (
    balanced_train_set
    .assign(cleaned_comments=preprocess_reviews(balanced_train_set['comments']))
    .dropna(subset=['cleaned_comments'])
)
print(f"NEW DATA SIZE: {balanced_train_set.shape}")

NEW DATA SIZE: (117656, 30)


In [15]:
## Features
## Split the dataset into training and validation sets
train_df, val_df = train_test_split(balanced_train_set, test_size=0.2, random_state=42, stratify=balanced_train_set['polarity_class'].values)

## One-Hot encoding (polarity_class)
## The encoder learns its categories from the training split only and is then
## re-used (transform, not fit_transform) for validation, so both splits are
## guaranteed to share the same column order.
## .toarray() densifies the encoder's default sparse output; this avoids the
## `sparse=` keyword, which newer scikit-learn releases renamed to `sparse_output=`.
one_hot_encoder = OneHotEncoder()
train_labels = one_hot_encoder.fit_transform(train_df[['polarity_class']]).toarray()
val_labels = one_hot_encoder.transform(val_df[['polarity_class']]).toarray()

## .assign builds new frames instead of writing into the frames returned by the split
train_df = train_df.assign(polarity_class=list(train_labels))
val_df = val_df.assign(polarity_class=list(val_labels))

## Convert the split dataframes to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

## Print the number of samples in each dataset to verify the split
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")

Number of training samples: 94124
Number of validation samples: 23532


In [16]:
## Load the tokenizer that belongs to the 'google/rembert' checkpoint
tokenizer = AutoTokenizer.from_pretrained('google/rembert')

## Fixed sequence length: longer reviews are truncated, shorter ones padded
max_length = 256

def tokenize_function(examples):
    """Tokenize the 'cleaned_comments' field of a batch, padded/truncated to max_length."""
    texts = examples['cleaned_comments']
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': max_length}
    return tokenizer(texts, **encoding_options)

def _to_model_inputs(dataset):
    """
    Tokenize one split and expose it in the layout the Trainer consumes.

    Applies tokenize_function in batches, renames the 'polarity_class' target
    column to 'labels', drops the raw 'cleaned_comments' text and switches the
    dataset to torch format restricted to 'input_ids', 'attention_mask' and
    'labels'.

    Parameters:
    dataset (datasets.Dataset): split with 'cleaned_comments' and 'polarity_class' columns

    Returns:
    datasets.Dataset: the tokenized, torch-formatted split
    """
    dataset = dataset.map(tokenize_function, batched=True)
    ## Rename the target column to 'labels'
    dataset = dataset.rename_column("polarity_class", "labels")
    dataset = dataset.remove_columns(['cleaned_comments'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset

## Tokenize the training set
train_dataset = _to_model_inputs(train_dataset)

## Tokenize the validation set (same steps, shared helper instead of a copied block)
val_dataset = _to_model_inputs(val_dataset)

tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

sentencepiece.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Map:   0%|          | 0/94124 [00:00<?, ? examples/s]

Map:   0%|          | 0/23532 [00:00<?, ? examples/s]

In [None]:
## Number of distinct polarity classes, passed to the model as num_labels
num_polarity_classes = balanced_train_set['polarity_class'].nunique()

## Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('google/rembert', num_labels=num_polarity_classes)

## Freeze every parameter whose name does not contain 'classifier';
## parameters of the classifier are left as they are
frozen_parameters = [
    parameter
    for parameter_name, parameter in model.named_parameters()
    if 'classifier' not in parameter_name
]
for parameter in frozen_parameters:
    parameter.requires_grad_(False)
        
## Define the training arguments
## Collected in one dict first so the configuration can be inspected before use
training_config = {
    'output_dir': './results',
    'num_train_epochs': 5,
    'per_device_train_batch_size': 256,
    'per_device_eval_batch_size': 256,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    'evaluation_strategy': 'epoch',
}
training_args = TrainingArguments(**training_config)

## Define the compute_metrics function  
def compute_metrics(p):
    """
    Compute accuracy and a classification report for one evaluation pass.

    Parameters:
    p: prediction object handed over by the Trainer; ``p.predictions`` holds the
       model scores (class taken as the argmax over the last axis) and
       ``p.label_ids`` the reference labels

    Returns:
    dict: {'accuracy': accuracy score, 'report': classification_report as a dict}
    """
    pred_labels = p.predictions.argmax(-1)

    ## The labels in this notebook are one-hot vectors, i.e. 2-D, while the
    ## predictions above are class indices. The sklearn metrics reject that
    ## mix, so collapse one-hot labels to class indices first. Labels that are
    ## already 1-D class indices pass through unchanged.
    true_labels = p.label_ids
    if getattr(true_labels, 'ndim', 1) > 1:
        true_labels = true_labels.argmax(-1)

    accuracy = accuracy_score(true_labels, pred_labels)
    report = classification_report(true_labels, pred_labels, output_dict=True)
    return {
        'accuracy': accuracy,
        'report': report
    }

## Initialize the Trainer from the model, arguments, datasets and metric callback defined above
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

## Train the model
trainer.train()

## Evaluate the model on the validation set; metric keys come back prefixed with 'eval_'
results = trainer.evaluate()

## Print the results
print(f"Accuracy: {results['eval_accuracy']}")
print(f"Classification Report: {results['eval_report']}")

Some weights of RemBertForSequenceClassification were not initialized from the model checkpoint at google/rembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
