
# Airbnb Review Analysis
This notebook analyzes Airbnb reviews in two ways: it tracks how sentiment changes from one review to the next over time, and it uses the order in which aspects are mentioned within reviews to prioritize aspects of the product/service.



## Step 1: Load Data and Preprocess
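First, load the reviews and preprocess them: drop incomplete rows, expand contractions, keep only English-language reviews, transliterate the text to ASCII, parse the review dates, keep reviews from 2019 onwards, and lowercase the comments.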


In [3]:
import re

import nltk
import pandas as pd
import unidecode
from langdetect import DetectorFactory, LangDetectException, detect
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch every NLTK resource this notebook relies on. Besides the sentence
# tokenizer and the stopword list, nltk.pos_tag() (used for noun-phrase
# detection in Step 2) needs the averaged-perceptron tagger model.
# Newer NLTK releases look these models up under the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' names, so both spellings are requested;
# a name the installed release does not know is skipped without raising.
# quiet=True keeps the download log out of the cell output; a resource that
# is still missing surfaces as a LookupError at first use.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource, quiet=True)

# Custom contraction mapping. Keys are lowercase; matching is case-insensitive.
CONTRACTION_MAP = {
    "can't": "cannot", "won't": "will not", "i'm": "i am", "i'd": "i would", "i've": "i have",
    "you're": "you are", "he's": "he is", "she's": "she is", "it's": "it is", "they're": "they are",
    "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not", "didn't": "did not",
    "don't": "do not", "hasn't": "has not", "haven't": "have not", "hadn't": "had not", "shouldn't": "should not",
    "wouldn't": "would not", "couldn't": "could not", "mightn't": "might not", "mustn't": "must not"
}


def _prepare_contractions(contraction_mapping):
    """Return ``(compiled_pattern, lowercase_lookup)`` for a contraction mapping."""
    # Lookups are done on the lowercased match, so normalise the keys once here.
    lookup = {key.lower(): value for key, value in contraction_mapping.items()}
    # Longest keys first so a key that is a prefix of another never shadows it.
    keys = sorted(lookup, key=len, reverse=True)
    alternation = '|'.join(re.escape(key) for key in keys)
    # The look-arounds stop a key from matching inside a longer word, e.g. the
    # "it's" embedded in the possessive "unit's".
    pattern = re.compile(r'(?<!\w)(?:{})(?!\w)'.format(alternation), flags=re.IGNORECASE)
    return pattern, lookup


# Prepared once for the default mapping instead of once per review.
# If CONTRACTION_MAP is edited, re-run this cell so the cache is rebuilt.
_DEFAULT_CONTRACTIONS = _prepare_contractions(CONTRACTION_MAP)


def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand known contractions in ``text`` and strip remaining apostrophes.

    Args:
        text: The string to process.
        contraction_mapping: Maps a contraction to its expansion. Matching is
            case-insensitive and only applies to whole tokens. An empty
            mapping is allowed; then only the apostrophes are removed.

    Returns:
        The text with every matched contraction replaced by its expansion.
        The first character of each match is kept as written, so "Don't"
        becomes "Do not". All straight apostrophes that remain afterwards
        are removed.
    """
    if contraction_mapping:
        if contraction_mapping is CONTRACTION_MAP:
            pattern, lookup = _DEFAULT_CONTRACTIONS
        else:
            pattern, lookup = _prepare_contractions(contraction_mapping)

        def expand_match(contraction):
            match = contraction.group(0)
            expansion = lookup.get(match.lower())
            if expansion is None:
                # Only reachable with a stale cached pattern; leave the text as is.
                return match
            # Keep the original first character so sentence-initial capitals survive.
            return match[0] + expansion[1:]

        text = pattern.sub(expand_match, text)

    return text.replace("'", "")

# Load the dataset
# NOTE(review): this is a machine-specific absolute path; point it at your own copy of the data.
df = pd.read_csv('/Users/abhinandandas/Downloads/reviews-manc.csv')

# Drop the unused reviewer_name column first, so that a missing name alone
# does not discard an otherwise complete review, then drop incomplete rows.
df = df.drop(columns=['reviewer_name']).dropna()

# Expand contractions
df['comments'] = df['comments'].apply(expand_contractions)

# Detect language and filter for English reviews
MIN_CHARS_FOR_LANGUAGE_DETECTION = 50  # shorter comments are not sent to the detector
DetectorFactory.seed = 0  # langdetect is randomised; a fixed seed makes results repeatable


def detect_language(comment):
    """Return the detected language code of ``comment``, or 'no' if undetected.

    'no' is returned for comments of at most MIN_CHARS_FOR_LANGUAGE_DETECTION
    characters and for comments on which the detector raises (for example
    text with no usable features), so one odd review cannot abort the run.
    """
    if len(comment) <= MIN_CHARS_FOR_LANGUAGE_DETECTION:
        return 'no'
    try:
        return detect(comment)
    except LangDetectException:
        return 'no'


df['Langu'] = df['comments'].apply(detect_language)
# .copy() gives an independent frame, so the column assignments that follow
# do not write into a slice of the unfiltered data.
df = df[df['Langu'] == 'en'].copy()

# Convert to ASCII
df['comments'] = df['comments'].apply(unidecode.unidecode)

# Parse a single date value, falling back to NaT when it cannot be read
def parse_date(date_str):
    """Convert one date value to a pandas timestamp.

    The ISO layout ``YYYY-MM-DD`` is tried first, then ``DD/MM/YYYY``.

    Returns:
        The parsed timestamp, or ``pd.NaT`` if both formats raise ValueError.
    """
    for date_format in ('%Y-%m-%d', '%d/%m/%Y'):
        try:
            parsed = pd.to_datetime(date_str, format=date_format)
        except ValueError:
            # This layout did not fit; move on to the next candidate.
            continue
        return parsed
    return pd.NaT

MIN_REVIEW_YEAR = 2019  # earliest review year kept for the analysis

# Parse every date; values in neither supported format become NaT.
# pd.to_datetime() pins the column to a datetime dtype so the .dt accessor
# below also works when apply() hands back an object-dtype Series (e.g. for
# an empty frame). assign() builds a new frame instead of writing into a
# filtered slice of an earlier one.
df = df.assign(date=pd.to_datetime(df['date'].apply(parse_date)))

# Filter reviews from MIN_REVIEW_YEAR onwards (the comparison is False for
# NaT, so rows whose date could not be parsed are dropped here too), then
# convert comments to lowercase.
df = (
    df.loc[df['date'].dt.year >= MIN_REVIEW_YEAR]
      .assign(comments=lambda frame: frame['comments'].str.lower())
)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhinandandas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhinandandas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



## Step 2: Aspect Detection and Sentiment Analysis
Using the existing aspect detection code, extract noun phrases and perform sentiment analysis on the comments.


In [4]:

# Chunk grammar used to pull aspect candidates (noun + adjective groups) out
# of POS-tagged sentences. The '$' in PRP$ is escaped because tag patterns
# are turned into regular expressions, where a bare '$' is an end anchor and
# the rule could never match.
# NOTE(review): stages run in the order listed, so tokens already chunked by
# an earlier rule are presumably unavailable to later ones - confirm this is
# the intended precedence.
_NOUN_PHRASE_GRAMMAR = r"""
      NP: {<JJ><NN.*>+}               # Chunk JJ followed by NN
      NP1: {<NN.*>+<JJ>}
      NP2: {<NN.*>+<JJ><JJ>*}
      NP3: {<NN.*>+<VB.*>*<JJ><JJ>*}
      NP4: {<NN.*>+<VB.*>*<RB.*>*<JJ><JJ>*}
      NP5: {<PRP\$><NN.*>+<JJ><JJ>*}
      NP6: {<NN.*>+<VBZ>*<RB>*<JJ><JJ>*}
      """

# Built on first use and then reused, instead of re-parsing the grammar for
# every review.
_noun_phrase_parser = None


# Function to detect noun phrases
def detect_noun_phrases(text):
    """Return the noun phrases found in ``text``, in order of appearance.

    The text is split into sentences, each sentence is tokenised and
    POS-tagged, and every chunk whose label starts with 'NP' is collected.

    Args:
        text: The review text. Anything that is not a string yields ``[]``.

    Returns:
        A list of phrases, each the chunk's words joined by single spaces.
    """
    global _noun_phrase_parser

    if not isinstance(text, str):
        return []
    if _noun_phrase_parser is None:
        _noun_phrase_parser = nltk.RegexpParser(_NOUN_PHRASE_GRAMMAR)

    noun_phrases = []
    for sentence in sent_tokenize(text):
        pos_tokens = nltk.pos_tag(word_tokenize(sentence))
        parsed_sentence = _noun_phrase_parser.parse(pos_tokens)
        # The root 'S' node is excluded because its label does not start with 'NP'.
        for chunk in parsed_sentence.subtrees(lambda tree: tree.label().startswith('NP')):
            # leaves() flattens the chunk to (word, tag) pairs even if it is nested.
            noun_phrases.append(' '.join(word for word, _tag in chunk.leaves()))
    return noun_phrases

# Extract the noun phrases of every comment into a list-valued column
noun_phrase_lists = df['comments'].map(detect_noun_phrases)
df['noun_phrases'] = noun_phrase_lists

# Perform sentiment analysis
from textblob import TextBlob

def sentiment_analysis(text):
    """Return the TextBlob polarity score of ``text``."""
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    return polarity


# Score every comment and store the polarity next to it
df['sentiment'] = df['comments'].map(sentiment_analysis)



## Step 3: Analyze Patterns in Sentiments
Sort the reviews by date, compute the change in sentiment polarity between each review and the one before it, and label each change as positive, negative, or neutral.


In [None]:

# Sort the DataFrame by date.
# A stable sort keeps reviews that share a date in their existing order; the
# default algorithm gives no such guarantee, and diff() below depends on the
# row order. assign()/sort_values() return a new frame rather than mutating
# in place.
df = (
    df.assign(date=pd.to_datetime(df['date']))
      .sort_values(by='date', kind='stable')
)

# Calculate sentiment change: each review's polarity minus that of the review
# directly before it in date order (NaN for the first row).
# NOTE(review): consecutive rows may belong to different listings; if the data
# has a listing identifier, a per-listing diff may be what is wanted - confirm.
df['sentiment_change'] = df['sentiment'].diff()

# Turn a numeric sentiment change into a direction label
def identify_sentiment_pattern(change):
    """Classify a sentiment change by its sign.

    Returns:
        'positive' for values above zero, 'negative' for values below zero,
        and 'neutral' for everything else - that is zero, and also NaN,
        for which both comparisons are false.
    """
    if change > 0:
        return 'positive'
    if change < 0:
        return 'negative'
    return 'neutral'

# Label every review-to-review change with its direction
pattern_labels = df['sentiment_change'].map(identify_sentiment_pattern)
df['sentiment_pattern'] = pattern_labels

# Write the annotated reviews to disk, without the index column
df.to_csv('path_to_save_patterns.csv', index=False)



## Step 4: Sequence Analysis for Aspect Priority
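Perform sequence analysis to prioritize the aspects of the product/service: record the position at which each noun phrase (aspect) appears within each review, then rank aspects by their average position, so that aspects reviewers tend to mention first appear at the top.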


In [None]:

from collections import defaultdict

# Positions at which each aspect (noun phrase) occurs, collected over all reviews
aspect_sequences = defaultdict(list)

# Extract aspects and their order. Only the noun_phrases column is needed, so
# iterate over it directly; iterrows() would build a full Series per review.
for aspects in df['noun_phrases']:
    for position, aspect in enumerate(aspects):
        aspect_sequences[aspect].append(position)

# Calculate the average position of each aspect (0 = first phrase in a review).
# Every list holds at least one position, so the division is always defined.
aspect_priorities = {
    aspect: sum(positions) / len(positions)
    for aspect, positions in aspect_sequences.items()
}

# Convert to a DataFrame and sort by average position. Many aspects share the
# same average, so a stable sort is used to keep ties in first-seen order and
# make the saved ranking repeatable.
aspect_priorities_df = pd.DataFrame(
    list(aspect_priorities.items()),
    columns=['Aspect', 'Average_Position'],
).sort_values(by='Average_Position', kind='stable')

# Save the results
aspect_priorities_df.to_csv('path_to_save_aspect_priorities.csv', index=False)
