# Data Type: Hotel/Resort Customer Reviews


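Five insights (the notebook below also adds two further analyses: polarity/subjectivity scoring and word clouds):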

- Location Distribution Analysis
Only 7 of the 23 reviews specify a location.
The remaining 16 reviews have no location specified.


- Temporal Analysis
All reviews fall within a three-day window (August 18-20, 2019).
Such a short period means the data is a snapshot of reviews rather than a trend over time.


-Sentiment Analysis
Tracking positive and negative keywords
Identifying common themes in reviews


-Review Length and Depth
Analyzing the complexity and detail of reviews


-Keyword Frequency
Identifying most common words and phrases





In [5]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from textblob import TextBlob

In [7]:
# Download NLTK stopwords if not already present
nltk.download('stopwords')
# Sanity check: print the first ten English stop words to confirm the corpus loads.
print(stopwords.words('english')[:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saikrishna_gajula/nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Load the data
# The analysis functions in this notebook read the 'Location', 'date' and
# 'Review' columns of this frame.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# 'test.csv' was not found at this relative path - confirm the file location.
df = pd.read_csv('test.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'

In [None]:
# Insight 1: Location Distribution Analysis
def analyze_location_distribution(df):
    """Count reviews per location and save the counts as a bar chart.

    Reads the ``Location`` column of ``df``, draws one bar per distinct
    location, and writes the chart to ``location_distribution.png``.
    Reviews whose location is missing are not counted, because
    ``value_counts()`` drops missing values by default.

    Returns:
        The ``value_counts()`` result for the ``Location`` column.
    """
    reviews_per_location = df['Location'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    reviews_per_location.plot(kind='bar', ax=ax)
    ax.set_title('Review Distribution by Location')
    ax.set_xlabel('Location')
    ax.set_ylabel('Number of Reviews')
    fig.tight_layout()
    fig.savefig('location_distribution.png')

    return reviews_per_location

In [None]:
# Insight 2: Temporal Analysis
def temporal_analysis(df):
    """Plot how many reviews were recorded at each point in time.

    The ``date`` column of ``df`` is converted to datetimes in place. The
    number of reviews per distinct timestamp is then counted, ordered
    chronologically, drawn as a line chart and written to
    ``temporal_analysis.png``.

    Returns:
        The chronologically sorted counts, indexed by timestamp.
    """
    df['date'] = pd.to_datetime(df['date'])
    reviews_per_date = df['date'].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    reviews_per_date.plot(kind='line', ax=ax)
    ax.set(title='Reviews by Date', xlabel='Date', ylabel='Number of Reviews')
    fig.tight_layout()
    fig.savefig('temporal_analysis.png')

    return reviews_per_date

In [None]:
# Insight 3: Sentiment Analysis
def perform_sentiment_analysis(df):
    """Score each review by the sentiment keywords it contains.

    Adds two integer columns to ``df`` in place:

    * ``positive_count`` - number of distinct positive keywords in the review
    * ``negative_count`` - number of distinct negative keywords in the review

    Keywords are matched case-insensitively as whole words or phrases, so
    ``'old'`` is not found inside "cold" and ``'comfortable'`` is not found
    inside "uncomfortable". Each keyword counts at most once per review.
    Reviews that are not strings (for example missing values) score 0 for
    both counts instead of raising.

    A scatter plot of positive versus negative counts is written to
    ``sentiment_scatter.png``.

    Returns:
        A two-column DataFrame holding ``positive_count`` and
        ``negative_count`` for every row of ``df``.
    """
    positive_keywords = ['great', 'loved', 'excellent', 'nice', 'clean', 'comfortable', 'friendly']
    negative_keywords = ['old', 'musty', 'small', 'needs work', 'thin', 'poor', 'unresponsive']

    def compile_keywords(keywords):
        # \b anchors stop a keyword from matching inside a longer word
        # (plain substring tests would count 'thin' in "things").
        return [re.compile(r'\b' + re.escape(keyword) + r'\b') for keyword in keywords]

    # Compile once, outside the per-review loop.
    positive_patterns = compile_keywords(positive_keywords)
    negative_patterns = compile_keywords(negative_keywords)

    def count_sentiment_keywords(review):
        # Missing reviews arrive as NaN (a float); treat them as neutral.
        if not isinstance(review, str):
            return 0, 0
        review = review.lower()
        pos_count = sum(1 for pattern in positive_patterns if pattern.search(review))
        neg_count = sum(1 for pattern in negative_patterns if pattern.search(review))
        return pos_count, neg_count

    counts = [count_sentiment_keywords(review) for review in df['Review']]
    # Explicit index and dtype keep the columns well-formed even for an empty frame.
    df['positive_count'] = pd.Series(
        [pos for pos, _ in counts], index=df.index, dtype='int64'
    )
    df['negative_count'] = pd.Series(
        [neg for _, neg in counts], index=df.index, dtype='int64'
    )

    plt.figure(figsize=(10, 6))
    plt.scatter(df['positive_count'], df['negative_count'])
    plt.title('Sentiment Analysis: Positive vs Negative Keywords')
    plt.xlabel('Positive Keyword Count')
    plt.ylabel('Negative Keyword Count')
    plt.tight_layout()
    plt.savefig('sentiment_scatter.png')
    return df[['positive_count', 'negative_count']]

In [None]:
# Insight 4: Review Length Analysis
def analyze_review_length(df):
    """Measure every review in characters and chart the distribution.

    Stores the character count of each ``Review`` in a new
    ``review_length`` column of ``df`` (modified in place), draws a 20-bin
    histogram of those lengths and writes it to
    ``review_length_distribution.png``.

    Returns:
        The ``review_length`` column.
    """
    df['review_length'] = df['Review'].str.len()

    fig, ax = plt.subplots(figsize=(10, 6))
    df['review_length'].plot(kind='hist', bins=20, ax=ax)
    ax.set_title('Distribution of Review Lengths')
    ax.set_xlabel('Review Length (characters)')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    fig.savefig('review_length_distribution.png')

    return df['review_length']

In [None]:
# Insight 5: Keyword Frequency Analysis
def analyze_keyword_frequency(df):
    """Find the 15 most frequent meaningful words across all reviews.

    All ``Review`` texts (missing ones treated as empty) are joined,
    lower-cased and split into ``\\w+`` tokens. English stop words and tokens
    of two characters or fewer are discarded before counting. The top 15
    words are drawn as a bar chart saved to ``keyword_frequency.png``.

    Returns:
        A dict mapping each of the top words to its frequency.
    """
    corpus = ' '.join(df['Review'].fillna('')).lower()
    ignored = set(stopwords.words('english'))

    # Counting straight from the token stream keeps first-seen order,
    # which is what most_common() uses to break ties.
    word_freq = Counter(
        token
        for token in re.findall(r'\w+', corpus)
        if len(token) > 2 and token not in ignored
    )
    top_words = word_freq.most_common(15)

    labels = [token for token, _ in top_words]
    heights = [count for _, count in top_words]

    plt.figure(figsize=(12, 6))
    plt.bar(labels, heights)
    plt.title('Top 15 Most Frequent Words')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('keyword_frequency.png')

    return dict(top_words)

In [None]:
# New Insight: Polarity and Subjectivity Analysis
def analyze_polarity_subjectivity(df):
    """Score every review's polarity and subjectivity with TextBlob.

    Adds ``polarity`` and ``subjectivity`` float columns to ``df`` in place.
    Missing reviews are scored as empty text instead of raising, matching
    how ``analyze_keyword_frequency`` treats them. Each review is analysed
    once and both scores are taken from that single result.

    Two histograms (with KDE overlay) are written to
    ``polarity_distribution.png`` and ``subjectivity_distribution.png``.

    Returns:
        A two-column DataFrame holding ``polarity`` and ``subjectivity``
        for every row of ``df``.
    """
    # One TextBlob pass per review; fillna('') guards against NaN entries,
    # which TextBlob rejects because they are not strings.
    sentiments = [TextBlob(text).sentiment for text in df['Review'].fillna('')]
    # Explicit index and dtype keep the columns numeric even for an empty frame.
    df['polarity'] = pd.Series(
        [sentiment.polarity for sentiment in sentiments],
        index=df.index,
        dtype='float64',
    )
    df['subjectivity'] = pd.Series(
        [sentiment.subjectivity for sentiment in sentiments],
        index=df.index,
        dtype='float64',
    )

    # (column, axis label, bar colour, output file) for each histogram.
    charts = (
        ('polarity', 'Polarity', 'blue', 'polarity_distribution.png'),
        ('subjectivity', 'Subjectivity', 'green', 'subjectivity_distribution.png'),
    )
    for column, label, color, filename in charts:
        plt.figure(figsize=(10, 6))
        sns.histplot(df[column], kde=True, bins=20, color=color)
        plt.title(f'{label} Distribution of Reviews')
        plt.xlabel(label)
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.savefig(filename)

    return df[['polarity', 'subjectivity']]

In [None]:
# New Insight: Word Cloud Generation
def generate_word_cloud(df, sentiment='positive'):
    """Render a word cloud from the reviews of one sentiment class.

    ``df`` must already contain the ``positive_count`` and ``negative_count``
    columns. With ``sentiment='positive'`` the cloud is built from reviews
    whose positive count exceeds their negative count; any other value
    selects reviews whose negative count exceeds their positive count.

    The image is saved as ``<sentiment>_word_cloud.png`` and shown. If no
    review text falls into the requested class, a message is printed and no
    image is produced, so one empty class does not abort the whole analysis.

    Returns:
        None.
    """
    if sentiment == 'positive':
        selected = df['positive_count'] > df['negative_count']
    else:
        selected = df['negative_count'] > df['positive_count']
    reviews = ' '.join(df.loc[selected, 'Review'].fillna(''))

    # WordCloud.generate raises ValueError when it is given no words,
    # so bail out early when the selection is empty.
    if not reviews.strip():
        print(f'No {sentiment} review text available; skipping word cloud.')
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(reviews)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'{sentiment.capitalize()} Review Word Cloud')
    plt.tight_layout()
    plt.savefig(f'{sentiment}_word_cloud.png')
    plt.show()

In [None]:
# Main Analysis Function
def comprehensive_hotel_review_analysis(df):
    """Run every review analysis on ``df`` and collect the outputs.

    The analyses run in a fixed order. The sentiment step must come before
    the word clouds because it adds the ``positive_count`` and
    ``negative_count`` columns that the word clouds filter on.

    Returns:
        A dict mapping each analysis title to that analysis' return value.
    """
    pipeline = (
        ('Location Distribution', analyze_location_distribution),
        ('Temporal Analysis', temporal_analysis),
        ('Sentiment Analysis', perform_sentiment_analysis),
        ('Review Length', analyze_review_length),
        ('Keyword Frequency', analyze_keyword_frequency),
        ('Polarity and Subjectivity', analyze_polarity_subjectivity),
    )
    results = {title: analysis(df) for title, analysis in pipeline}

    # Word clouds are saved and shown only; they add nothing to the results.
    for sentiment_class in ('positive', 'negative'):
        generate_word_cloud(df, sentiment_class)

    return results

In [None]:
# Run the analysis
# Needs `df` from the data-loading cell. The call writes the PNG charts as a
# side effect and returns a dict keyed by analysis title.
analysis_results = comprehensive_hotel_review_analysis(df)



In [None]:
# Report the findings: a completion banner, then each result under its title
print("Analysis Complete. Check the generated visualizations!")
for section_title in analysis_results:
    print(f"\n{section_title}:")
    print(analysis_results[section_title])