In [None]:
import os
import pandas as pd

# 1. Data Load

- List the files in the airb dataset directory

In [None]:
# Build the dataset directory path under the data folder and show its contents.
data_dir = '../data'
airb_dir = os.path.join(data_dir, 'airb')
airb_files = os.listdir(airb_dir)
print(airb_files)

In [None]:
# Resolve the review CSV inside the dataset directory and check that it is a file.
airb_csv = os.path.join(airb_dir, 'AirBNBReviews.csv')
csv_exists = os.path.isfile(airb_csv)
print(airb_csv)
print("Check in csv file: ", csv_exists)

In [None]:
# Read the review CSV into the DataFrame that the following cells work on.
airb_df = pd.read_csv(filepath_or_buffer=airb_csv)

# 2. Data Analysis

## Check the data

In [None]:
# Preview only the first rows, rendered as a rich table by the notebook,
# instead of printing the whole frame as plain text; report its size separately.
print("Shape (rows, columns): ", airb_df.shape)
airb_df.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
airb_df.info()

- Check the label distribution

In [None]:
# Count the rows for every (Genre, Positive or Negative) combination.
label_counts = airb_df.groupby(['Genre', 'Positive or Negative']).size()
print(label_counts)

## Plot the data distribution

In [None]:
import matplotlib.pyplot as plt

# Row counts per Genre, with one column per 'Positive or Negative' value.
genre_label_counts = airb_df.groupby(['Genre', 'Positive or Negative']).size().unstack()

# Stacked bars: one bar per Genre, split by label. Drawing on an explicit Axes
# lets the figure carry its own title and axis labels.
fig, ax = plt.subplots()
genre_label_counts.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Reviews per Genre by label')
ax.set_xlabel('Genre')
ax.set_ylabel('Number of reviews')
plt.show()

#### Word Clouds
Word clouds can give you a visual representation of the most frequently used words in your dataset. The more a specific word appears in your text, the bigger and bolder it appears in the word cloud.

In [None]:
from wordcloud import WordCloud

# Intentional failure demo: while the 'Review' column still contains NaN
# (a float), ' '.join() cannot concatenate the values and raises TypeError.
# The error is caught and printed so that "Restart & Run All" can continue
# past this cell while the message stays visible for the discussion below.
try:
    text = ' '.join(review for review in airb_df.Review)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Reached only when every review is already a string.
    wordcloud = WordCloud(background_color='white').generate(text)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


- Check the error from the cell above!
    - Why did the error occur?
    - How can the error be fixed?
    - ***Check the data right now!!***

In [None]:
# Number of missing values in each column.
null_counts = airb_df.isna().sum()
print(null_counts)

In [None]:
from wordcloud import WordCloud

# Replace missing reviews with empty strings so every entry can be joined as text.
airb_df['Review'] = airb_df['Review'].fillna('')  # resolve the NaN issue

# Concatenate all reviews into one space-separated string and build the cloud from it.
text = ' '.join(airb_df['Review'])
cloud_builder = WordCloud(background_color='white')
wordcloud = cloud_builder.generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from wordcloud import STOPWORDS

# Start from the stopword list shipped with WordCloud, copied into a new set.
stopwords = set(STOPWORDS)

# To ignore additional words, extend the set. For example, to drop
# 'the' and 'and':
# stopwords.update(['the', 'and'])

# Make sure no NaN is left in the review column before any text is joined.
airb_df['Review'] = airb_df['Review'].fillna('')

def generate_wordcloud(reviews, random_state=1502):
    """Render a word cloud for a collection of review texts.

    Args:
        reviews: Iterable of review strings (for example a pandas Series).
            Entries that are not strings, such as NaN for a missing review,
            are skipped.
        random_state: Seed passed to ``WordCloud`` so repeated runs produce
            the same layout.

    Raises:
        ValueError: If ``reviews`` contains no text at all.
    """
    # Skip non-string entries so a stray NaN cannot break ' '.join().
    text = ' '.join(review for review in reviews if isinstance(review, str))
    if not text.strip():
        # Fail early with a clear message instead of an error from deep
        # inside the word-cloud layout code.
        raise ValueError("No review text available to build a word cloud.")

    # `stopwords` is the notebook-level set of words to leave out of the cloud.
    wordcloud = WordCloud(
        stopwords=stopwords,
        background_color='white',
        random_state=random_state,
    ).generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Split the reviews by the value of the 'Positive or Negative' column
# (1 and 0) and draw one word cloud for each subset.
sentiment_label = airb_df['Positive or Negative']
positive_reviews = airb_df.loc[sentiment_label == 1, 'Review']
negative_reviews = airb_df.loc[sentiment_label == 0, 'Review']

for caption, reviews_subset in (
    ("Word cloud for positive reviews", positive_reviews),
    ("Word cloud for negative reviews", negative_reviews),
):
    print(caption)
    generate_wordcloud(reviews_subset)


#### Histogram of Review Lengths

This can give you an idea of the distribution of the lengths of reviews.

In [None]:
# Character count of each review, shown as a histogram.
review_lengths = airb_df['Review'].str.len()
review_lengths.hist()
plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.show()

#### Pie Chart of Sentiment Distribution:

This will give a good view of the data balance between positive and negative reviews.

In [None]:
# Share of each 'Positive or Negative' value, labelled with one-decimal percentages.
sentiment_counts = airb_df['Positive or Negative'].value_counts()
sentiment_counts.plot.pie(autopct='%1.1f%%')
plt.show()

#### Heatmap of Term Frequency

 You could create a document-term matrix and then plot a heatmap to visualize the frequency of terms.

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

# Build the document-term matrix (one row per review, one column per term),
# leaving out English stop words.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(airb_df['Review'])
terms = vectorizer.get_feature_names_out()

# Sum each term's column directly on the sparse matrix. Converting the whole
# matrix to a dense array first (X.toarray()) allocates
# n_reviews x n_terms cells only to add them up. The column sums come back
# as a 1 x n_terms result, which is flattened to a 1-D array.
frequency = np.asarray(X.sum(axis=0)).ravel()

df = pd.DataFrame(frequency, index=terms, columns=["Frequency"])

# Top 20 frequent terms
top_terms = df.sort_values(by="Frequency", ascending=False).head(20)

sns.heatmap(top_terms, annot=True, cmap="YlGnBu")
plt.show()


### 3. Select Model for Sentiment Analysis

- We will use a pretrained BERT model for sentiment analysis.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the classification model are loaded from the same checkpoint.
checkpoint_name = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

In [None]:
# Show the review stored under index label 0 next to its tokenized form.
first_review = airb_df['Review'][0]
first_review_tokens = tokenizer.tokenize(first_review)
print('Raw text: ', first_review)
print('Tokenized: ', first_review_tokens)

#### Encode The Review Sentiments

In [None]:
# Run the tokenizer on the review stored under index label 0.
review_to_encode = airb_df['Review'][0]
encoded_input = tokenizer(review_to_encode)

In [None]:
# Map the encoded IDs back to their token strings and show both views.
first_review_ids = encoded_input["input_ids"]
first_review_id_tokens = tokenizer.convert_ids_to_tokens(first_review_ids)
print("Tokens for the first review: ", first_review_id_tokens)
print("Token IDs for the first review: ", first_review_ids)

#### Check the Vocabulary

In [None]:
from itertools import islice

# Token -> ID mapping of the tokenizer.
vocab = tokenizer.get_vocab()

# Print only the first 10 (token, ID) pairs of the vocabulary.
# islice stops after exactly 10 items; the previous `if i > 10: break`
# check ran after the print and therefore let 12 items through.
for token, token_id in islice(vocab.items(), 10):
    print(token, token_id)

In [None]:
# Invert the vocabulary so a token can be looked up by its integer ID.
id_to_token = dict(zip(vocab.values(), vocab.keys()))

# Replace 1012 with the ID you want to look up
token = id_to_token[1012]
print("Token for ID 1012:", token)

#### Decode the Reviews from Tokens to Words

In [None]:
# Keep the token IDs of the encoded review under their own name for decoding.
encoded_review = encoded_input["input_ids"]
print(
    "Token IDs for the first review: ",
    encoded_review,
)

In [None]:
# Turn the token IDs back into a single text string.
decoded_review = tokenizer.decode(token_ids=encoded_review)
print(
    "Decoded review: ",
    decoded_review,
)

# 4. Split the Data

- Split the data into train and test sets.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    airb_df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many rows ended up in each part of the split.
n_train_rows = len(train_data)
n_test_rows = len(test_data)
print("Length of training data: ", n_train_rows)
print("Length of test data: ", n_test_rows)