In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import os
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources downloaded by this notebook: the English
# stop-word list, the punkt tokenizer models (classic and "punkt_tab"
# variants) and the WordNet corpus.
nltk_resources = ['stopwords', 'punkt', 'wordnet', 'punkt_tab']
download_status = [nltk.download(resource) for resource in nltk_resources]

# Show the status of the last download as the cell result
download_status[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Text Collection

In [None]:
# Split the main CSV file into one text file per review

# Read the full review table from Drive
file_path = "/content/drive/MyDrive/Colab Data Files /SADB/tripadvisor_hotel_reviews.csv"
data = pd.read_csv(file_path)

# Only the first 55 entries of the 'Review' column are exported
first_55_reviews = data['Review'].head(55)

# Destination folder (created on first run, reused afterwards)
output_folder = "/content/drive/MyDrive/Colab Data Files /SADB/Review Text Files"
output_folder_path = Path(output_folder)
output_folder_path.mkdir(exist_ok=True)

# Write each review to its own file: review_1.txt, review_2.txt, ...
for position, review in enumerate(first_55_reviews):
    target = output_folder_path / f"review_{position + 1}.txt"
    target.write_text(str(review), encoding="utf-8")
print(f"Reviews have been saved in the folder: {output_folder_path}")


Reviews have been saved in the folder: /content/drive/MyDrive/Colab Data Files /SADB/Review Text Files


In [None]:
# Text Preprocessing
# Normalisation (lowercasing), tokenization, removal of punctuation and stop words.
# Reads every review_*.txt from the input folder and writes the cleaned text to
# "<name>_processed.txt" in the processed folder.

base_directory = '/content/drive/MyDrive/Colab Data Files /SADB/'
input_directory = os.path.join(base_directory, 'Review Text Files')
processed_dir = os.path.join(base_directory, 'Processed Text')

# Make the processed directory if it doesn't exist
os.makedirs(processed_dir, exist_ok=True)

stop_words = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)

# Sorted so that re-runs process (and log) the files in a deterministic order
for file_name in sorted(os.listdir(input_directory)):

    # Only handle the review text files; anything else in the folder
    # (sub-directories, stray files) would otherwise crash or be mis-named.
    if not file_name.endswith('.txt'):
        continue

    file_path = os.path.join(input_directory, file_name)
    # The review files were written as UTF-8, so read them back as UTF-8
    with open(file_path, 'r', encoding='utf-8') as file:
        document_content = file.read()

    # Lowercasing
    document_content = document_content.lower()

    # Tokenize the text
    tokens = word_tokenize(document_content)

    # Removal of punctuation and stop words.
    # A token counts as punctuation when EVERY character is a punctuation
    # character. (Testing `word in string.punctuation` is a substring test
    # and lets multi-character tokens such as "..." or "--" slip through.)
    tokens = [
        word for word in tokens
        if word not in stop_words
        and not all(char in punctuation_chars for char in word)
    ]

    cleaned_content = ' '.join(tokens)

    # Saving the processed text
    stem = os.path.splitext(file_name)[0]
    processed_file_path = os.path.join(processed_dir, stem + "_processed.txt")
    with open(processed_file_path, 'w', encoding='utf-8') as processed_file:
        processed_file.write(cleaned_content)

    print(f"Processed and saved file: {file_name}")


Processed and saved file: review_1.txt
Processed and saved file: review_10.txt
Processed and saved file: review_3.txt
Processed and saved file: review_2.txt
Processed and saved file: review_5.txt
Processed and saved file: review_9.txt
Processed and saved file: review_7.txt
Processed and saved file: review_8.txt
Processed and saved file: review_4.txt
Processed and saved file: review_6.txt
Processed and saved file: review_11.txt
Processed and saved file: review_13.txt
Processed and saved file: review_15.txt
Processed and saved file: review_12.txt
Processed and saved file: review_14.txt
Processed and saved file: review_16.txt
Processed and saved file: review_18.txt
Processed and saved file: review_17.txt
Processed and saved file: review_19.txt
Processed and saved file: review_20.txt
Processed and saved file: review_21.txt
Processed and saved file: review_22.txt
Processed and saved file: review_24.txt
Processed and saved file: review_23.txt
Processed and saved file: review_25.txt
Processed

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

processed_reviews_folder = "/content/drive/MyDrive/Colab Data Files /SADB/Processed Text"

processed_reviews = []

# Reading file content.
# Files are read in sorted filename order, restricted to .txt files and decoded
# as UTF-8 (the same way the TF-IDF cell reads this folder), so that row i of
# the matrix maps to the same file on every run.
for filename in sorted(os.listdir(processed_reviews_folder)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(processed_reviews_folder, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        processed_reviews.append(file.read())

# Bag-of-Words representation.
# lowercase=False: the text was already lowercased during preprocessing.
vectorizer = CountVectorizer(lowercase=False)
bow_matrix = vectorizer.fit_transform(processed_reviews)

# Shape of the matrix: (number of documents, vocabulary size)
print("Number of files and the number of unique words overall:", bow_matrix.shape)

# Print vocabulary with indices, five "word: column index" pairs per line
print("\nVocabulary:")
words = list(vectorizer.vocabulary_.items())
for i in range(0, len(words), 5):
    row_words = words[i:i + 5]
    formatted_words = [f"{word:<15}: {index:>5}" for word, index in row_words]
    print(" ".join(formatted_words))


Number of files and the number of unique words overall: (55, 1713)

Vocabulary:
nice           :  1019 hotel          :   763 expensive      :   551 parking        :  1100 got            :   683
good           :   682 deal           :   419 stay           :  1456 anniversary    :   114 arrived        :   137
late           :   854 evening        :   538 took           :  1551 advice         :    84 previous       :  1173
reviews        :  1283 valet          :  1616 check          :   297 quick          :  1208 easy           :   506
little         :   886 disappointed   :   456 non            :  1028 existent       :   545 view           :  1624
room           :  1291 clean          :   318 size           :  1382 bed            :   188 comfortable    :   338
woke           :  1687 stiff          :  1464 neck           :  1004 high           :   735 pillows        :  1131
soundproof     :  1415 like           :   877 heard          :   719 music          :   996 night          :  1023


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

directory = "/content/drive/MyDrive/Colab Data Files /SADB/Processed Text"

# Collect the processed review files (sorted by name) and load their text
file_names = [name for name in sorted(os.listdir(directory)) if name.endswith(".txt")]
documents = []
for name in file_names:
    with open(os.path.join(directory, name), 'r', encoding='utf-8') as file:
        documents.append(file.read())

# Fit TF-IDF on all documents; row i of the matrix belongs to file_names[i]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

feature_names = vectorizer.get_feature_names_out()

# For each document, list its terms with their TF-IDF weight, highest first
for doc_idx, file_name in enumerate(file_names):
    print(f"\nTF-IDF values for {file_name}:")
    row = tfidf_matrix[doc_idx]          # sparse 1 x n_features row
    feature_index = row.nonzero()[1]     # columns holding a non-zero weight
    scored_terms = [(column, row[0, column]) for column in feature_index]
    # Stable sort: terms with equal weight keep their stored order
    scored_terms.sort(key=lambda pair: pair[1], reverse=True)
    for idx, score in scored_terms:
        print(f"{feature_names[idx]}: {score}")



TF-IDF values for review_10_processed.txt:
reception: 0.35244101927532806
delight: 0.22349020970235384
smart: 0.22349020970235384
mild: 0.22349020970235384
negative: 0.22349020970235384
uphill: 0.22349020970235384
ppmarket: 0.22349020970235384
particularly: 0.20257303420307982
guests: 0.20257303420307982
1st: 0.20257303420307982
past: 0.18773206200316486
professional: 0.18773206200316486
liked: 0.18773206200316486
dog: 0.18773206200316486
received: 0.18773206200316486
spoke: 0.17622050963766403
loved: 0.16681488650389084
overall: 0.15886254545456185
staff: 0.1546602194199553
distance: 0.15197391430397592
small: 0.1404623619384751
restaurants: 0.1404623619384751
experience: 0.1404623619384751
excellent: 0.11954518643920106
comfortable: 0.11621576660478695
monaco: 0.1130882574091666
friendly: 0.1130882574091666
stayed: 0.09978734292787703
bed: 0.09978734292787703
great: 0.0743814156062389
room: 0.06286986324073808
hotel: 0.060626548834985444

TF-IDF values for review_11_processed.txt:
a

In [None]:
# Metadata describing the preprocessing / vectorisation pipeline together with
# one sentiment label per processed review file (review_1 ... review_55).

# (step name, description, tools) for every pipeline stage, in order
_pipeline_steps = [
    (
        "Lowercasing",
        "Converting all text to lowercase to ensure uniformity.",
        ["str.lower()"],
    ),
    (
        "Tokenization",
        "This is splitting text into individual words/tokens.",
        ["nltk.word_tokenize"],
    ),
    (
        "Punctuation Removal",
        "This is to remove common punctuation marks.",
        ["string.punctuation"],
    ),
    (
        "Stop Word Removal",
        "This is to remove common words like 'and', 'the', 'for', etc.",
        ["nltk.corpus.stopwords"],
    ),
    (
        "Vectorization",
        "This is the process of creating numerical representations from text data. Methods include Bag of Words and TF-IDF.",
        ["sklearn.feature_extraction.text.CountVectorizer", "sklearn.feature_extraction.text.TfidfVectorizer"],
    ),
]

# Sentiment of each review in file-number order, five reviews per row
_sentiments_by_review = [
    "positive", "neutral", "neutral", "positive", "positive",    # reviews 1-5
    "negative", "positive", "positive", "neutral", "positive",   # reviews 6-10
    "negative", "neutral", "neutral", "negative", "positive",    # reviews 11-15
    "neutral", "positive", "negative", "neutral", "positive",    # reviews 16-20
    "negative", "neutral", "positive", "negative", "neutral",    # reviews 21-25
    "positive", "negative", "neutral", "positive", "negative",   # reviews 26-30
    "neutral", "positive", "negative", "positive", "neutral",    # reviews 31-35
    "negative", "positive", "neutral", "negative", "positive",   # reviews 36-40
    "neutral", "negative", "positive", "neutral", "negative",    # reviews 41-45
    "positive", "neutral", "negative", "positive", "neutral",    # reviews 46-50
    "negative", "positive", "neutral", "negative", "positive",   # reviews 51-55
]

metadata = {
    "vectorization_process": {
        "description": "This file outlines the process of vectorising text data.",
        "steps": [
            {"step": step_name, "description": step_description, "tools": step_tools}
            for step_name, step_description, step_tools in _pipeline_steps
        ],
    },
    "sentiment_labels": [
        {"filename": f"review_{review_number}_processed.txt", "sentiment": sentiment}
        for review_number, sentiment in enumerate(_sentiments_by_review, start=1)
    ],
}


In [None]:
# Write the metadata dictionary to Drive as indented JSON
import json

output_path = "/content/drive/MyDrive/Colab Data Files /SADB/metadata_for_reviews.json"
serialized_metadata = json.dumps(metadata, indent=4)
with open(output_path, "w") as json_file:
    json_file.write(serialized_metadata)
print(f"Metadata file saved to: {output_path}")


# List every "filename: sentiment" pair, one per line.
# NOTE(review): the stored output of this cell shows labels that differ from the
# current metadata literal (e.g. review_2), so it appears to come from an
# earlier run - re-run the cell to refresh it.
label_lines = [f"{item['filename']}: {item['sentiment']}" for item in metadata["sentiment_labels"]]
print(*label_lines, sep="\n")


Metadata file saved to: /content/drive/MyDrive/Colab Data Files /SADB/metadata_for_reviews.json
review_1_processed.txt: positive
review_2_processed.txt: negative
review_3_processed.txt: neutral
review_4_processed.txt: positive
review_5_processed.txt: negative
review_6_processed.txt: neutral
review_7_processed.txt: positive
review_8_processed.txt: negative
review_9_processed.txt: neutral
review_10_processed.txt: positive
review_11_processed.txt: negative
review_12_processed.txt: positive
review_13_processed.txt: neutral
review_14_processed.txt: negative
review_15_processed.txt: positive
review_16_processed.txt: neutral
review_17_processed.txt: positive
review_18_processed.txt: negative
review_19_processed.txt: neutral
review_20_processed.txt: positive
review_21_processed.txt: negative
review_22_processed.txt: neutral
review_23_processed.txt: positive
review_24_processed.txt: negative
review_25_processed.txt: neutral
review_26_processed.txt: positive
review_27_processed.txt: negative
rev

In [None]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [None]:
import pymongo
from pymongo import MongoClient

In [None]:
# Connect to the MongoDB Atlas cluster and select the database used below.
#
# SECURITY: the connection URI contains the database username and password, so
# it must never be written into the notebook. It is read from the MONGODB_URI
# environment variable, falling back to a hidden interactive prompt.
# The credential that was previously hardcoded in this cell must be treated as
# leaked and rotated in Atlas.
import getpass

mongodb_uri = os.environ.get("MONGODB_URI") or getpass.getpass("MongoDB connection URI: ")
reviews_connection = pymongo.MongoClient(mongodb_uri)

db = reviews_connection["Text_Database"]

db

Database(MongoClient(host=['cluster1-shard-00-02.noye9.mongodb.net:27017', 'cluster1-shard-00-00.noye9.mongodb.net:27017', 'cluster1-shard-00-01.noye9.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='Cluster1', authsource='admin', replicaset='atlas-8woafj-shard-0', tls=True), 'Text_Database')

In [None]:
# Test the connection by listing collections.
# Any failure (network, authentication, ...) is reported instead of raised so
# the cell always shows a readable diagnostic.
try:
    print(db.list_collection_names())
except Exception as e:
    print(f"Connection failed: {e}")

['Sentiment_Labels', 'Reviews']


In [None]:
folder_path = "/content/drive/MyDrive/Colab Data Files /SADB/Processed Text"

collection = db["Reviews"]

# Store each processed review in MongoDB as {"filename": ..., "content": ...}.
# Files are visited in sorted order so the log is deterministic between runs.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith("_processed.txt"):
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        document = {
            "filename": filename,
            "content": content
        }

        # Upsert keyed on filename: re-running the cell replaces the existing
        # document instead of inserting a duplicate for the same review.
        # (Duplicates created by earlier runs are not removed by this.)
        collection.replace_one({"filename": filename}, document, upsert=True)
        print(f"Inserted {filename} into MongoDB.")

print("All files have been inserted into the MongoDB collection.")


Inserted review_1_processed.txt into MongoDB.
Inserted review_10_processed.txt into MongoDB.
Inserted review_3_processed.txt into MongoDB.
Inserted review_2_processed.txt into MongoDB.
Inserted review_5_processed.txt into MongoDB.
Inserted review_9_processed.txt into MongoDB.
Inserted review_7_processed.txt into MongoDB.
Inserted review_8_processed.txt into MongoDB.
Inserted review_4_processed.txt into MongoDB.
Inserted review_6_processed.txt into MongoDB.
Inserted review_11_processed.txt into MongoDB.
Inserted review_13_processed.txt into MongoDB.
Inserted review_15_processed.txt into MongoDB.
Inserted review_12_processed.txt into MongoDB.
Inserted review_14_processed.txt into MongoDB.
Inserted review_16_processed.txt into MongoDB.
Inserted review_18_processed.txt into MongoDB.
Inserted review_17_processed.txt into MongoDB.
Inserted review_19_processed.txt into MongoDB.
Inserted review_20_processed.txt into MongoDB.
Inserted review_21_processed.txt into MongoDB.
Inserted review_22_pro

In [None]:
# Define the collection that holds one sentiment label per review file
reviews_collection = db["Sentiment_Labels"]

# Reload the metadata from the JSON file saved earlier
with open('/content/drive/MyDrive/Colab Data Files /SADB/metadata_for_reviews.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Isolate sentiment labels and store them in MongoDB.
# Upsert keyed on filename so that re-running the cell updates the existing
# label instead of inserting a duplicate document for the same review.
# (Duplicates created by earlier runs are not removed by this.)
for item in metadata["sentiment_labels"]:
    document = {
        "filename": item["filename"],
        "sentiment": item["sentiment"]
    }
    reviews_collection.replace_one({"filename": item["filename"]}, document, upsert=True)

print("Sentiment metadata inserted into MongoDB")

Sentiment data inserted into MongoDB


In [None]:
filename_to_search = "review_10_processed.txt"

# Look the review up first; its sentiment label is only fetched when the
# review itself exists.
document = db["Reviews"].find_one({"filename": filename_to_search})

if not document:
    sentiment_document = None
    print("No document found with the specified filename.")
else:
    sentiment_document = db["Sentiment_Labels"].find_one({"filename": filename_to_search})
    print(f"Filename: {document['filename']}\nText: {document['content']}")
    if sentiment_document:
        print(f"Sentiment: {sentiment_document['sentiment']}")
    else:
        print("Sentiment not found for this file.")


Filename: review_10_processed.txt
Text: excellent stayed hotel monaco past w/e delight reception staff friendly professional room smart comfortable bed particularly liked reception small dog received staff guests spoke loved mild negative distance uphill ppmarket restaurants 1st overall great experience
Sentiment: positive
