In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from langdetect import detect  # For language detection

# Downloading NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Loading the CSV data into a DataFrame
data = pd.read_csv('singapore_airlines_reviews.csv')  # Replace 'your_data.csv' with the path to your CSV file

# Function to detect and filter out non-English text
def filter_non_english(text):
    try:
        if detect(text) == 'en':
            return True
        else:
            return False
    except:
        return False

# Filtering non-English text
data = data[data['text'].apply(filter_non_english)]

# Renaming the columns to English
data = data.rename(columns={'type': 'type', 'rating': 'rating', 'text': 'text'})

# Tokenization (to exclude non-English words)
data['tokens'] = data['text'].apply(lambda x: [word for word in word_tokenize(x.lower()) if word.isalpha()])

# Removing stopwords
stop_words = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatization process
lemmatizer = WordNetLemmatizer()
data['tokens'] = data['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Text representation using Bag-of-Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'])
text_representation = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Selecting only the desired columns from the original DataFrame
data_selected_columns = data[['type', 'rating', 'text']]

data_with_representation = pd.concat([data_selected_columns, text_representation], axis=1)

# Displaying the DataFrame with tokenization and text representation
data_with_representation.head()


[nltk_data] Downloading package punkt to /Users/adese/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/adese/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/adese/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,type,rating,text,00,000,000ft,001,0010,0011,0025,...,천하의,첫째,태초에,하나님의,하나님이,하늘이라,하시고,하시니,혼돈하고,흑암이
0,review,3.0,We used this airline to go from Singapore to L...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,review,5.0,The service on Singapore Airlines Suites Class...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,review,1.0,"Booked, paid and received email confirmation f...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,review,5.0,"Best airline in the world, seats, food, servic...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,review,2.0,Premium Economy Seating on Singapore Airlines ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
