In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading Data

In [None]:
# Read the raw review table from the Kaggle dataset and preview the first rows.
REVIEWS_CSV = "/kaggle/input/restaurant-reviews/Restaurant reviews.csv"
df = pd.read_csv(REVIEWS_CSV)
df.head()

## Preprocessing

In [None]:
# Discard the columns that the rest of the notebook does not use.
unused_columns = ["Restaurant", "Reviewer", "Metadata", "Pictures", "7514"]
df = df.drop(columns=unused_columns)

In [None]:
# Keep the text before the first whitespace, then its last "/"-separated field
# (presumably the year of an M/D/YYYY timestamp — TODO confirm against the data).
date_parts = [str(value).split()[0] for value in df["Time"]]
df["Time"] = [str(part).split("/")[-1] for part in date_parts]

In [None]:
# Replace the non-numeric "Like" rating with the most frequent rating value.
most_common_rating = df['Rating'].value_counts().idxmax()
is_like = df["Rating"] == "Like"
df['Rating'] = np.where(is_like, most_common_rating, df['Rating'])

In [None]:
# True when the numeric rating is at least 3, False otherwise.
# Note: any NaN rating compares False here and therefore ends up in the negative class.
df["Rating"] = [float(rating) >= 3 for rating in df["Rating"]]

In [None]:
# Missing timestamps were stringified to "nan" earlier; replace them with the most frequent value.
most_common_time = df['Time'].value_counts().idxmax()
is_missing_time = df["Time"] == "nan"
df['Time'] = np.where(is_missing_time, most_common_time, df['Time'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the Time column; fitting and transforming happen on the same data in one step.
mmsTime = MinMaxScaler()
df["Time"] = mmsTime.fit_transform(df[["Time"]])

In [None]:
# Fill missing review texts with a placeholder, turn the boolean rating into 0/1,
# and expose it under the name `target`.
df = df.assign(
    Review=df["Review"].fillna("Nothing"),
    Rating=df["Rating"].astype(int),
).rename(columns={'Rating': 'target'})

In [None]:
def create_corpus_df(review, target):
    """Collect every whitespace-separated token from the reviews of one class.

    Args:
        review: DataFrame with a 'Review' text column and a 'target' column.
        target: Value of 'target' selecting which rows contribute tokens.

    Returns:
        A flat list of tokens, in row order, from all matching reviews.
    """
    selected_reviews = review.loc[review['target'] == target, 'Review']
    return [token for tokens in selected_reviews.str.split() for token in tokens]

In [None]:
from collections import defaultdict

# Tokens of all reviews labelled with target 1
restaurant_reviews = create_corpus_df(df, 1)

# Tally how often each token occurs
dic = defaultdict(int)
for word in restaurant_reviews:
    dic[word] = dic[word] + 1

# Ten most frequent (token, count) pairs, highest count first
by_count = sorted(dic.items(), key=lambda item: item[1], reverse=True)
top = by_count[:10]
top

In [None]:
import re
import string

# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Normalize a piece of text for bag-of-words style modelling.

    The input is converted with ``str()`` and lowercased, then the following are
    removed in order: text in square brackets, links, ``<...>`` tags, ASCII
    punctuation, newline characters, and any word that contains a digit.
    Removed spans are deleted, not replaced, so surrounding spaces are kept.

    Args:
        text: Any object; it is stringified before cleaning.

    Returns:
        The cleaned, lowercased string.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        text
    )
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus, which the stop-word list below is read from
nltk.download('stopwords')

In [None]:
# NLTK's English stop words extended with a few extra short tokens
more_stopwords = ['u', 'im', 'c']
stop_words = [*stopwords.words('english'), *more_stopwords]

# Stemmer used to reduce each kept word to its stem
stemmer = nltk.SnowballStemmer("english")

def preprocess_data(text):
    """Clean a review, drop stop words and stem the remaining words.

    Args:
        text: Raw review text (any object accepted by `clean_text`).

    Returns:
        A single string of stemmed, non-stop-word tokens joined by one space.
    """
    # Clean punctuation, URLs, and so on
    cleaned = clean_text(text)
    # split() with no argument splits on any run of whitespace, so the gaps
    # clean_text leaves behind (double spaces, tabs) produce no empty tokens.
    kept_words = (word for word in cleaned.split() if word not in stop_words)
    # Stem every remaining word and rebuild the sentence
    return ' '.join(stemmer.stem(word) for word in kept_words)

In [None]:
import  re
import string 

# Run the clean / stop-word / stemming pipeline over every review text.
cleaned_reviews = df['Review'].map(preprocess_data)
df['Review'] = cleaned_reviews
df.head()

In [None]:
def create_corpus_df(review, target):
    """Collect every whitespace-separated token from the reviews of one class.

    Args:
        review: DataFrame with a 'Review' text column and a 'target' column.
        target: Value of 'target' selecting which rows contribute tokens.

    Returns:
        A flat list of tokens, in row order, from all matching reviews.
    """
    selected_reviews = review.loc[review['target'] == target, 'Review']
    return [token for tokens in selected_reviews.str.split() for token in tokens]

In [None]:
from collections import defaultdict

# Tokens of all (now preprocessed) reviews labelled with target 1
restaurant_reviews = create_corpus_df(df, 1)

# Tally how often each token occurs
dic = defaultdict(int)
for word in restaurant_reviews:
    dic[word] = dic[word] + 1

# Ten most frequent (token, count) pairs, highest count first
by_count = sorted(dic.items(), key=lambda item: item[1], reverse=True)
top = by_count[:10]
top

## Modeling

In [None]:
# Features are the preprocessed review texts; labels are the binary targets.
x = df['Review']
y = df['target']

# Split into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Sanity check: features and labels of each split have matching lengths.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(len(features), len(labels))

### XGBoost 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import metrics
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Token counts -> TF-IDF weighting -> XGBoost classifier
xgb_steps = [
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc')),
]
pipe = Pipeline(xgb_steps)

# Fit the pipeline with the data
pipe.fit(x_train, y_train)

# Predict on both splits to compare train and test accuracy
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print('Train: {}'.format(train_accuracy))
print('Test: {}'.format(test_accuracy))

# Calculate and display the confusion matrix for the test split
cm = confusion_matrix(y_test, y_pred_class)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### LightGBM 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import metrics
import lightgbm as lgb  # Importing LightGBM
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Token counts -> TF-IDF weighting -> LightGBM classifier
lgb_steps = [
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', lgb.LGBMClassifier(
        objective='binary',  # or 'multiclass' for multi-class classification
        metric='auc',
    )),
]
pipe = Pipeline(lgb_steps)

# Fit the pipeline with the data
pipe.fit(x_train, y_train)

# Predict on both splits to compare train and test accuracy
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print('Train: {}'.format(train_accuracy))
print('Test: {}'.format(test_accuracy))

# Calculate and display the confusion matrix for the test split
cm = confusion_matrix(y_test, y_pred_class)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()


### LSTM 

In [None]:
# Take the review texts and their labels from the whole frame (not the earlier
# x_train/x_test split); they are tokenized, padded and re-split further down.
train_reviews = df['Review'].values
train_target = df['target'].values

In [None]:
# Calculate the length of our vocabulary
from keras.preprocessing.text import Tokenizer
word_tokenizer = Tokenizer()
# Build the word -> integer index mapping from every review text
word_tokenizer.fit_on_texts(train_reviews)

# One more than the number of indexed words; presumably the extra slot is for
# index 0, which Keras reserves for padding — TODO confirm
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

In [None]:
def show_metrics(pred_tag, y_test):
    """Print F1, precision, recall, accuracy and a full classification report.

    Args:
        pred_tag: Predicted class labels.
        y_test: Ground-truth class labels, aligned with `pred_tag`.
    """
    # Imported here so the helper does not depend on which cells ran before it.
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        f1_score,
        precision_score,
        recall_score,
    )

    # scikit-learn metrics take (y_true, y_pred); passing them the other way
    # round swaps precision with recall and mislabels the report's rows.
    print("F1-score: ", f1_score(y_test, pred_tag))
    print("Precision: ", precision_score(y_test, pred_tag))
    print("Recall: ", recall_score(y_test, pred_tag))
    print("Accuracy: ", accuracy_score(y_test, pred_tag))
    print("-"*50)
    print(classification_report(y_test, pred_tag))
    
def embed(corpus): 
    """Convert the texts in `corpus` to sequences via the module-level `word_tokenizer`.

    Returns whatever `word_tokenizer.texts_to_sequences(corpus)` returns
    (presumably one list of integer word indices per text — TODO confirm).
    """
    return word_tokenizer.texts_to_sequences(corpus)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download the NLTK 'punkt' tokenizer data
nltk.download('punkt')

# The review with the most NLTK tokens (first one on ties) sets the padded length.
# NOTE(review): the length comes from word_tokenize while the sequences come from
# the Keras tokenizer; the two may count tokens differently — confirm nothing is truncated.
token_counts = [len(word_tokenize(sentence)) for sentence in train_reviews]
length_long_sentence = max(token_counts)
longest_train = train_reviews[token_counts.index(length_long_sentence)]

# Convert reviews to integer sequences and pad them at the end to a common length
train_sequences = embed(train_reviews)
train_padded_sentences = pad_sequences(
    train_sequences,
    maxlen=length_long_sentence,
    padding='post'
)

train_padded_sentences

In [None]:
embedding_dim = 100
# NOTE: this dictionary is created empty and never filled in this cell, so no
# lookup below succeeds and embedding_matrix stays all zeros.
embeddings_dictionary = {}
embedding_matrix = np.zeros((vocab_length, embedding_dim))

# Copy the vector of every vocabulary word that has one into its row of the matrix
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

embedding_matrix

In [None]:
# Split the padded sequences into train and test sets.
# random_state pins the shuffle so the split (and every score derived from it)
# is reproducible; 42 matches the seed used for the text-level split above.
# Note: this rebinds y_train / y_test from that earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences, 
    train_target, 
    test_size=0.25,
    random_state=42
)

In [None]:
def glove_lstm():
    """Build and compile the bidirectional-LSTM binary classifier.

    The embedding layer is sized from, and initialised with, the module-level
    `embedding_matrix`; sequence length and the LSTM/Dense widths all use the
    module-level `length_long_sentence`.

    Returns:
        The compiled Keras `Sequential` model (rmsprop, binary cross-entropy,
        accuracy metric).
    """
    vocab_size, vector_size = embedding_matrix.shape

    layers = [
        Embedding(
            input_dim=vocab_size,
            output_dim=vector_size,
            weights=[embedding_matrix],
            input_length=length_long_sentence,
        ),
        # return_sequences=True keeps the per-timestep outputs for the pooling layer
        Bidirectional(LSTM(
            length_long_sentence,
            return_sequences=True,
            recurrent_dropout=0.2,
        )),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        # Single sigmoid unit for the binary target
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential(layers)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, BatchNormalization, Dense, Dropout
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

In [None]:
model = glove_lstm()

# Save the model to 'model.h5' whenever the validation loss improves.
checkpoint = ModelCheckpoint(
    'model.h5', 
    monitor = 'val_loss', 
    verbose = 1, 
    save_best_only = True
)
# Multiply the learning rate by 0.2 when the validation loss stops improving.
# The model is compiled with optimizer='rmsprop', whose default learning rate
# is 0.001, so the floor must be below that for any reduction to happen; and
# with only 6 epochs the patience must be short enough to trigger before the
# run ends.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss', 
    factor = 0.2, 
    verbose = 1, 
    patience = 2,
    min_lr = 1e-5
)
history = model.fit(
    X_train, 
    y_train, 
    epochs = 6,
    batch_size = 32,
    validation_data = (X_test, y_test),
    verbose = 1,
    callbacks = [reduce_lr, checkpoint]
)

In [None]:
import matplotlib.pyplot as plt

def plot_learning_curves(history, metrics):
    """Plot groups of training-history curves side by side.

    Args:
        history: Object whose `.history` maps a metric name to its per-epoch values.
        metrics: List of groups of metric names; each group gets its own panel
            and the first name in a group is used for the title and y-label.
    """
    fig = plt.figure(figsize=(12, 4))
    n_panels = len(metrics)

    for position, metric_group in enumerate(metrics, start=1):
        ax = fig.add_subplot(1, n_panels, position)
        for series_name in metric_group:
            ax.plot(history.history[series_name], label=series_name)
        headline = metric_group[0]
        ax.set_title('Model {}'.format(headline))
        ax.set_xlabel('Epochs')
        ax.set_ylabel(headline)
        ax.legend()

    plt.show()

# Use the function to show the loss and accuracy curves for training and validation
plot_learning_curves(history, [['loss', 'val_loss'], ['accuracy', 'val_accuracy']])
