### Imports

In [None]:
import matplotlib.pyplot as plt
import json
import pandas as pd
import seaborn as sns
import re
from transformers import BertTokenizer, BertForSequenceClassification,AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Export the full review dump as a series of CSV files, `chunk_size` rows each.
chunk_size = 700000
json_file_path = "yelp_academic_dataset_review.json"

# With `chunksize` set, pd.read_json returns an iterator of DataFrames; using
# it as a context manager closes the underlying file once the loop finishes.
with pd.read_json(json_file_path, lines=True, chunksize=chunk_size) as chunks:
    for i, df_chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}")

        # Each chunk goes to its own file: chunk_1.csv, chunk_2.csv, ...
        chunk_csv_path = f"chunk_{i + 1}.csv"
        df_chunk.to_csv(chunk_csv_path, index=False)

In [None]:
# Work with the first chunk only (the first `chunk_size` = 700,000 reviews).
chunk_size = 700000
file = "yelp_academic_dataset_review.json"
first_chunk_csv_path = "first_chunk.csv"

# Read just the first chunk, save it as CSV, then stop. The context manager
# closes the JSON file even though the reader is not exhausted.
with pd.read_json(file, lines=True, chunksize=chunk_size) as chunks1:
    for i, x in enumerate(chunks1):
        print(f"Processing chunk {i + 1}")
        x.to_csv(first_chunk_csv_path, index=False)
        break

# Load the saved chunk back from CSV as the working DataFrame.
df = pd.read_csv(first_chunk_csv_path)

In [None]:
# Display the DataFrame that was loaded from first_chunk.csv.
df

In [None]:

### Visualization
#%%
# Data Preprocessing
# Model inputs: the review text, and a binary target that is 1 for ratings
# above 3 stars and 0 for everything else.
texts = df['text']
labels = (df['stars'] > 3).astype('int64')


In [None]:

# Display the DataFrame again before the plotting cells.
df


In [None]:

# Number of reviews per star rating, ordered by rating value.
star_counts = df['stars'].value_counts().sort_index()

ratings_fig, ratings_ax = plt.subplots(figsize=(8, 6))
ratings_ax.scatter(star_counts.index, star_counts, color='purple')
ratings_ax.set_title('Ratings')
ratings_ax.set_xlabel('Stars')
ratings_ax.set_ylabel('Count')
plt.show()


In [None]:

# Plot the distribution of review lengths (characters per review).
plt.figure(figsize=(8, 6))
df['review_length'] = df['text'].apply(len)
df['review_length'].hist(bins=50, color='red')
# The x-axis carries the review length and the y-axis the number of reviews
# falling into each bin, so the labels say exactly that.
plt.title('Distribution of Review Lengths')
plt.xlabel('Review length (characters)')
plt.ylabel('Number of reviews')
plt.show()


In [None]:

# One box plot per vote type, grouped by star rating.
vote_columns = ['useful', 'funny', 'cool']
fig, axes = plt.subplots(nrows=1, ncols=len(vote_columns), figsize=(18, 6))

for vote_ax, vote_column in zip(axes, vote_columns):
    df.boxplot(column=vote_column, by='stars', ax=vote_ax)
    # Name the vote type in each panel title; with identical titles the three
    # panels cannot be told apart.
    vote_ax.set_title(f'{vote_column.capitalize()} Votes by Stars')
    vote_ax.set_ylabel('Votes')

plt.suptitle('Features by Stars')
plt.show()


In [None]:

### Data Pre-processing
#%%
#### Drop duplicates
# Only `review_id` is used as the de-duplication key. Dropping duplicates on
# `user_id` or `business_id` would keep a single review per user / per business
# and throw away reviews that are distinct from each other.
df = df.drop_duplicates(subset=['review_id'])
df


In [None]:

# Remove Null values
# Report the per-column null counts first, then drop the rows that are missing
# the two fields the model is built from (review text and star rating).
nulls = df.isnull().sum()
print("Nulls:\n", nulls)
df = df.dropna(subset=['text', 'stars'])


In [None]:

# Patterns for the text clean-up: tag-like `<...>` spans, then every character
# that is not a letter, a digit or whitespace.
html_tag_pattern = re.compile(r'<.*?>')
special_char_pattern = re.compile(r'[^a-zA-Z0-9\s]')


def _clean_review(review):
    """Strip tags first, then special characters, from one review string."""
    return special_char_pattern.sub('', html_tag_pattern.sub('', review))


df['text'] = df['text'].map(_clean_review)


In [None]:

# Convert the 'date' column with pd.to_datetime and store it back in place.
df['date'] = pd.to_datetime(df['date'])


In [None]:

# Re-derive the model inputs from the current `df`. The `texts` / `labels`
# captured before the pre-processing cells still refer to the unprocessed
# frame, so tokenizing them would bypass the de-duplication and text clean-up.
# Both are rebuilt together so they stay aligned row for row.
texts = df['text']
labels = df['stars'].apply(lambda x: 1 if x > 3 else 0)

# Build the vocabulary and turn every review into a sequence of word ids.
token = Tokenizer()
token.fit_on_texts(texts)
s = token.texts_to_sequences(texts)
index = token.word_index

# Bring every sequence to the same fixed length for the network input.
maximumlength = 100  # Set your desired sequence length
data = pad_sequences(s, maxlen=maximumlength)


In [None]:

# Encode labels
# Fit the encoder on the targets, then map them to integer class ids.
l_encode = LabelEncoder().fit(labels)
labels = l_encode.transform(labels)


In [None]:

### Train and Test Split
#%%
# Hold out 18% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.18,
    random_state=42,
)


In [None]:

### LSTM Model
#%%
# Embedding -> LSTM -> single sigmoid unit, declared as one layer list.
model = Sequential([
    Embedding(len(index) + 1, 100, input_length=maximumlength),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])


In [None]:

### Compile the model
#%%
# Adam optimizer with binary cross-entropy loss (the network ends in a single
# sigmoid unit); accuracy is tracked as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:

# Training with Data Generator
def data_generator(data, labels, batch_size):
    """Yield shuffled mini-batches of ``(data, labels)`` forever.

    Every pass over the data draws a fresh random permutation of the sample
    indices (from NumPy's global random state) and yields consecutive slices
    of ``batch_size`` samples; the last batch of a pass may be smaller.

    Args:
        data: Array indexable by an integer index array; ``len(data)`` gives
            the number of samples.
        labels: Array indexable by the same index array as ``data``.
        batch_size: Number of samples per batch; must be positive.

    Yields:
        Tuples ``(data[batch_indices], labels[batch_indices])``.

    Raises:
        ValueError: When the first batch is requested, if ``batch_size`` is not
            positive or ``data`` is empty. Either case would otherwise make the
            generator loop forever without producing a batch.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    samples = len(data)
    if samples == 0:
        raise ValueError("data_generator needs at least one sample")

    while True:
        # New shuffle order for every pass over the data.
        s_indices = np.random.permutation(samples)
        for start in range(0, samples, batch_size):
            b_indices = s_indices[start:start + batch_size]
            yield data[b_indices], labels[b_indices]


In [None]:

batch_size = 33
num_epochs = 7
# Number of generator batches that make up one pass over the training set.
# Rounded up so the final, smaller batch is counted and a training set smaller
# than one batch still gives one step instead of zero.
steps_per_epoch = int(np.ceil(len(X_train) / batch_size))

# Training
model.fit(data_generator(X_train, y_train, batch_size),
          epochs=num_epochs,
          steps_per_epoch=steps_per_epoch,
          validation_data=(X_test, y_test))

In [None]:
# Evaluation
# The accuracy is taken from position 1 of the values returned by evaluate.
evaluation = model.evaluate(X_test, y_test)
accuracy = evaluation[1]
print(f"Accuracy: {accuracy*100}")

In [None]:
# Model outputs for the test set, then a 0/1 class obtained by thresholding at 0.5.
y_predicted_value = model.predict(X_test)
y_binary_value = np.greater(y_predicted_value, 0.5).astype(int)

In [None]:
# Binary precision / recall / F1 of the thresholded predictions on the test set.
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_binary_value, average='binary')

# Print each metric as a percentage with four decimals.
for metric_label, metric_value in (
    ("Precision score", prec),
    ("Recall score", rec),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value*100:.4f}")

In [None]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_binary_value)

# Draw the matrix as an annotated heatmap with the class names on both axes.
class_names = ['Negative', 'Positive']
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title('Matrix')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Plotting Actual vs Predicted
# x: true test labels, y: the model's raw outputs for the same samples.
scatter_fig, scatter_ax = plt.subplots(figsize=(12, 10))
scatter_ax.scatter(y_test, y_predicted_value, alpha=0.5)
scatter_ax.set_title('Actual vs Predicted')
scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')
plt.show()

In [None]:
### BERT (supervised)

# Only the first `chunk` = 100,000 reviews are loaded for this section.
chunk = 100000
file = "yelp_academic_dataset_review.json"
csv = "chunk.csv"

# Read just the first chunk, save it as CSV, then stop. The context manager
# closes the JSON file even though the reader is not exhausted.
with pd.read_json(file, lines=True, chunksize=chunk) as chunks1:
    for i, x in enumerate(chunks1):
        print(f"Processing chunk {i + 1}")
        x.to_csv(csv, index=False)
        break

# Load the saved chunk back from CSV as the working DataFrame for this section.
df1 = pd.read_csv(csv)

In [None]:
def sentiment(stars):
    """Map a star rating to a sentiment class.

    Returns 0 for ratings in [0, 3), 1 for ratings in [3, 5], and the string
    'undefined' for anything outside [0, 5] (including NaN).
    """
    # Guard clause: anything that is not inside the 0..5 range has no class.
    if not (0 <= stars <= 5):
        return 'undefined'
    return 0 if stars < 3 else 1

# Derive the sentiment class for every review, then encode it as integer ids.
df1['sentiment_label'] = df1['stars'].map(sentiment)
le = LabelEncoder().fit(df1['sentiment_label'])
df1['encoded_sentiment'] = le.transform(df1['sentiment_label'])

# Extract labels
y = df1['encoded_sentiment'].to_numpy()
# Write the labelled frame to disk.
df1.to_csv('preprocessed_data1.csv', index=False)

In [None]:
# Display df1, which now carries the sentiment_label and encoded_sentiment columns.
df1

In [None]:
### Selecting divided parts
# Draw the same number of reviews from each sentiment class, using a fixed seed.
samples_per_class = 2000
sample_seed = 42
sentiment_column = df1['sentiment_label']
positive_reviews = df1.loc[sentiment_column == 1].sample(n=samples_per_class, random_state=sample_seed)
negative_reviews = df1.loc[sentiment_column == 0].sample(n=samples_per_class, random_state=sample_seed)

In [None]:
# Combine the selected samples
# The rows are stacked positives first, then negatives (no shuffling here), and
# ignore_index=True gives the result a fresh 0..n-1 index.
new_data = pd.concat([positive_reviews, negative_reviews], ignore_index=True)

In [None]:
# Display the combined sample of positive and negative reviews.
new_data

In [None]:
#### Load model and tokenizer
# BertTokenizer and BertForSequenceClassification are already imported in the
# notebook's import cell, so they are not imported a second time here.
model_name = "bert-base-uncased"
# tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)
# Model
# NOTE(review): `sentiment` only produces the numeric classes 0 and 1, yet the
# classification head is created with three labels — confirm whether
# num_labels should be 2.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)