<a href="https://colab.research.google.com/github/DJ-Manjaray/On-Going-Projects/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer, DataCollatorWithPadding, logging

In [None]:
logging.set_verbosity_error()

In [None]:
import gc
gc.collect()

60

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from IPython.display import clear_output
clear_output()

In [None]:
def uniform_soup(model_paths, test_ds, model_fun, evaluate_fun, disable_tqdm = False):
    """
    Returns Uniform Soup model and accuracy on test set.

    The uniform soup is the element-wise average of the weights stored in
    every checkpoint of `model_paths`. Checkpoints are loaded one after the
    other into a single model instance, and only a running sum of the weights
    is kept, so memory use does not grow with the number of checkpoints.

    Args:
    model_paths : List, List of saved model paths (must not be empty)
    test_ds : Test Dataset in tfds format.
    model_fun : Fun, Model Instantiating Function (called without arguments)
    evaluate_fun : Fun, Model Test Set Evaluation Function,
                   called as evaluate_fun(model, test_ds)
    disable_tqdm : Bool, Whether to disable TQDM Progress bar or not

    Returns:
    (model, accuracy) : the model carrying the averaged weights and the value
                        evaluate_fun returned for it.

    Raises:
    ValueError : if model_paths is empty.
    """
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths must contain at least one saved model path")

    # Instantiating model
    tf.keras.backend.clear_session()
    model = model_fun()

    # One accumulator per weight tensor. The tensors of a model have different
    # shapes, so they cannot be stacked into a single rectangular array; each
    # one is summed on its own instead.
    weight_sums = None
    weight_dtypes = None

    # Iterating over all models
    for path in tqdm(model_paths, disable=disable_tqdm):

        # loading model weights
        model.load_weights(path)
        current = [np.array(weights) for weights in model.weights]

        if weight_sums is None:
            # Remember the native dtypes and accumulate in float64 so the sum
            # does not lose precision.
            weight_dtypes = [tensor.dtype for tensor in current]
            weight_sums = [np.array(tensor, dtype=np.float64) for tensor in current]
        else:
            for total, tensor in zip(weight_sums, current):
                np.add(total, tensor, out=total)

    # Replacing model's weights with the Uniform Soup (mean) weights,
    # cast back to the dtype each weight had when it was loaded
    count = len(model_paths)
    for variable, total, dtype in zip(model.weights, weight_sums, weight_dtypes):
        tf.keras.backend.set_value(variable, (total / count).astype(dtype))

    # evaluating uniform soup performance
    accuracy = evaluate_fun(model, test_ds)
    return model, accuracy

In [None]:
def greedy_soup(model_paths, test_ds, model_fun, evaluate_fun):
    """
    Returns Greedy Soup model and accuracy on test set.

    Starting from model_paths[0] (treated as the best performing model), each
    remaining checkpoint is tentatively added to the soup; it is kept only if
    the uniform soup of the enlarged set scores strictly higher than the best
    score so far.

    Args:
    model_paths : List, List of saved model paths, best performing model first
    test_ds : Test Dataset in tfds format.
    model_fun : Fun, Model Instantiating Function
    evaluate_fun : Fun, Model Test Set Evaluation Function

    Returns:
    (model, score) : model holding the averaged weights of the accepted soup
                     and the score of that soup.
    """
    ## Creating initial soup with best performing model
    soups = [model_paths[0]]

    ## Instantiating model
    tf.keras.backend.clear_session()
    model = model_fun()

    ## Loading best performing model's weights
    model.load_weights(model_paths[0])

    ## Scoring best performing model on test set
    score_final = evaluate_fun(model, test_ds)

    ## True while `model` holds exactly the weights of the accepted soup
    model_matches_soup = True

    ## Iterating over the remaining models
    for path in tqdm(model_paths[1:]):

        ## Creating a temp soup
        temp_soup = soups + [path]

        ## Getting score from temp soup
        model, score = uniform_soup(temp_soup, test_ds, model_fun, evaluate_fun, disable_tqdm=True)

        ## The candidate joins the main soup only if the temp soup
        ## beats the best score so far
        if score > score_final:
            score_final = score
            soups.append(path)
            model_matches_soup = True
        else:
            ## `model` now carries the weights of a rejected soup
            model_matches_soup = False

    ## If the last candidate was rejected, rebuild the accepted soup so the
    ## returned model really corresponds to score_final
    if not model_matches_soup:
        if len(soups) == 1:
            tf.keras.backend.clear_session()
            model = model_fun()
            model.load_weights(soups[0])
        else:
            model, _ = uniform_soup(soups, test_ds, model_fun, evaluate_fun, disable_tqdm=True)

    return model, score_final


In [None]:
# !unzip /content/drive/MyDrive/Dataset/Google_PlayStore_Apps_Reviews.zip -d /content/drive/MyDrive/Dataset/

Archive:  /content/drive/MyDrive/Dataset/Google_PlayStore_Apps_Reviews.zip
replace /content/drive/MyDrive/Dataset/Google_PlayStore_Apps_Reviews/Candy Crush Saga.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
## loading data
## The combined file written later in this notebook (data.csv) lives in the
## same folder, so it is excluded here; otherwise a re-run would read it back
## in and duplicate every row. Sorting makes the load order deterministic,
## because os.listdir returns names in arbitrary order.
reviews_dir = '/content/drive/MyDrive/Dataset/Google_PlayStore_Apps_Reviews'
file_paths = sorted(
    os.path.join(reviews_dir, file)
    for file in os.listdir(reviews_dir)
    if file.endswith('.csv') and file != 'data.csv'
)

In [None]:
# One DataFrame per CSV file, in the order given by file_paths
dfs = list(map(pd.read_csv, file_paths))
print(type(dfs))

ParserError: ignored

In [None]:
combined_df = pd.concat(dfs, axis=0)  # Stack vertically

In [None]:
# DataFrame.to_csv returns None when it is given a path, so the result is not
# bound to a name; the file is read back into `data` in the next cell.
combined_df.to_csv('/content/drive/MyDrive/Dataset/Google_PlayStore_Apps_Reviews/data.csv', index=False)  # Save to a new CSV file

In [None]:
# data =  '/content/drive/MyDrive/Dataset/Google_PlayStore_Apps_Reviews/data.csv'

data = pd.read_csv("/content/drive/MyDrive/Dataset/Google_PlayStore_Apps_Reviews/data.csv")

In [None]:
data.head()

Unnamed: 0,reviewId,content,score,app
0,eb1c6f2f-ea1d-4bf0-bab0-f36f00135dbd,Excellent,5,
1,66cc7236-1e0c-46eb-a31d-c2b744f791a2,Good,5,
2,cf0203e6-3960-4a34-8d37-d5125b56bd90,Best,5,
3,8a9d676a-b303-43ea-83f6-86e068cdce82,There is nobutton to go back when i win milest...,3,
4,cf0c43de-d623-4722-b14a-ad5a6510540d,I love candy crush it definitely helps me rela...,5,


In [None]:
data.tail()

Unnamed: 0,reviewId,content,score,app
399995,44ef55cc-1005-4b5e-b2ee-30bbb353cabb,AalaaTariq,1,LINE
399996,b5db958b-eb3d-4d46-a169-457bc39644f5,Calls and notifications are randomly late. Too...,1,LINE
399997,4bc5f13c-70a4-4be5-8d9e-414592ec0df5,Awesome and secure I never worry about my data...,5,LINE
399998,b334af00-2248-4963-ac5e-fcabc61df08c,Can China people use line in their country?,2,LINE
399999,8c9b1dee-1479-4061-9dae-f690a1445277,@ulin.dc,5,LINE


In [None]:
# Drop every row that has a missing value in any column (rebinding instead of
# mutating in place, so the statement reads as a plain transformation).
data = data.dropna()

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199985 entries, 200000 to 399999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   reviewId  199985 non-null  object
 1   content   199985 non-null  object
 2   score     199985 non-null  int64 
 3   app       199985 non-null  object
dtypes: int64(1), object(3)
memory usage: 7.6+ MB


In [None]:
data.describe()

Unnamed: 0,score
count,199985.0
mean,3.689537
std,1.699232
min,1.0
25%,2.0
50%,5.0
75%,5.0
max,5.0


In [None]:
## filtering only Spotify App Reviews
## .copy() makes `reviews` an independent frame, so columns can be overwritten
## on it later without pandas treating it as a slice of `data`.
reviews = data[data["app"] == "Spotify"].copy()
reviews

Unnamed: 0,reviewId,content,score,app
360000,344e7118-b9dc-41fd-ab81-db272ebd52d1,Excellent interface,5,Spotify
360001,8766869a-c3c4-4022-bb29-32efc86c8c8b,Unsatisfied,1,Spotify
360002,33b7ebf0-e149-4c68-b604-6cbaf337b667,Without premium doesn't allow to choose songs ...,1,Spotify
360003,caf3d24d-f2a3-44ab-881a-093d2944862a,"Últimamente estoy teniendo muchos problemas, s...",2,Spotify
360004,ff1a1d43-9a46-4d07-b4d1-22be3aae7c7a,"IT WONT PLAY THE MUSIC I WANT, I TRIED PLAYING...",1,Spotify
...,...,...,...,...
369995,c27f3903-1f65-4b1a-bc99-847008f2d6a4,Worst app ever,1,Spotify
369996,03f60b6c-dcec-4bca-bb73-cb08f5d016b9,It great but I have an ongoing issue where my ...,4,Spotify
369997,10fc6354-4d12-4491-acdf-ee9eca55dd31,"Unless you buy Premium, it's pretty much just ...",1,Spotify
369998,2227ff47-4bd9-4d98-bf06-80066659d588,I can't do that the song lines repeat,1,Spotify


In [None]:
## converting review scores to binary target:
## scores up to 3 become 0, scores above 3 become 1.
## assign() builds a new frame instead of writing into a filtered slice,
## and np.where replaces the per-row Python lambda.
reviews = reviews.assign(score = np.where(reviews["score"] <= 3, 0, 1))

In [None]:
from transformers import BertTokenizer, TFBertModel

In [None]:
class text_config:
    """Constants shared by the tokenizer, the datasets and the training runs."""

    # Hugging Face checkpoint identifiers
    MODEL_NAME = "distilbert-base-uncased"
    TOKENIZER_NAME = "distilbert-base-uncased"

    # Tokenisation
    LOWER_CASE = True   # forwarded to the tokenizer as do_lower_case
    MAX_LEN = 64        # maximum sequence length / model input width

    # Data pipeline
    BATCH_SIZE = 128
    TEST_SIZE = 0.2     # share of the reviews held out for testing
    RANDOM_STATE = 12   # seed of the train/test split

    # Number of hyperparameter configurations to train
    NUM_MODELS = 10

In [None]:
## HuggingFace Dataset API to tokenize data
## `Dataset` is imported here because this cell runs before the notebook's
## `from datasets import ...` cell; the `datasets` package must already be
## installed in the kernel.
from datasets import Dataset

dataset = Dataset.from_pandas(reviews)
## the binary target column is exposed under the name "label"
dataset = dataset.rename_columns({"score": "label"})
dataset = dataset.remove_columns(["reviewId", "app"])
dataset = dataset.train_test_split(test_size = text_config.TEST_SIZE, seed = text_config.RANDOM_STATE )
## labels of the held-out split, kept for later inspection
reviews_test_labels = dataset["test"]["label"]

In [None]:
## Instantiating Tokenizer
## TOKENIZER_NAME is the setting dedicated to the tokenizer checkpoint;
## it currently has the same value as MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(text_config.TOKENIZER_NAME,
                                          do_lower_case = text_config.LOWER_CASE)

In [None]:
def tokenize(batch):
    """
    Returns the tokenizer output for the "content" entry of `batch`.

    The text is truncated to text_config.MAX_LEN tokens; padding is enabled.
    """
    encode_options = {
        "max_length": text_config.MAX_LEN,
        "padding": True,
        "truncation": True,
    }
    return tokenizer(batch["content"], **encode_options)

In [None]:
!pip install datasets



In [None]:
from datasets import Dataset, DatasetDict

In [None]:
## tokenizing data
dataset = dataset.map(tokenize)

## Collator that pads each batch to MAX_LEN and returns TensorFlow tensors
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=text_config.MAX_LEN,
    return_tensors="tf",
)

## Converting huggingface datasets to tfds.
## Both splits use the same settings; only the training split is shuffled.
tf_splits = {}
for split_name, shuffle in (("train", True), ("test", False)):
    tf_splits[split_name] = dataset[split_name].to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols=["labels"],
        batch_size=text_config.BATCH_SIZE,
        collate_fn=data_collator,
        shuffle=shuffle,
    )

train_ds = tf_splits["train"]
test_ds = tf_splits["test"]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
def create_reviews_model():
    """
    Returns Transformer Model ( Tensorflow ) for binary review classification.

    Inputs are two int64 tensors of width text_config.MAX_LEN named
    "input_ids" and "attention_mask". The pretrained encoder named by
    text_config.MODEL_NAME is followed by dropout, a 1024-unit dense layer,
    dropout and a single sigmoid unit, so the model emits one probability
    per review.
    """
    ## shape must be a tuple: (MAX_LEN,) rather than the bare int (MAX_LEN)
    input_ids = tf.keras.layers.Input(
            shape=(text_config.MAX_LEN,), name="input_ids", dtype=tf.int64
        )
    attention_masks = tf.keras.layers.Input(
                shape=(text_config.MAX_LEN,), name="attention_mask", dtype=tf.int64
            )

    ## downloading pretrained weights from huggingface
    bert = TFAutoModel.from_pretrained(text_config.MODEL_NAME)

    ## First output of the encoder: one hidden vector per token
    hidden_states = bert(input_ids, attention_mask=attention_masks)[0]

    ## Keep only the vector of the first token as the sequence summary;
    ## feeding every token position to the head would yield one prediction
    ## per token, which does not match the single label of each review.
    out = hidden_states[:, 0, :]
    out = tf.keras.layers.Dropout(0.3)(out)
    out = tf.keras.layers.Dense(1024)(out)
    out = tf.keras.layers.Dropout(0.3)(out)
    out = tf.keras.layers.Dense(1, activation="sigmoid")(out)

    model = tf.keras.models.Model(
                    inputs=[input_ids, attention_masks], outputs=out
                )
    return model


In [None]:
from keras import optimizers

In [None]:
def reviews_training(train_ds,
                   test_ds,
                   epochs,
                   learning_rate,
                   weight_decay,
                   save_dir = "reviews/"):
    """
    Trains one model, saves its weights and returns the saved path together
    with the test evaluation score.

    The weights are written to
    <save_dir>/reviews-<epochs>_<learning_rate>_<weight_decay>.h5. If a file
    with that name already exists, a numeric suffix ("_1", "_2", ...) is
    appended so no earlier checkpoint is overwritten.

    Args:
    train_ds : Train Dataset in tfds format.
    test_ds : Test Dataset in tfds format.
    epochs : Int, Training Epochs count.
    learning_rate : Float, Training Learning Rate
    weight_decay : Float, AdamW optimizer Weight Decay
    save_dir : Str, Model Save Directory

    Returns:
    (model_save_path, test_score) : path of the saved weights file and the
                                    accuracy reported by model.evaluate.
    """
    ## Creating directory for saving models
    os.makedirs(save_dir, exist_ok=True)

    ## AdamW Optimizer Setup
    adamw_optimizer = tf.keras.optimizers.AdamW(weight_decay= weight_decay,
                                     learning_rate= learning_rate)

    ## Instantiating model
    model = create_reviews_model()

    ## Compiling Model
    model.compile(
            optimizer = adamw_optimizer,
            loss = "binary_crossentropy",
            metrics = ["accuracy"]
        )

    ## Training Model
    model.fit(
        train_ds,
        epochs = epochs,
        verbose = 0
    )

    ## Evaluating Model (the loss value is not needed)
    _, test_score = model.evaluate(test_ds, verbose = 1 )

    ## Saving Trained Model
    base_name = "reviews-" + str(epochs) + "_" + str(learning_rate) + "_" + str(weight_decay)
    model_save_path = os.path.join(save_dir, base_name + ".h5")

    ## if a model with the same parameters already exists, pick the first
    ## free numbered name instead of overwriting it
    duplicate_index = 0
    while os.path.exists(model_save_path):
        duplicate_index += 1
        model_save_path = os.path.join(save_dir, base_name + "_" + str(duplicate_index) + ".h5")

    model.save_weights(model_save_path)

    ## Clearing GPU memory
    del model
    gc.collect()
    return model_save_path, test_score


In [None]:
def reviews_eval(model,test_ds):
    """
    Returns Accuracy of model on test set.

    The model is (re)compiled with binary cross-entropy loss, the "adam"
    optimizer and the accuracy metric before it is evaluated silently.

    Args:
    model : Trained tensorflow model
    test_ds : Test dataset for evaluation
    """
    compile_options = {
        "loss": "binary_crossentropy",
        "optimizer": "adam",
        "metrics": ["accuracy"],
    }
    model.compile(**compile_options)

    # evaluate yields (loss, accuracy); only the accuracy is reported
    _, accuracy = model.evaluate(test_ds, verbose = 0)
    return accuracy

In [None]:
# Candidate values for each hyperparameter
epochs = list(range(3, 6))
learning_rate = [1e-5, 2e-5, 5e-5]
weight_decay = [1e-4, 1e-5, 2e-5]

In [None]:
## Creating parameters dictionary for
## hyperparameter tuning.
## A dedicated generator seeded with RANDOM_STATE makes the sampled
## configurations identical on every run of the notebook.
param_rng = random.Random(text_config.RANDOM_STATE)
parameters = [ {
    "epochs": param_rng.choice(epochs),
    "learning_rate": param_rng.choice(learning_rate),
    "weight_decay" : param_rng.choice(weight_decay) ,
} for _ in range(text_config.NUM_MODELS)]


In [None]:
## creating a dataframe for parameters
reviews_params = pd.DataFrame(parameters)

model_paths = []
test_scores = []

In [None]:
## Training models with different parameters
## and collecting each run's checkpoint path and test score
for run_params in tqdm(parameters):
    saved_path, run_score = reviews_training(
        train_ds,
        test_ds,
        epochs = run_params["epochs"],
        learning_rate = run_params["learning_rate"],
        weight_decay = run_params["weight_decay"],
        save_dir = "reviews/",
    )
    model_paths.append(saved_path)
    test_scores.append(run_score)

  0%|          | 0/10 [00:00<?, ?it/s]

ValueError: ignored

In [None]:
## Uniform soup over every trained checkpoint.
## `model_paths` (filled by the training loop) is used directly: the "paths"
## column of reviews_params is only added in a later cell, so reading it here
## fails when the notebook is run top to bottom.
unifrom_soup_model, uniform_soup_acc = uniform_soup(
    model_paths,
    test_ds,
    create_reviews_model,
    reviews_eval
)
print("Accuracy of Uniform Soup:", uniform_soup_acc )

In [None]:

## greedy_soup starts from the first path, so the checkpoints are ranked from
## the best to the worst individual test score. The ranking is built from
## `model_paths` / `test_scores` because the sorted reviews_params frame is
## only produced in a later cell.
ranked_paths = [
    path
    for _, path in sorted(zip(test_scores, model_paths), key=lambda pair: pair[0], reverse=True)
]
greedy_soup_model, greedy_soup_acc = greedy_soup(
    ranked_paths,
    test_ds,
    create_reviews_model,
    reviews_eval
)
print("Accuracy of Greedy Soup:", greedy_soup_acc)

In [None]:
## Individual scores from worst to best (left to right)
ascending_scores = np.sort(np.asarray(test_scores, dtype=float))

## x-position of the uniform soup: half a step before the first individual
## model that beats it. searchsorted returns len(ascending_scores) when no
## model beats the soup, so the marker then sits to the right of all models
## instead of raising an IndexError.
uniform_soup_pos = np.searchsorted(ascending_scores, uniform_soup_acc, side="right") - 0.5

fig, ax = plt.subplots(figsize = (18,5))
ax.plot(ascending_scores, "bo", label = "Individual Models")
ax.plot(uniform_soup_pos, uniform_soup_acc, marker= "D", color = "green", markersize = 12, label = "Uniform Soup")
## the greedy soup is drawn one step to the right of the best individual model
ax.plot(len(ascending_scores), greedy_soup_acc, marker= "^", color = "red", markersize = 12, label = "Greedy Soup")
ax.get_xaxis().set_visible(False)
ax.set_ylabel("Accuracy")
ax.set_title("Model Soups on Play Store App Reviws Data")
ax.legend();

In [None]:
## saving scores and model paths to dataframe
## (one entry per trained configuration, in training order)
reviews_params = reviews_params.assign(paths = model_paths, scores = test_scores)




In [None]:
## sorting scores in descending order and renumbering the rows from 0
reviews_params = (
    reviews_params
    .sort_values(by = "scores", ascending = False)
    .reset_index(drop = True)
)



In [None]:
## saving params with respective scores and model paths
reviews_params.to_csv("review_params.csv", index = False)