In [None]:
import ray

In [None]:
# Start from a clean Ray runtime: shut down any session that is already
# initialized in this process, then initialize a new one. This keeps the cell
# safe to re-run without restarting the kernel.
if ray.is_initialized():
    ray.shutdown()
ray.init()

In [None]:
# Display the resources Ray reports for the cluster that was just initialized.
ray.cluster_resources()

In [None]:
import pandas as pd

In [None]:
# Data Ingestion
# Load the CSV at DATASET_LOC into a pandas DataFrame and preview its first rows.
DATASET_LOC = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/dataset.csv"
df=pd.read_csv(DATASET_LOC)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# df.tag.value_counts()

In [None]:
# split dataset
# test_size=0.2
# train_df,val_df=train_test_split(df,test_size=test_size,random_state=1234)

In [None]:
# train_df.tag.value_counts()

In [None]:
# val_df.tag.value_counts() * int((1-test_size)/test_size)

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns;sns.set_theme()
import warnings; warnings.filterwarnings("ignore")
from wordcloud import WordCloud, STOPWORDS

In [None]:
# all_tags=Counter(df.tag)
# all_tags.most_common()

In [None]:
# tags, tag_counts = zip(*all_tags.most_common())
# plt.figure(figsize=(10, 3))
# ax = sns.barplot(x=list(tags), y=list(tag_counts))
# ax.set_xticklabels(tags, rotation=0, fontsize=8)
# plt.title("Tag distribution", fontsize=14)
# plt.ylabel("# of projects", fontsize=12)
# plt.show()

In [None]:
# Most frequent tokens for each tag
# tag = "natural-language-processing"
# plt.figure(figsize=(10, 3))
# subset = df[df.tag == tag]
# text = subset.title.values
# cloud = WordCloud(
#     stopwords=STOPWORDS,
#     background_color="black",
#     collocations=False,
#     width=500,
#     height=300,
# ).generate(" ".join(text))
# plt.axis("off")
# plt.imshow(cloud)

In [None]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [None]:
# Download the NLTK "stopwords" corpus and keep the English list.
# NOTE: this rebinds STOPWORDS, which an earlier cell also imports from
# `wordcloud`; from this cell onward the name refers to the NLTK English list,
# and `clean_text` below captures it as its default argument.
nltk.download("stopwords")
STOPWORDS=stopwords.words("english")

In [None]:
def clean_text(text, stopwords=STOPWORDS):
    """Clean a raw text string.

    The steps run in this order: lowercase, remove links, remove stopwords,
    pad punctuation with spaces, replace non-alphanumeric runs with a space,
    collapse repeated spaces, and strip the ends.

    Args:
        text (str): raw text to clean.
        stopwords (Iterable[str], optional): words removed as whole words
            (together with the whitespace that follows them). Defaults to the
            module-level STOPWORDS. An empty collection disables stopword
            removal.

    Returns:
        str: the cleaned, lowercased text containing only ``[a-z0-9]`` tokens
        separated by single spaces.
    """
    # Lower
    text = text.lower()

    # Remove links before any punctuation handling: once "://", "." and "/"
    # have been replaced by spaces a URL is split into separate tokens and
    # `http\S+` would only delete the scheme, leaving the rest behind.
    text = re.sub(r"http\S+", "", text)

    # Remove stopwords
    stopwords = list(stopwords)
    if stopwords:
        # Escape each word so it is matched literally. The guard matters:
        # with no words the pattern would degenerate to `\b()\b\s*`, which
        # matches at every word boundary and eats the spaces between words.
        pattern = re.compile(
            r"\b(" + r"|".join(re.escape(word) for word in stopwords) + r")\b\s*"
        )
        text = pattern.sub("", text)

    # Spacing and filters
    text = re.sub(
        r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text
    )  # add spacing
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends

    return text

In [None]:
# def decode(indices,index_to_class):
#     return [index_to_class[i] for i in indices]

In [None]:
# index_to_class={v:k for k,v in class_to_index.items()}
# decode(df.head()["tag"].values,index_to_class)

In [None]:
import numpy as np
from transformers import BertTokenizer

In [None]:
def tokenize(batch):
    """Tokenize a batch of texts with the SciBERT tokenizer.

    Args:
        batch: mapping-like batch exposing a "text" column (must support
            ``.tolist()``) and a "tag" column; presumably a pandas DataFrame
            as produced by `preprocess` -- confirm against callers.

    Returns:
        dict: ``ids`` (token ids) and ``mask`` (attention mask) as returned by
        the tokenizer with ``return_tensors="np"`` and padding to the longest
        sequence in the batch, plus ``targets`` as a NumPy array built from
        the "tag" column.
    """
    # The tokenizer is loaded on every call, exactly as before.
    scibert_tokenizer = BertTokenizer.from_pretrained(
        "allenai/scibert_scivocab_uncased", return_dict=False
    )
    texts = batch["text"].tolist()
    encoded = scibert_tokenizer(texts, return_tensors="np", padding="longest")
    return {
        "ids": encoded["input_ids"],
        "mask": encoded["attention_mask"],
        "targets": np.array(batch["tag"]),
    }

In [None]:
def preprocess(df,class_to_index):
    """Preprocess a batch of raw rows into tokenized model inputs.

    The input frame is left untouched: the text feature and the encoded label
    are assembled in a new two-column frame instead of being written back into
    `df` (or into a column slice of it).

    Args:
        df (pd.DataFrame): batch with "title", "description" and "tag"
            columns. Any other columns are ignored.
        class_to_index (dict): mapping from tag value to integer class index.

    Returns:
        dict: the output of `tokenize` for the cleaned batch (keys ``ids``,
        ``mask`` and ``targets``).
    """
    # Feature engineering: title and description joined by a space, then cleaned.
    text = (df["title"] + " " + df["description"]).apply(clean_text)

    # Label encoding. Series.map yields NaN for any tag missing from
    # class_to_index, so the mapping is expected to cover every tag in the batch.
    tag = df["tag"].map(class_to_index)

    # Only these two columns are consumed by `tokenize`.
    features = pd.DataFrame({"text": text, "tag": tag})
    return tokenize(features)

In [None]:
# preprocess(train_df,class_to_index)

In [None]:
# Read the same CSV as a Ray Dataset, shuffle it with a fixed seed, and peek
# at a single row.
ds = ray.data.read_csv(DATASET_LOC)
ds=ds.random_shuffle(seed=1234)
ds.take(1)

In [None]:
from ray.data import Dataset
from typing import Dict, List, Tuple

In [None]:
def stratify_split(
    ds: Dataset,
    stratify: str,
    test_size: float,
    shuffle: bool = True,
    seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
    """Split a dataset into train and test sets, group by group.

    The dataset is grouped on the `stratify` column and every group is split
    on its own with `train_test_split`, so each value of that column is
    divided with the same `test_size` proportion.

    Args:
        ds (Dataset): Input dataset to split.
        stratify (str): Name of the column to group (stratify) on.
        test_size (float): Proportion of each group assigned to the test set.
        shuffle (bool, optional): forwarded to `train_test_split` for the
            per-group split. Defaults to True.
        seed (int, optional): seed used for the per-group split and for the
            final shuffle of each split. Defaults to 1234.

    Returns:
        Tuple[Dataset, Dataset]: the stratified train and test datasets.
    """

    def _add_split(
        df: pd.DataFrame,
    ) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Split one group's rows and tag each row with its split name
        in a `_split` column."""
        train_part, test_part = train_test_split(
            df, test_size=test_size, shuffle=shuffle, random_state=seed
        )
        return pd.concat(
            [train_part.assign(_split="train"), test_part.assign(_split="test")]
        )

    def _filter_split(
        df: pd.DataFrame, split: str
    ) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Keep only the rows tagged with `split` and drop the helper
        `_split` column."""
        belongs = df["_split"] == split
        return df.loc[belongs].drop(columns="_split")

    # Tag every row with its split, one group (unique stratify value) at a time
    tagged = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")

    # Pull each split back out of the tagged dataset
    train_ds = tagged.map_batches(
        _filter_split, fn_kwargs={"split": "train"}, batch_format="pandas"
    )
    test_ds = tagged.map_batches(
        _filter_split, fn_kwargs={"split": "test"}, batch_format="pandas"
    )

    # Shuffle each split (required): both were built from per-group output
    train_ds = train_ds.random_shuffle(seed=seed)
    test_ds = test_ds.random_shuffle(seed=seed)

    return train_ds, test_ds

In [None]:
# Hold out 20% of the data as the validation split, stratified on the "tag"
# column.
test_size=0.2
train_ds,val_ds = stratify_split(ds,stratify="tag",test_size=test_size)

In [None]:
# Build the tag -> integer index mapping from the unique tags found in the
# training split; indices follow the order in which `unique` returns the tags.
tags=train_ds.unique(column="tag")
# print(tags)
class_to_index={tag:i for i,tag in enumerate(tags)}

In [None]:
# Run `preprocess` over the training split in pandas batches, passing the
# label mapping through fn_kwargs, then show one resulting row.
simple_ds=train_ds.map_batches(
    preprocess,
    fn_kwargs={"class_to_index":class_to_index},
    batch_format="pandas"
)
simple_ds.show(1)