<a href="https://colab.research.google.com/github/Billal-MOKHTARI/ml_dl_customized_libraries/blob/main/functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

#### Import packages

In [1]:
import pandas as pd
import numpy as np
import os
from google.colab import drive, files
import zipfile
import json
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score

#### Mount to drive

In [None]:
# Authenticate the Colab user, then hand PyDrive the application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
# PyDrive needs a credentials *object*; the previous code assigned the unbound
# method `GoogleCredentials.get_access_token`, which is not usable as credentials.
gauth.credentials = GoogleCredentials.get_application_default()

In [None]:
# Mount Google Drive at /content/gdrive/; force_remount=True is passed so the
# mount is redone even when this cell is executed again.
drive.mount('/content/gdrive/', force_remount=True)

#### set kaggle environment

In [23]:
def init_kaggle():
  !pip install -q kaggle
  files.upload()
  !mkdir ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 ~/.kaggle/kaggle.json

In [34]:
def load_kaggle_dataset(dataset_name, path):
    """
    Downloads a Kaggle competition dataset and unzips it to a specified path.

    The API credentials are read from a `kaggle.json` file in the current
    working directory (create one from your Kaggle account page with
    "Create New API Token") and exported as the KAGGLE_USERNAME / KAGGLE_KEY
    environment variables. Credentials must never be written into the notebook.

    Args:
    - dataset_name (str): the name of the Kaggle competition (this is the name that appears in the URL)
    - path (str): the path to which the dataset should be downloaded and unzipped

    Returns:
    - None

    Raises:
    - FileNotFoundError: if `kaggle.json` (or the `kaggle` executable) is missing
    - KeyError: if `kaggle.json` lacks the "username" or "key" entry
    - subprocess.CalledProcessError: if the Kaggle CLI reports a failed download
    """
    # Local import: the notebook's import cell does not provide subprocess
    import subprocess

    with open("kaggle.json", "r") as f:
        config = json.load(f)
    os.environ["KAGGLE_USERNAME"] = config["username"]
    os.environ["KAGGLE_KEY"] = config["key"]

    # Make a directory for the dataset (no error if it already exists)
    os.makedirs(path, exist_ok=True)

    # Download the dataset. An argument list (no shell) keeps names/paths with
    # spaces or shell metacharacters intact, and check=True stops here with a
    # clear error instead of failing later on a missing zip file.
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", dataset_name, "-p", path],
        check=True,
    )

    # Unzip the dataset
    zip_file = os.path.join(path, f"{dataset_name}.zip")
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(path)

    # Delete the zip file
    os.remove(zip_file)


In [None]:
# Example: fetch the RSNA 2022 cervical spine fracture competition data into /content/data
load_kaggle_dataset(
    dataset_name="rsna-2022-cervical-spine-fracture-detection",
    path="/content/data",
)

## Models

### Time series

#### Prophet

Prophet is a forecasting procedure, implemented in R and Python, developed by Facebook.

Set up the ```Prophet``` environment

In [None]:
!pip3 install prophet 
from prophet import Prophet

Function used to train the model on our data

In [37]:
# Model training
def time_series_train(df, growth='linear',
                          changepoints=None,
                          n_changepoints=25,
                          changepoint_range=0.8,
                          yearly_seasonality='auto',
                          weekly_seasonality='auto',
                          daily_seasonality='auto',
                          holidays=None,
                          seasonality_mode='additive',
                          seasonality_prior_scale=10.0,
                          holidays_prior_scale=10.0,
                          changepoint_prior_scale=0.05,
                          mcmc_samples=0,
                          interval_width=0.80,
                          uncertainty_samples=1000,
                          stan_backend=None):
  """
  Build a Prophet model with the given hyper-parameters and fit it on `df`.

  Args:
  - df: training data, which should have the following format [y, ds]
      y : values
      ds : time
  - all remaining arguments are forwarded unchanged to the `Prophet`
    constructor under the same names.

  Returns:
  - the fitted Prophet instance
  """
  # Keyword arguments keep every value bound to the intended Prophet option
  # even if the constructor's positional order changes between releases.
  m = Prophet(growth=growth,
              changepoints=changepoints,
              n_changepoints=n_changepoints,
              changepoint_range=changepoint_range,
              yearly_seasonality=yearly_seasonality,
              weekly_seasonality=weekly_seasonality,
              daily_seasonality=daily_seasonality,
              holidays=holidays,
              seasonality_mode=seasonality_mode,
              seasonality_prior_scale=seasonality_prior_scale,
              holidays_prior_scale=holidays_prior_scale,
              changepoint_prior_scale=changepoint_prior_scale,
              mcmc_samples=mcmc_samples,
              interval_width=interval_width,
              uncertainty_samples=uncertainty_samples,
              stan_backend=stan_backend)

  m.fit(df)

  return m

In [39]:
# Making predictions
def time_series_predict(m, plot=True, plot_components=True, periods=100, freq='D', include_history=True):
  """
  Forecast future values with a fitted Prophet model.

  Args:
  - m -> Prophet : the fitted model
  - plot -> boolean : whether to draw the forecast graphic
  - plot_components -> boolean : whether to draw the graphic of each component
  - periods -> integer : the number of future steps to predict, expressed in units of `freq`
  - freq -> str : step unit (Days, Months, Years, Hours, ...). With periods=100 and freq='M'
    the forecast covers 100 months
  - include_history -> boolean : forwarded to `m.make_future_dataframe`

  Returns:
  - the forecast returned by `m.predict`
  """
  future = m.make_future_dataframe(periods=periods, freq=freq, include_history=include_history)
  forecast = m.predict(future)

  # The figures are drawn for their side effect only; the `plot` flag is no
  # longer overwritten with the returned figure.
  if plot:
    m.plot(forecast)

  if plot_components:
    m.plot_components(forecast)

  return forecast

### NLP

#### Question answering
[Annotation Tool](https://haystack.deepset.ai/docs/latest/annotationmd)

In [None]:
def set_up_haystack():
  """
  Prepare the runtime for Haystack question answering.

  Prints the GPU status with `nvidia-smi`, upgrades pip, then installs
  Haystack (package `farm-haystack`, with the `colab` extra) from the
  project's GitHub repository.

  Uses IPython `!` shell escapes, so it only runs inside a notebook.
  """
  # Make sure you have a GPU running
  !nvidia-smi
  # Install the latest release of Haystack in your own environment
  #! pip install farm-haystack

  # Install the latest master of Haystack
  !pip install --upgrade pip
  !pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

Import model
Have a look at the different models [here](https://huggingface.co)

In [None]:
from haystack.nodes import FARMReader
from haystack import Pipeline, Document
from haystack.utils import print_answers

In [None]:
def train_data(train_path, save_path_drive, nb_epochs=1, save_dir_model="my_model", model="distilbert-base-uncased-distilled-squad", use_gpu=True):
  """
  Fine-tune a FARMReader on an annotated file and copy the saved model elsewhere.

  Args:
  - train_path (str): path of the training file; it is split into the directory
    and file name that `FARMReader.train` expects
  - save_path_drive (str): destination to which the saved model directory is copied
  - nb_epochs (int): number of training epochs
  - save_dir_model (str): directory in which the fine-tuned model is saved
  - model (str): name or path of the base model to fine-tune
  - use_gpu (bool): whether the reader should use the GPU

  Returns:
  - the trained FARMReader
  """
  # Local import: the notebook's import cell does not provide subprocess
  import subprocess

  data_dir, train_filename = os.path.split(train_path)

  reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)
  # The previous call referenced data_dir / n_epochs / save_dir, none of which
  # existed, so training always stopped with a NameError.
  reader.train(
    data_dir=data_dir or ".",  # a bare file name means "current directory"
    train_filename=train_filename,
    use_gpu=use_gpu,
    n_epochs=nb_epochs,
    save_dir=save_dir_model,
  )

  # Same `cp -R` as before, but as an argument list so paths containing spaces
  # or shell metacharacters are passed intact. As before, a failed copy is
  # reported by cp itself and does not discard the trained reader.
  subprocess.run(["cp", "-R", save_dir_model, save_path_drive], check=False)

  return reader


In [None]:
def evaluate_results(reader, data_path, file_path, device="cuda"):
  """
  Evaluate a reader on an annotated file.

  Args:
  - reader: object exposing `eval_on_file` (e.g. a FARMReader)
  - data_path (str): first positional argument of `eval_on_file` (the data directory)
  - file_path (str): second positional argument of `eval_on_file` (the evaluation file)
  - device (str): device forwarded to `eval_on_file`; defaults to "cuda" as
    before, pass "cpu" on a runtime without a GPU

  Returns:
  - whatever `reader.eval_on_file` returns
  """
  return reader.eval_on_file(data_path, file_path, device=device)


In [None]:
def predict(reader, context, question, with_pipeline=True):
  """
  Answer `question` from `context` with the given reader.

  Args:
  - reader: the reader used to extract the answer (e.g. a FARMReader)
  - context (str): the text in which the answer is searched
  - question (str): the question to answer
  - with_pipeline (bool): when False, call `reader.predict_on_texts` directly;
    otherwise run the reader inside a Haystack `Pipeline` and print the answers

  Returns:
  - the result of `reader.predict_on_texts` or of `Pipeline.run`
  """
  if not with_pipeline:
    return reader.predict_on_texts(question, [context])

  p = Pipeline()
  # Register the reader that was passed in (the code used to reference an
  # undefined `new_reader`).
  p.add_node(component=reader, name="Reader", inputs=["Query"])
  res = p.run(query=question, documents=[Document(content=context)])
  print_answers(res, details="medium")

  return res


In [None]:
def load_reader(model_path, save_path=None):
  """
  Load a previously saved reader.

  Args:
  - model_path (str): name or path of the model to load
  - save_path: unused; kept (now optional) so existing two-argument calls still work

  Returns:
  - a FARMReader built from `model_path`
  """
  # If you want to load it at a later point, just do:
  return FARMReader(model_name_or_path=model_path)

#### Sentiment Analysis

In [None]:
!pip install simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs

This creates a ClassificationModel that can be used for training, evaluating, and predicting on Binary classification tasks. The first parameter is the model_type, the second is the model_name, and the third is the number of labels in the data.

model_type may be one of `['bert', 'xlnet', 'xlm', 'roberta', 'distilbert']`.

The format of the data should be 

```
review, sentiment
```



Create the model

In [2]:
def nlp_classifier(num_labels, num_train_epochs=2, learning_rate=1e-4, model_type="bert", model_name="bert-base-cased"):
  """
  Build a simpletransformers ClassificationModel.

  Args:
  - num_labels (int): number of labels in the data
  - num_train_epochs (int): value stored in the model arguments' `num_train_epochs`
  - learning_rate (float): value stored in the model arguments' `learning_rate`
  - model_type (str): model family, e.g. "bert"
  - model_name (str): name of the pretrained model, e.g. "bert-base-cased"

  Returns:
  - the (not yet trained) ClassificationModel
  """
  training_config = ClassificationArgs()
  overrides = (
    ("num_train_epochs", num_train_epochs),
    ("learning_rate", learning_rate),
  )
  for option, value in overrides:
    setattr(training_config, option, value)

  return ClassificationModel(model_type, model_name, num_labels=num_labels, args=training_config)

Train it

In [3]:
# Train the classifier on `train_df`, passing accuracy_score as the extra `acc` keyword.
# NOTE(review): neither `model` nor `train_df` is defined in the visible notebook;
# build `model` with nlp_classifier(...) and load `train_df` before running this cell.
model.train_model(train_df, acc=accuracy_score)

Evaluate it

In [4]:
# Evaluate the classifier on `eval_df`, passing accuracy_score as the extra `acc` keyword.
# NOTE(review): neither `model` nor `eval_df` is defined in the visible notebook;
# they must exist before this cell is run.
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=accuracy_score)

Make predictions

In [5]:
# `input` is Python's builtin function, not data: predict on an explicit list
# of raw texts instead. Replace the sample sentences with your own.
texts = ["I loved this movie.", "This was a waste of time."]
preds, model_outputs = model.predict(texts)

#### Text generation

In [None]:
!pip install simpletransformers

In [None]:
# Check which GPU (if any) is attached to this runtime before training
!nvidia-smi

In [None]:
from simpletransformers.language_modeling import LanguageModelingModel,LanguageModelingArgs

In [None]:
def nlp_generation(model_type="gpt2", model_name="gpt2", train_batch_size=8, num_train_epochs=2, vocab_size=50257, train_file=None):
  """
  Build a simpletransformers LanguageModelingModel configured for causal language modeling.

  Args:
  - model_type (str): model family, e.g. "gpt2"
  - model_name (str): name of the pretrained model, e.g. "gpt2"
  - train_batch_size (int): value stored in the model arguments' `train_batch_size`
  - num_train_epochs (int): value stored in the model arguments' `num_train_epochs`
  - vocab_size (int): value stored in the model arguments' `vocab_size`
  - train_file: forwarded as `train_files` to the model constructor. It used to be
    read from an undefined global, which raised a NameError; it is now an
    optional argument.

  Returns:
  - the configured LanguageModelingModel (previously nothing was returned, so
    the model was lost)
  """
  # Editing Configurations
  model_args = LanguageModelingArgs()
  model_args.reprocess_input_data = True
  model_args.overwrite_output_dir = True
  model_args.num_train_epochs = num_train_epochs
  model_args.best_model_dir = "outputs/best_model"
  model_args.save_best_model = True
  model_args.train_batch_size = train_batch_size
  model_args.dataset_type = "simple"
  model_args.mlm = False  # mlm must be False for CLM
  model_args.vocab_size = vocab_size

  model = LanguageModelingModel(
    model_type, model_name, args=model_args, train_files=train_file
  )

  return model

# Resources
[Hugging Face](https://huggingface.co/)

[Haystack Annotation Tool](https://annotate.deepset.ai/)

Youtube Channel: [karndeepsingh](https://www.youtube.com/@karndeepsingh)

[Simple Transformers](https://simpletransformers.ai/docs/usage/)