In [1]:
import numpy as np # linear algebra
import pandas as pd

In [2]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━

In [3]:

# Importing Libraries

# Data Handling
import pandas as pd
import numpy as np
from datasets import Dataset, load_metric
import shutil

# Data Visualization
import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio
from IPython.display import display
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# Statistics & Mathematics
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import shapiro, skew, anderson, kstest, gaussian_kde,spearmanr
import math

# Transformers
from transformers import BartTokenizer, BartForConditionalGeneration      # BERT Tokenizer and architecture
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments         # These will help us to fine-tune our model
from transformers import pipeline                                         # Pipeline
from transformers import DataCollatorForSeq2Seq                           # DataCollator to batch the data
import torch                                                              # PyTorch
import evaluate                                                           # Hugging Face's library for model evaluation


# Other NLP libraries
from textblob import TextBlob                                             # This is going to help us fix spelling mistakes in texts
from sklearn.feature_extraction.text import TfidfVectorizer               # This is going to helps identify the most common terms in the corpus
import re                                                                 # This library allows us to clean text data
import nltk                                                               # Natural Language Toolkit
nltk.download('punkt')


# Hiding warnings
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
df= pd.read_excel("/content/cleaned_data555.xlsx")
df.head()

Unnamed: 0,egyption_Text,english_Text,category,sub_category
0,الأمير الصغير,little prince,Novels,the-little-prince
1,أنطوان دي سانت إكزوبيري,Antoine De Saint-Exuper,Novels,the-little-prince
2,الفصل الأول,Chapter 1,Novels,the-little-prince
3,في مره، لما كان عندي ست سنين، شفت صوره روعه، ف...,"six years old saw magnificent picture book , c...",Novels,the-little-prince
4,وكانت عباره عن تعبان من نوع البوا بيبلع فهد.,picture boa constrictor act swallowing animal .,Novels,the-little-prince


In [5]:
def clean_tags(text):
    clean = re.compile('<.*?>') # Compiling tags
    clean = re.sub(clean, '', text) # Replacing tags text by an empty string

    # Removing empty dialogues
    clean = '\n'.join([line for line in clean.split('\n') if not re.match('.*:\s*$', line)])

    return clean
def clean_df(df, cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(clean_tags)
    return df

In [6]:
df1=df[["egyption_Text","english_Text"]]
df1.head()

Unnamed: 0,egyption_Text,english_Text
0,الأمير الصغير,little prince
1,أنطوان دي سانت إكزوبيري,Antoine De Saint-Exuper
2,الفصل الأول,Chapter 1
3,في مره، لما كان عندي ست سنين، شفت صوره روعه، ف...,"six years old saw magnificent picture book , c..."
4,وكانت عباره عن تعبان من نوع البوا بيبلع فهد.,picture boa constrictor act swallowing animal .


In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23539 entries, 0 to 23538
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   egyption_Text  23539 non-null  object
 1   english_Text   23207 non-null  object
dtypes: object(2)
memory usage: 367.9+ KB


In [8]:
df1.dropna(inplace=True)

In [9]:
train = clean_df(df1,['egyption_Text', 'english_Text'])

In [10]:
train_ds = Dataset.from_pandas(train)
train_ds

Dataset({
    features: ['egyption_Text', 'english_Text', '__index_level_0__'],
    num_rows: 23207
})

In [11]:
split_datasets = train_ds.train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['egyption_Text', 'english_Text', '__index_level_0__'],
        num_rows: 20886
    })
    test: Dataset({
        features: ['egyption_Text', 'english_Text', '__index_level_0__'],
        num_rows: 2321
    })
})

In [12]:
split_datasets["validation"] = split_datasets.pop("test")
split_datasets

DatasetDict({
    train: Dataset({
        features: ['egyption_Text', 'english_Text', '__index_level_0__'],
        num_rows: 20886
    })
    validation: Dataset({
        features: ['egyption_Text', 'english_Text', '__index_level_0__'],
        num_rows: 2321
    })
})

In [13]:
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-ar-en"
translator = pipeline("translation", model=model_checkpoint)


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

In [14]:
translator("يارب بس ما تطلعليش سمكه كبيره أوي ويطلع كلامك على فشوش .")

[{'translation_text': 'O LORD, but you shall not have a great fish, Oi, and let thy words be known to the gushing.'}]

In [15]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

In [16]:
en_sentence = split_datasets["train"][1]["english_Text"]
ar_sentence = split_datasets["train"][1]["egyption_Text"]

inputs = tokenizer(ar_sentence, text_target=en_sentence)
inputs

{'input_ids': [61, 14512, 62, 9, 2583, 218, 4012, 41, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [22512, 1214, 23, 2, 0]}

In [17]:
wrong_targets = tokenizer(en_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁be', 'au', 'ty', '▁c', 'all', 'ed', '▁.', '</s>']
['▁beauty', '▁called', '▁', '.', '</s>']


In [18]:
max_length = 128


def preprocess_function(examples):
    inputs = [ex for ex in examples["egyption_Text"]]
    targets = [ex for ex in examples["english_Text"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [19]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/20886 [00:00<?, ? examples/s]

Map:   0%|          | 0/2321 [00:00<?, ? examples/s]

In [20]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [21]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [22]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [23]:
batch["labels"]

tensor([[22512,  1214,    23,     2,     0,  -100],
        [  438,  8373,   113,    23,     2,     0]])

In [24]:
batch["decoder_input_ids"]

tensor([[62833, 22512,  1214,    23,     2,     0],
        [62833,   438,  8373,   113,    23,     2]])

In [25]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[22512, 1214, 23, 2, 0]
[438, 8373, 113, 23, 2, 0]


In [26]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20886
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2321
    })
})

In [27]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/106.6 kB[0m [31m970.4 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.1


In [28]:
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [29]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [30]:
outputdir="/kaggle"

In [31]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir=outputdir,
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    #push_to_hub=True,
    report_to="none",
)

In [32]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    #model_name="Helsinki-NLP/opus-mt-en-fr",  # Specify the correct model name
    #repo_type="model",  # Set the repo type to "model"
)

In [33]:
trainer.evaluate(max_length=max_length)

{'eval_loss': 5.201899528503418,
 'eval_bleu': 1.0996134930893562,
 'eval_runtime': 158.2623,
 'eval_samples_per_second': 14.666,
 'eval_steps_per_second': 0.234}

In [34]:
trainer.train()

Step,Training Loss
500,2.8391
1000,2.3723
1500,2.2107


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


TrainOutput(global_step=1959, training_loss=2.3917765984917367, metrics={'train_runtime': 243.4332, 'train_samples_per_second': 257.393, 'train_steps_per_second': 8.047, 'total_flos': 741677156794368.0, 'train_loss': 2.3917765984917367, 'epoch': 3.0})

In [35]:
trainer.evaluate(max_length=max_length)

{'eval_loss': 2.261841297149658,
 'eval_bleu': 10.613972799016778,
 'eval_runtime': 119.4799,
 'eval_samples_per_second': 19.426,
 'eval_steps_per_second': 0.31,
 'epoch': 3.0}

In [37]:
translator = pipeline("translation", model="/kaggle/checkpoint-1306")
translator("يارب بس ما تطلعليش سمكه كبيره أوي ويطلع كلامك على فشوش .")

[{'translation_text': "God , n't get big fish , n't tell anything ."}]

In [38]:
translator(" وحد الواحد")

[{'translation_text': 'one .'}]

In [39]:
translator(" تشكر منحرمش")

[{'translation_text': 'Thank .'}]

In [40]:
translator(" انا ابن اصول وافهمها وهي طايره")

[{'translation_text': "'m son , understand flying ."}]

In [41]:
translator("  السلام عليكم")

[{'translation_text': 'Hello .'}]

In [42]:
translator("  شوف جايه معاك ب اي واخوك سداد")

[{'translation_text': 'See , brother pay .'}]