In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Softmax

from typing import List, Optional, Tuple, Union, Dict, Any

from datasets import load_dataset, Dataset, DatasetDict, load_metric, load_from_disk, concatenate_datasets
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers import PreTrainedModel, TrainingArguments, Trainer
#from transformers.adapters import AdapterTrainer

import pandas as pd
import numpy as np
import evaluate

import random
import math
import time
from tqdm import tqdm
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reproducibility setup: one shared seed is applied to every RNG used in this notebook.
seed = 42

# Python-level state.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# should only influence processes launched after this point — confirm if hash
# determinism of the running kernel is actually required.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)

# NumPy: seed the legacy global state and also create a dedicated Generator.
np.random.seed(seed)
_numpy_rng = np.random.default_rng(seed)

# PyTorch: seed the default generator; strict deterministic algorithms stay disabled.
torch.manual_seed(seed)
torch.use_deterministic_algorithms(False)

# GPU-specific seeding and cuDNN flags are applied only when CUDA is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [3]:
# Load the whole Hausa monolingual corpus into memory as a single string.
# The encoding is given explicitly: the text contains non-ASCII characters
# (e.g. 'ɗ', '’'), and open() would otherwise fall back to the platform's
# locale encoding, which is not guaranteed to be UTF-8.
with open('./monolingual-data/hau', 'r', encoding='utf-8') as file:
    data = file.read()

In [6]:
# Peek at the first five lines of the corpus without loading the whole file.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default (the text contains non-ASCII characters).
with open('./monolingual-data/hau', 'r', encoding='utf-8') as file:
    for _ in range(5):
        # readline() keeps the trailing '\n', so print() shows a blank line after each sentence.
        data1 = file.readline()
        print(data1)

Lokacin da za ku buga bugun bugawa, za a tambayi ku ko kuna son wani yanki mai zaman kansa kyauta ko wani yanki mai suna.

Sannan Ahmad ibn Hanbal ya ce: da za’a karanta wannan isandi kan mahaukaci da take ya warke daga cutar hauka.

Labarin "mummunan labari" na tsawon lokaci game da yara kimanin 40 a cikin ciki ya dade daɗewa.

Muna ɗaure shi da jaka kuma rataya shi a kalla a rana a kan akwati mai dacewa.

Yana da shawara cewa an yi amfani da takin gargajiya ko ma'adinai don amfani da shafin yayin digging.



In [7]:
# Split the raw corpus text into one entry per line.
# NOTE(review): a trailing newline or blank lines in the file would produce empty-string entries here.
sentences = data.split('\n')

In [8]:
# Preview the first ten entries of the split corpus.
sentences[:10]

['Lokacin da za ku buga bugun bugawa, za a tambayi ku ko kuna son wani yanki mai zaman kansa kyauta ko wani yanki mai suna.',
 'Sannan Ahmad ibn Hanbal ya ce: da za’a karanta wannan isandi kan mahaukaci da take ya warke daga cutar hauka.',
 'Labarin "mummunan labari" na tsawon lokaci game da yara kimanin 40 a cikin ciki ya dade daɗewa.',
 'Muna ɗaure shi da jaka kuma rataya shi a kalla a rana a kan akwati mai dacewa.',
 "Yana da shawara cewa an yi amfani da takin gargajiya ko ma'adinai don amfani da shafin yayin digging.",
 'Yana da mahimmanci kada kuyi canjin saurin sauƙi, amma tsabtace hanyoyi, gano bakan gizo.',
 'Jim kadan da bayyana wannan sako ne sai shugaba Trump ya bayyana shi a shafinsa na twiter.',
 "Laser yankan tube kayan ne wani sabon tsari wanda ya zama mafi rare a 'yan shekarun nan.",
 'Ya ce Sheriff na tafe ne da tawagar kasaitattun motoci guda tara a jere.',
 'Kadan daga cikin abin da yasa hakan shi ne: hikimar sarrafa hannu da aiki da shi tana komawa ga kafafuwan nasu

In [6]:
# Number of entries produced by the split (before any cleaning).
len(sentences)

3520671

In [7]:
# Put the raw sentences into a single-column DataFrame; the column is named 'ha'.
df = pd.DataFrame({'ha': sentences})

In [8]:
# Persist the raw sentences as CSV; index=False keeps the row index out of the file.
df.to_csv('./monolingual-data/ha_monolingual.csv', index=False)

In [9]:
# Read the CSV back to check the round trip.
# Caution: this rebinds `data`, which earlier held the raw corpus string, to a DataFrame;
# re-running the earlier `data.split('\n')` cell after this one will no longer work.
data = pd.read_csv('./monolingual-data/ha_monolingual.csv')

In [10]:
# Show the first rows of the reloaded frame.
data.head()

Unnamed: 0,ha
0,"Lokacin da za ku buga bugun bugawa, za a tamba..."
1,Sannan Ahmad ibn Hanbal ya ce: da za’a karanta...
2,"Labarin ""mummunan labari"" na tsawon lokaci gam..."
3,Muna ɗaure shi da jaka kuma rataya shi a kalla...
4,Yana da shawara cewa an yi amfani da takin gar...


In [11]:
# Row count of the reloaded frame.
len(data)

3520671

In [12]:
def clean_csv(csv_file):
    """Load a CSV file and return a de-duplicated, filtered DataFrame.

    Rows containing missing values are removed first, followed by exact
    duplicate rows. Any remaining row whose first column is exactly
    'English', '.', ',' or '?' is then discarded, and the result is given
    a fresh 0..n-1 index.

    Parameters
    ----------
    csv_file
        Path (or other input accepted by ``pd.read_csv``) of the CSV to clean.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Exact first-column values that mark a row as noise.
    unwanted_values = ['English', '.', ',', '?']

    frame = pd.read_csv(csv_file).dropna().drop_duplicates()
    keep_mask = ~frame.iloc[:, 0].isin(unwanted_values)
    return frame[keep_mask].reset_index(drop=True)

In [13]:
# Run the cleaning step (clean_csv) on the raw CSV written earlier.
data_cleaned = clean_csv('./monolingual-data/ha_monolingual.csv')

In [14]:
# Wrap the cleaned 'ha' column in a Dataset, then draw a seeded random subset from it.
all_dataset = Dataset.from_dict({'ha': data_cleaned['ha']})
# Cap the subset at the number of available rows: selecting indices beyond the end
# of the dataset raises, which would happen if cleaning left fewer than 100000 rows.
small_size = min(100000, len(all_dataset))
small_dataset = all_dataset.shuffle(seed=seed).select(range(small_size))

In [18]:
# Bundle the full dataset and the sampled subset under the split names 'complete' and 'small'.
dataset = DatasetDict({'complete': all_dataset, 'small': small_dataset})

In [19]:
# Display the splits with their features and row counts.
dataset

DatasetDict({
    complete: Dataset({
        features: ['ha'],
        num_rows: 3372487
    })
    small: Dataset({
        features: ['ha'],
        num_rows: 100000
    })
})

In [20]:
# Upload both splits to the Hub under the repo id "monolingual-ha".
# NOTE(review): presumably requires an authenticated Hub session, and re-running
# presumably re-uploads — confirm before executing this cell again.
dataset.push_to_hub("monolingual-ha")

Creating parquet from Arrow format: 100%|██████████| 3373/3373 [00:03<00:00, 956.77ba/s] 
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.60s/it]
Creating parquet from Arrow format: 100%|██████████| 100/100 [00:01<00:00, 62.21ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.84s/it]
README.md: 100%|██████████| 398/398 [00:00<00:00, 55.3kB/s]


In [None]:
# Number of rows that survived cleaning.
len(data_cleaned)

In [None]:
# Save the cleaned sentences next to the raw CSV; index=False keeps the row index out of the file.
data_cleaned.to_csv('./monolingual-data/ha_cleaned_monolingual.csv', index=False)

In [None]:
# Cleaned sentences as a plain Python list.
sent = data_cleaned['ha'].to_list()

In [None]:
# Word-count statistics over the cleaned sentences.
# The lengths are computed from `sent` (the cleaned list), the same list that is
# indexed below. Measuring the raw `sentences` list instead would yield positions
# that do not line up with `sent`, because cleaning removes rows; the lookup would
# then return the wrong sentence or fail with an IndexError.
sentence_lengths = [len(sentence.split()) for sentence in sent]

# list.index() returns the first position holding the extreme value, so ties
# resolve to the earliest sentence.
longest_sentence = sent[sentence_lengths.index(max(sentence_lengths))]
shortest_sentence = sent[sentence_lengths.index(min(sentence_lengths))]

# Mean number of whitespace-separated tokens per cleaned sentence.
average_sentence_length = sum(sentence_lengths) / len(sentence_lengths)

print(f"Longest Sentence: {longest_sentence}", len(longest_sentence.split()))
print(f"Shortest Sentence: {shortest_sentence}", len(shortest_sentence.split()))
print(f"Average Sentence Length: {average_sentence_length}")