Run all cells

In [60]:

import pandas as pd
import numpy as np
import random
import os
import torch
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# natural language
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [61]:
# seeding

seed = 42
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False



**LOAD DATA**

In [62]:
df = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")


In [63]:
sub = pd.DataFrame({
    "ID" : test["ID"]
})

**PRE-PROCESSING FUNCTIONS**

In [64]:
# chat words conversion




In [65]:

# remove all the punctuations
import string
PUN = string.punctuation
def remove_punctuation(text):
  return text.translate(str.maketrans('','', PUN))



In [66]:
from collections import Counter
cnt = Counter()
for text in df["Tweets"].values:
  for word in text.split():
    cnt[word] += 1

In [67]:
cnt.most_common(10)

[('ya', 1472),
 ('na', 1314),
 ('wa', 967),
 ('kwa', 877),
 ('ni', 405),
 ('za', 284),
 ('kwenye', 221),
 ('sana', 212),
 ('la', 212),
 ('katika', 202)]

In [68]:
# removing freqnuent and rare words



cnt.most_common(10)
FREQ = set([ w for (w, wc) in cnt.most_common(10)])
def remove_freq(text):
  return " ".join([word for word in str(text).split() if word not in FREQ])


In [69]:
n_rare_words = 10
RARE = set([w for (w, wc) in cnt.most_common()[: -n_rare_words -1: -1]])
def remove_rare(text):
  return " ".join([word for word in str(text).split() if word not in RARE])

In [70]:
# removal of numbers

def remove_numbers(text):
  number_pattern = r'\d+'
  without_number = re.sub(pattern = number_pattern, repl = " ", string = text)
  return without_number



In [71]:
# removal of extra spaces

def remove_extra_space(text):
  single_char_pattern = r'\s+[a-zA-Z]\s+'
  without_sc = re.sub(pattern = single_char_pattern, repl = " ", string = text)
  return without_sc


In [72]:
# spelling correction


from textblob import TextBlob



In [73]:
# removal of english stopwords

def remove_stopwords(text):
  removed = []
  stop_words = list(stopwords.words("english"))
  tokens = word_tokenize(text)
  for i in range(len(tokens)):
    if tokens[i] not in stop_words:
      removed.append(tokens[i])
  return " ".join(removed)


In [74]:
# removal of HTML tags

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [75]:
# english lemmatisation





In [76]:
# word-character trimming

#def character_trimming(text):



In [77]:
# removing accented character
import unicodedata
def remove_accented_char(text):
  return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

**APPLY ALL THE PRE-PROCESSING FUNCTIONS**

***on train***

In [78]:
# changing the string to lower
df["Tweets"] = df["Tweets"].str.lower()

# removal of hyperlinks
df["Tweets"] = df["Tweets"].str.replace(r"http\s+", "", regex = True)

# remove all punctuations
df["Tweets"] = df["Tweets"].apply(lambda text: remove_punctuation(text))

# removing the frequent words
df["Tweets"] = df["Tweets"].apply(lambda text: remove_freq(text))

# removal of rare words
df["Tweets"] = df["Tweets"].apply(lambda text: remove_rare(text) )

# removal of numbers
df["Tweets"] = df["Tweets"].apply(lambda text: remove_numbers(text))

# removal of extra spaces
df["Tweets"] = df["Tweets"].apply(lambda text: remove_extra_space(text))

# removal of stopwords
#df["Tweets"] = df["Tweets"].apply(lambda text: remove_stopwords(text))

# removal of html tags
df["Tweets"] = df["Tweets"].apply(lambda text: remove_html_tags(text))

# spelling correction
#df["Tweets"] = df["Tweets"].apply(lambda x: str(TextBlob(x).correct()))

# remove accented letters
#df["Tweets"] = df["Tweets"].apply(lambda text: remove_accented_char(text))


In [79]:
df.head()

Unnamed: 0,ID,Tweets,Labels
0,2,so chuga si tunakutana kesho nyamachoma festiv...,0
1,3,asante watu sirari jimbo tarime vijijini huu u...,1
2,5,leo nimepata kitambulisho changu cha taifa asante,1
3,6,mgema akisifiwa tembo hulitia maji,0
4,8,ee mwenyezi mungu msamehe umrehemu umuafu msam...,1


***on test***

In [80]:
# changing the string to lower
test["Tweets"] = test["Tweets"].str.lower()

# removal of hyperlinks
test["Tweets"] = test["Tweets"].str.replace(r"http\s+", "", regex = True)

# remove all punctuations
test["Tweets"] = test["Tweets"].apply(lambda text: remove_punctuation(text))

# removing the frequent words
test["Tweets"] = test["Tweets"].apply(lambda text: remove_freq(text))

# removal of rare words
test["Tweets"] = test["Tweets"].apply(lambda text: remove_rare(text) )

# removal of numbers
test["Tweets"] = test["Tweets"].apply(lambda text: remove_numbers(text))

# removal of extra spaces
test["Tweets"] = test["Tweets"].apply(lambda text: remove_extra_space(text))

# removal of stopwords
#test["Tweets"] = test["Tweets"].apply(lambda text: remove_stopwords(text))

# removal of html tags
test["Tweets"] = test["Tweets"].apply(lambda text: remove_html_tags(text))

# spelling correction
#test["Tweets"] = test["Tweets"].apply(lambda x: str(TextBlob(x).correct()))

# remove accented letters
#test["Tweets"] = test["Tweets"].apply(lambda text: remove_accented_char(text))

In [81]:
test.head()

Unnamed: 0,ID,Tweets
0,4,maandalizi wadau kutoka kamati maandalizi waja...
1,7,tunashukuru mrejesho wako kuhusiana huduma zet...
2,9,asante kutembelea kurasa yetu cm
3,10,habari asante kutuandikia kupitia kurasa zetu ...
4,15,kunawa sio wakati kula tutujiwekee taratibu ku...


**OverSampling**

In [82]:
! pip install imbalanced-learn



In [83]:
# handling the imbalance
'''
import imblearn
from imblearn.over_sampling import SMOTE


X = df["Tweets"]
y = df["Labels"]
over = SMOTE()
X_t, y_t = over.fit_resample(X, y)
df['Labels'] = y
df['Tweets'] = X
'''

'\nimport imblearn\nfrom imblearn.over_sampling import SMOTE\n\n\nX = df["Tweets"]\ny = df["Labels"]\nover = SMOTE()\nX_t, y_t = over.fit_resample(X, y)\ndf[\'Labels\'] = y\ndf[\'Tweets\'] = X\n'

*Transform and Save Dataset*

In [84]:
# Transform the values
target_map = {1:1, 0:0, -1:2}
df['Labels'] = df['Labels'].map(target_map)
df.to_csv("new_data.csv", index = False)

In [85]:
df['Labels'].value_counts()

0    1340
1     684
2     239
Name: Labels, dtype: int64

In [86]:
!pip install datasets



In [87]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files="new_data.csv")
dataset.num_rows

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

{'train': 2263}

In [88]:
# remove the ID colum
#dataset = dataset.remove_columns("ID")


In [89]:
# remane the target column
dataset = dataset.rename_column("Labels", "label")

In [90]:
split = dataset['train'].train_test_split(test_size=0.2, seed=42, shuffle = True)
split

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweets', 'label'],
        num_rows: 1810
    })
    test: Dataset({
        features: ['ID', 'Tweets', 'label'],
        num_rows: 453
    })
})

**LANGUAGE MODEL**

In [91]:
!pip install transformers



In [92]:
from transformers import set_seed
set_seed(42)

In [93]:
import torch
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
#from wechsel import WECHSEL, load_embeddings
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

In [94]:
import transformers

In [95]:
from transformers import pipeline
# checkpoint = 'Davlan/bert-base-multilingual-cased-finetuned-swahili'

from transformers import AutoTokenizer, AutoModelForMaskedLM


tokernizer = AutoTokenizer.from_pretrained("benjamin/roberta-base-wechsel-swahili")
model = AutoModelForSequenceClassification.from_pretrained("benjamin/roberta-base-wechsel-swahili", num_labels = 3)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-swahili and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**TRAINING, EVALUATION**

In [96]:
from transformers import(
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [97]:
def tokenize_fn(batch):
  return tokernizer(batch['Tweets'], truncation = True)

# Encode the input data
tokenized_dataset = split.map(tokenize_fn, batched = True)



Map:   0%|          | 0/1810 [00:00<?, ? examples/s]

Map:   0%|          | 0/453 [00:00<?, ? examples/s]

In [98]:
!pip install accelerator



In [99]:
!pip install transformers[torch]



In [100]:
!pip install accelerate -U



In [101]:
!pip install git+https://github.com/huggingface/accelerate

Collecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-4zhvutet
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-4zhvutet
  Resolved https://github.com/huggingface/accelerate to commit a87c95da9e3b416fb10a0e7dac7d397c015c3ed5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


Please restart and run all

In [102]:
# as we updated our accelerate above, we need to restart the runtime and run all the cells
# RESTART(RUNTIME) AND RUN ALL

training_args = TrainingArguments(
    output_dir='training_dir',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate = 1e-4,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = './logs',
    seed = 42
)
#training_args.set_seed(42)

In [103]:
def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  acc = np.mean(predictions == labels)
  f1 = f1_score(labels, predictions, average = 'micro')
  return {'accuracy': acc, 'f1_score': f1}

In [104]:
trainer = Trainer(model,
                  training_args,
                  train_dataset = tokenized_dataset["train"],
                  eval_dataset = tokenized_dataset["test"],
                  tokenizer=tokernizer,
                  compute_metrics=compute_metrics,
                  # The next line is important to ensure the dataset labels are properly passed to the model
                  #remove_unused_columns=False
                  )

In [105]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,0.823098,0.633554,0.633554
2,No log,0.722698,0.664459,0.664459
3,No log,0.829439,0.598234,0.598234


TrainOutput(global_step=342, training_loss=0.8076918752569902, metrics={'train_runtime': 91.9528, 'train_samples_per_second': 59.052, 'train_steps_per_second': 3.719, 'total_flos': 109690544949300.0, 'train_loss': 0.8076918752569902, 'epoch': 3.0})

In [106]:
! ls training_dir

checkpoint-114	checkpoint-228	checkpoint-342


In [107]:
trainer.evaluate()

{'eval_loss': 0.8294394612312317,
 'eval_accuracy': 0.5982339955849889,
 'eval_f1_score': 0.5982339955849889,
 'eval_runtime': 1.0898,
 'eval_samples_per_second': 415.682,
 'eval_steps_per_second': 7.341,
 'epoch': 3.0}

**PIPELINE, PREDICTION, SUBMIT**

In [108]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis",
                      model = "training_dir/checkpoint-228")

**PREDICTIONS**

In [109]:
test.to_csv("new_test.csv", index = False)

In [110]:
new_test = load_dataset("csv", data_files ="new_test.csv")
new_test

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweets'],
        num_rows: 755
    })
})

In [111]:
# drop the id
new_test = new_test.remove_columns("ID")
new_test


DatasetDict({
    train: Dataset({
        features: ['Tweets'],
        num_rows: 755
    })
})

In [112]:
# tokenize the testing set
tokenized_test = new_test.map(tokenize_fn, batched = True)

Map:   0%|          | 0/755 [00:00<?, ? examples/s]

In [113]:
model.to('cpu')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [114]:
def get_label(d):
  return int(d['label'].split('_')[1])
#predictions = [get_label(d) for d in prediction]

In [115]:
pred = classifier.predict(tokenized_test['train']['Tweets'])
pred = [get_label(d) for d in pred]

In [116]:
# change back the classes
sub['label'] = pred
sub_map = {1:1, 0:0, 2:-1}
sub['label'] = sub['label'].map(sub_map)

In [117]:
# Making and Saving Predictions to a file which can then be submitted

sub.to_csv('7_is_a_lucky_number_my_friend.csv', index=False)