In [62]:
#%pip install -q huggingface_hub
#%pip install -q datasets
#%pip install -q transformers
#%pip install -q accelerate
#%pip install -q gradio
#%pip install -q scikit-learn

In [63]:
import pandas as pd
import numpy as np

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/SampleSubmission.csv')

In [64]:
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [65]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [66]:
duplicates = train.duplicated(subset=['safe_text'])
print(f"Number of duplicates in safe_text col:{duplicates.sum()}")

Number of duplicates in safe_text col:343


In [67]:
#remove duplicates
train = train[~duplicates]
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9658 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   9658 non-null   object 
 1   safe_text  9658 non-null   object 
 2   label      9657 non-null   float64
 3   agreement  9656 non-null   float64
dtypes: float64(2), object(2)
memory usage: 377.3+ KB


In [68]:
train['safe_text']

0        Me &amp; The Big Homie meanboy3000 #MEANBOY #M...
1        I'm 100% thinking of devoting my career to pro...
2        #whatcausesautism VACCINES, DO NOT VACCINATE Y...
3        I mean if they immunize my kid with something ...
4        Thanks to <user> Catch me performing at La Nui...
                               ...                        
9996     Living in a time where the sperm I used to was...
9997     <user> <user>  In spite of all measles outbrea...
9998     Interesting trends in child immunization in Ok...
9999     CDC Says Measles Are At Highest Levels In Deca...
10000    Pneumonia vaccine: for women w risk of pulmona...
Name: safe_text, Length: 9658, dtype: object

In [69]:
train['safe_text']

0        Me &amp; The Big Homie meanboy3000 #MEANBOY #M...
1        I'm 100% thinking of devoting my career to pro...
2        #whatcausesautism VACCINES, DO NOT VACCINATE Y...
3        I mean if they immunize my kid with something ...
4        Thanks to <user> Catch me performing at La Nui...
                               ...                        
9996     Living in a time where the sperm I used to was...
9997     <user> <user>  In spite of all measles outbrea...
9998     Interesting trends in child immunization in Ok...
9999     CDC Says Measles Are At Highest Levels In Deca...
10000    Pneumonia vaccine: for women w risk of pulmona...
Name: safe_text, Length: 9658, dtype: object

In [70]:
#function to clean the text data
#import re
#from nltk.corpus import stopwords
#import nltk

# Ensure the stopwords dataset is downloaded
#nltk.download('stopwords')

#def clean_text(text):
    # Remove unwanted characters and symbols
    #text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove URLs
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove extra spaces
    #text = text.replace('\n', ' ').replace('\r', ' ')
    # Remove leading and trailing spaces
    #text = text.strip()
    # Convert to lowercase
    #text = text.lower()
    # Remove special characters 
    #text = re.sub(r'\W+', ' ', text)

    #return text

#### Transformers using RoBERTa

##### The RoBERTa tokenizer is a subword tokenizer that uses byte pair encoding (BPE) to handle out-of-vocabulary words.    

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
#function to tokenize the text data
def tokenize_function(data):
    return tokenizer(data["safe_text"], padding="max_length", truncation=True, max_length=512)

In [73]:
# applying the tokenize function to the train and test data
train_tokenized = train["safe_text"].apply(lambda x: tokenize_function({"safe_text": x}))

In [74]:
# Convert the tokenized data to a format suitable for the model
train_tokenized = pd.DataFrame(train_tokenized.tolist())

In [75]:
train_tokenized.head()

Unnamed: 0,attention_mask,input_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 5096, 359, 3914, 131, 20, 1776, 11858, 324..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 100, 437, 727, 207, 2053, 9, 8709, 12653, ..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 10431, 12196, 3245, 9764, 4255, 1809, 468,..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 100, 1266, 114, 51, 13998, 2072, 127, 4607..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 22086, 7, 28696, 12105, 15698, 15568, 162,..."


In [76]:
#drop null values
train_tokenized = train_tokenized.dropna()

In [77]:
#drop numll values in labels column
train["label"].dropna(inplace=True)

In [78]:
train['label']

0        0.0
1        1.0
2       -1.0
3       -1.0
4        0.0
        ... 
9996     1.0
9997     1.0
9998     0.0
9999     0.0
10000    1.0
Name: label, Length: 9658, dtype: float64

In [79]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9658 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   9658 non-null   object 
 1   safe_text  9658 non-null   object 
 2   label      9657 non-null   float64
 3   agreement  9656 non-null   float64
dtypes: float64(2), object(2)
memory usage: 377.3+ KB


In [80]:
train.isna().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [81]:
#remove rows with null values in the train data
train = train.dropna()
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9656 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   9656 non-null   object 
 1   safe_text  9656 non-null   object 
 2   label      9656 non-null   float64
 3   agreement  9656 non-null   float64
dtypes: float64(2), object(2)
memory usage: 377.2+ KB


In [82]:
train_tokenized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9658 entries, 0 to 9657
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   attention_mask  9658 non-null   object
 1   input_ids       9658 non-null   object
dtypes: object(2)
memory usage: 151.0+ KB
