In [17]:
#%pip install -q huggingface_hub
#%pip install -q datasets
#%pip install -q transformers
#%pip install -q accelerate
#%pip install -q gradio
#%pip install -q scikit-learn

In [18]:
import pandas as pd
import numpy as np

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/SampleSubmission.csv')

In [19]:
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [21]:
duplicates = train.duplicated(subset=['safe_text'])
print(f"Number of duplicates in safe_text col:{duplicates.sum()}")

Number of duplicates in safe_text col:343


In [22]:
#remove duplicates
train = train[~duplicates]
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9658 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   9658 non-null   object 
 1   safe_text  9658 non-null   object 
 2   label      9657 non-null   float64
 3   agreement  9656 non-null   float64
dtypes: float64(2), object(2)
memory usage: 377.3+ KB


In [23]:
train['safe_text']

0        Me &amp; The Big Homie meanboy3000 #MEANBOY #M...
1        I'm 100% thinking of devoting my career to pro...
2        #whatcausesautism VACCINES, DO NOT VACCINATE Y...
3        I mean if they immunize my kid with something ...
4        Thanks to <user> Catch me performing at La Nui...
                               ...                        
9996     Living in a time where the sperm I used to was...
9997     <user> <user>  In spite of all measles outbrea...
9998     Interesting trends in child immunization in Ok...
9999     CDC Says Measles Are At Highest Levels In Deca...
10000    Pneumonia vaccine: for women w risk of pulmona...
Name: safe_text, Length: 9658, dtype: object

In [24]:
train['safe_text']

0        Me &amp; The Big Homie meanboy3000 #MEANBOY #M...
1        I'm 100% thinking of devoting my career to pro...
2        #whatcausesautism VACCINES, DO NOT VACCINATE Y...
3        I mean if they immunize my kid with something ...
4        Thanks to <user> Catch me performing at La Nui...
                               ...                        
9996     Living in a time where the sperm I used to was...
9997     <user> <user>  In spite of all measles outbrea...
9998     Interesting trends in child immunization in Ok...
9999     CDC Says Measles Are At Highest Levels In Deca...
10000    Pneumonia vaccine: for women w risk of pulmona...
Name: safe_text, Length: 9658, dtype: object

In [25]:
#function to clean the text data
import re
from nltk.corpus import stopwords
import nltk

# Ensure the stopwords dataset is downloaded
nltk.download('stopwords')

def clean_text(text):
    # Remove unwanted characters and symbols
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove extra spaces
    text = text.replace('\n', ' ').replace('\r', ' ')
    # Remove leading and trailing spaces
    text = text.strip()
    # Convert to lowercase
    text = text.lower()
    # Remove special characters 
    text = re.sub(r'\W+', ' ', text)

    return text

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [26]:
#call the function to clean the text data``
train['safe_text'] = train['safe_text'].apply(clean_text)
train['safe_text']

0        me amp the big homie meanboy3000 meanboy mb mb...
1        im 100 thinking of devoting my career to provi...
2        whatcausesautism vaccines do not vaccinate you...
3        i mean if they immunize my kid with something ...
4        thanks to user catch me performing at la nuit ...
                               ...                        
9996     living in a time where the sperm i used to was...
9997     user user in spite of all measles outbreaks ju...
9998     interesting trends in child immunization in ok...
9999     cdc says measles are at highest levels in deca...
10000    pneumonia vaccine for women w risk of pulmonar...
Name: safe_text, Length: 9658, dtype: object

In [27]:
train.isnull().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [28]:
train.describe()

Unnamed: 0,label,agreement
count,9657.0,9656.0
mean,0.301612,0.85446
std,0.65066,0.180632
min,-1.0,0.333333
25%,0.0,0.666667
50%,0.0,1.0
75%,1.0,1.0
max,1.0,1.0


In [29]:
#correlation between label and agreement
train['label'].corr(train['agreement'])

0.14141671170068723

In [30]:
train['agreement'].unique()

array([1.        , 0.66666667, 0.33333333,        nan])

#### Transformation

In [32]:
train.sample(10)

Unnamed: 0,tweet_id,safe_text,label,agreement
1136,BHFW704G,simple basic info immunization vaccines techno...,0.0,0.666667
7463,HVMCRWAQ,user ask question parents what would u do if u...,-1.0,1.0
2842,NI4B2XS6,these guys came into work today talking about ...,0.0,1.0
8931,Z0OBE6YV,user completely agree that being said this peo...,1.0,1.0
1156,91WTXZ3U,loudoun health district director provides us a...,0.0,1.0
5995,W8XVE7MZ,user user to the parent of the unvaccinated ch...,-1.0,0.333333
479,YPYVJ0OA,amnews hiv vaccine a reality by 2030 bill gate...,1.0,0.666667
9549,Z1NS0KXO,measles psssh that shit is played im on that r...,0.0,1.0
834,KHAKLPEY,user vaccinate your kids mike url,1.0,1.0
9837,NABL5RM1,malaria my ass that nurse vaccinated me to wat...,0.0,0.666667
