In [26]:
import re
import random
import json
import uuid
import pandas as pd

# Loading train.csv

In [27]:
import chardet

file_path = "train.csv" 

with open(file_path, "rb") as f:
    raw_data = f.read(100000) 
    result = chardet.detect(raw_data)

print(f"Detected Encoding: {result['encoding']}")

Detected Encoding: ISO-8859-1


In [28]:
file_path = "train.csv"  
df = pd.read_csv(file_path, usecols=['textID', 'text', 'sentiment'], encoding="latin1")

In [29]:
df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [71]:
df['sentiment'].count()

27480

# Checking for duplicate values in textID and text column

In [30]:
duplicate_textID_count = df['textID'].duplicated().sum()
print(f"Number of duplicate textIDs: {duplicate_textID_count}")

Number of duplicate textIDs: 0


In [31]:
\duplicate_text_count = df['text'].duplicated().sum()
print(f"Number of duplicate text values: {duplicate_text_count}")

Number of duplicate text values: 0


# Checking for missing values in text column

In [32]:
missing_text_count = df['text'].isna().sum()
print(f"Number of missing values in 'text': {missing_text_count}")

empty_text_count = (df['text'].str.strip() == '').sum()
print(f"Number of empty text entries: {empty_text_count}")

Number of missing values in 'text': 1
Number of empty text entries: 0


In [33]:
missing_text_rows = df[df['text'].isna()]

print("Rows with missing text:")
print(missing_text_rows)

Rows with missing text:
         textID text sentiment
314  fdb77c3752  NaN   neutral


In [37]:
df = df.dropna(subset=['text'])

# Making sure that there are only 3 labels

In [39]:
df['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [None]:
temp = df.drop(['textID'], axis=1)

In [None]:
temp.to_csv("train_vanilla.csv", encoding="latin1", index=False)

# Using cleanlab to check for wrong labels (using a simple logistic regression model)

In [40]:
# Convert sentiment labels to numerical values
sentiment_mapping = {"negative": 0, "neutral": 1, "positive": 2}
df['sentiment_label'] = df['sentiment'].map(sentiment_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_label'] = df['sentiment'].map(sentiment_mapping)


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])  
y = df['sentiment_label'].values         

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
import numpy as np

clf = LogisticRegression(max_iter=1000)
pred_probs = cross_val_predict(clf, X, y, cv=5, method="predict_proba") 

In [44]:
from cleanlab.filter import find_label_issues

# Identify potential label errors
label_issues = find_label_issues(y, pred_probs)

# Add a column indicating potential label issues
df['label_issue'] = label_issues

# Show samples with potential label errors
incorrect_labels = df[df['label_issue'] == True]

  from .autonotebook import tqdm as notebook_tqdm
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label_issue'] = label_issues


In [48]:
from cleanlab.classification import CleanLearning
clf = CleanLearning(LogisticRegression(max_iter=1000))
clf.fit(X, y)

label_issues = clf.get_label_issues()
print("Potential label issues:", label_issues)

Potential label issues:        is_label_issue  label_quality  given_label  predicted_label  \
0               False       0.689745            1                1   
1               False       0.931146            0                0   
2               False       0.441509            0                1   
3               False       0.456421            0                0   
4                True       0.314842            0                1   
...               ...            ...          ...              ...   
27475           False       0.547392            0                0   
27476            True       0.196894            0                1   
27477           False       0.951319            2                2   
27478           False       0.572459            2                2   
27479           False       0.422144            1                2   

       sample_weight  
0           1.448187  
1           1.246797  
2           1.246797  
3           1.246797  
4           0.000000

In [54]:
label_issues[label_issues["is_label_issue"] == True]

Unnamed: 0,is_label_issue,label_quality,given_label,predicted_label,sample_weight
4,True,0.314842,0,1,0.0
5,True,0.311513,1,2,0.0
12,True,0.214343,0,1,0.0
16,True,0.223929,0,1,0.0
17,True,0.193927,0,1,0.0
...,...,...,...,...,...
27462,True,0.379187,0,1,0.0
27469,True,0.150957,0,1,0.0
27471,True,0.179836,0,1,0.0
27473,True,0.199770,2,0,0.0


# Using a HF model to check for wrong labels

In [58]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
import numpy as np
from scipy.special import softmax

# Load the model and tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

def get_sentiment(text):
    
    encoded_input = tokenizer(text, return_tensors="pt")
    
    with torch.no_grad():
        output = model(**encoded_input)

    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    return config.id2label[ranking[0]].lower()

df['predicted_sentiment'] = df['text'].apply(get_sentiment)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [59]:
mislabeled_df = df[df['predicted_sentiment'] != df['sentiment']]

In [78]:
df.to_csv("train_comparison_result.csv", encoding="latin1", index=False)

In [87]:
mislabeled_df = mislabeled_df.drop(['sentiment_label', 'label_issue'], axis=1)

In [89]:
mislabeled_df.to_csv("mislabeled_sentiment.csv", encoding="latin1", index=False)
print("Mislabeled sentiment data saved as mislabeled_sentiment.csv!")

Mislabeled sentiment data saved as mislabeled_sentiment.csv!


In [99]:
temp = df.drop(['textID','sentiment_label', 'label_issue', 'sentiment'], axis=1)

In [102]:
temp = temp.rename(columns={"predicted_sentiment": "sentiment"})

In [103]:
temp.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [104]:
temp.to_csv("train_clean.csv", encoding="latin1", index=False)