In [4]:


import pandas as pd
import html
import re

# --- Configuration: dataset locations and their (header-less) column layouts ---
FILE_PATH = "dataset/sentiment140/training.1600000.processed.noemoticon.csv"
column_names = ['label', 'id', 'date', 'query', 'user', 'raw_text']

neutral_column_names = ['id', 'entity', 'sentiment', 'raw_text']
NEUTRAL_FILE_PATH = "dataset/twitter-sentiment-neutral/twitter_training.csv"


# Load the Sentiment140 dataset (df = dataframe).
# Only `label` and `raw_text` are used, so `usecols` skips materializing the
# other four columns; the resulting columns keep the file order: label, raw_text.
df = pd.read_csv(
  FILE_PATH,
  encoding='ISO-8859-1',  # Sentiment140 is an old dataset with special characters
  header=None,            # the file has no header row...
  names=column_names,     # ...so column names are supplied explicitly
  usecols=['label', 'raw_text'],
)

# Load the second dataset in full: `df_neutral_raw` stays available with all
# of its columns in case other cells inspect it.
df_neutral_raw = pd.read_csv(
    NEUTRAL_FILE_PATH,
    encoding='utf-8',
    header=None,
    names=neutral_column_names
)

# Keep only rows tagged Neutral or Irrelevant, keep only the two needed
# columns, and relabel every kept row as class 1. The original row index of
# `df_neutral_raw` is preserved.
df_neutral = (
    df_neutral_raw
    .loc[
        df_neutral_raw['sentiment'].isin(['Neutral', 'Irrelevant']),
        ['sentiment', 'raw_text'],
    ]
    .rename(columns={'sentiment': 'label'})  # same column name as Sentiment140
    .assign(label=1)
)

print(f"\nSentiment140 loaded. Rows: {len(df)}")
print("\nFirst 5 rows for inspection:\n", df.head())

print(f"\nTwitter-sentiment-neutral loaded. Rows: {len(df_neutral)}")
print("\nFirst 5 rows for inspection:\n", df_neutral.head())


Sentiment140 loaded. Rows: 1600000

First 5 rows for inspection:
    label                                           raw_text
0      0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1      0  is upset that he can't update his Facebook by ...
2      0  @Kenichan I dived many times for the ball. Man...
3      0    my whole body feels itchy and like its on fire 
4      0  @nationwideclass no, it's not behaving at all....

Twitter-sentiment-neutral loaded. Rows: 31308

First 5 rows for inspection:
     label                                           raw_text
12      1  Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM...
13      1  Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM...
14      1  Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM...
15      1  Rock-Hard La Vita, RARE BUT POWERFUL, HANDSOME...
16      1  Live Rock - Hard music La la Varlope, RARE & t...


In [5]:
def clean_sentiment140_text(text):
  """Normalize a raw Sentiment140 tweet for modelling.

  Steps, in order: decode HTML entities, lowercase, strip URLs, strip
  @mentions, collapse whitespace.

  Args:
    text: The raw tweet. Any non-string value (e.g. NaN) is treated as empty.

  Returns:
    The cleaned, lowercased string; "" when `text` is not a string.
  """
  if not isinstance(text, str):
    return ""

  # Decode entities BEFORE lowercasing: entity names are case-sensitive and a
  # decoded character (e.g. "&#65;" -> "A") must still be lowercased.
  text = html.unescape(text).lower()

  # Strip URLs BEFORE mentions: a URL or e-mail that contains "@name" would
  # otherwise be split by the mention replacement and leave its tail behind.
  url_pattern = r'https?://\S+|www\.\S+|\S+\.com\S*'
  text = re.sub(url_pattern, '', text)

  # Remove @mentions.
  text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)

  # Collapse runs of whitespace into one space and trim the ends.
  text = re.sub(r'\s+', ' ', text).strip()

  return text


In [6]:
def clean_twitter_sentiment_text(text):
  """Normalize a raw message from the twitter-sentiment dataset.

  Steps, in order: decode HTML entities, lowercase, drop `<unk>` / `‼`
  noise tokens and straight/curly double quotes, strip URLs, strip
  @mentions, collapse whitespace.

  Args:
    text: The raw message. Any non-string value (e.g. NaN) is treated as empty.

  Returns:
    The cleaned, lowercased string; "" when `text` is not a string.
  """
  if not isinstance(text, str):
    return ""

  # Decode entities BEFORE lowercasing: entity names are case-sensitive and a
  # decoded character (e.g. "&#65;" -> "A") must still be lowercased.
  text = html.unescape(text).lower()

  # Remove the unknown-token marker and the double-exclamation glyph (noise).
  text = text.replace('<unk>', ' ').replace('‼', ' ')

  # Remove straight and curly double quotes.
  text = text.replace('"', '').replace('“', '').replace('”', '')

  # Strip URLs BEFORE mentions: a URL that contains "@name" would otherwise be
  # split by the mention replacement and leave its tail behind.
  # NOTE(review): the last alternative matches any "word.word" token, so it
  # also removes sentences joined without a space after the period.
  url_pattern = r'https?://\S+|www\.\S+|[a-zA-Z0-9]+\.[a-z]+\S*'
  text = re.sub(url_pattern, ' ', text)

  # Remove @mentions.
  text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)

  # Collapse runs of whitespace into one space and trim the ends.
  text = re.sub(r'\s+', ' ', text).strip()

  return text

In [7]:
print('Cleanup function for sentiment140 and twitter neutral messages')

# Reduce Sentiment140 to 500K rows for fast preliminary training.
# Sampling happens BEFORE cleaning: the rows picked by a fixed random_state do
# not depend on which columns exist, so the result is the same while the regex
# cleanup runs only on the rows that are kept.
N_SUBSAMPLE = 500_000
if len(df) > N_SUBSAMPLE:
  print(f"Subsampling at {N_SUBSAMPLE} rows...")
  df = df.sample(n=N_SUBSAMPLE, random_state=42)

# Relabel positives from 4 to 2 and add the cleaned text column.
df = df.assign(
  label=df['label'].replace(4, 2),
  clean_text=df['raw_text'].apply(clean_sentiment140_text),
)
df_neutral = df_neutral.assign(
  clean_text=df_neutral['raw_text'].apply(clean_twitter_sentiment_text),
)

# Keep only the columns needed downstream.
# NOTE(review): rows whose text cleans down to "" are kept as-is — confirm
# whether they should be dropped before training.
df_final = df[['label', 'clean_text']].copy()
df_neutral_final = df_neutral[['label', 'clean_text']].copy()

# Concatenate Sentiment140 with the neutral twitter messages (fresh 0..n-1 index).
df_final = pd.concat([df_final, df_neutral_final], ignore_index=True)

# Final verification
print("\nFinal verification after cleanup and subsampling:")
print("Label distribution: \n", df_final['label'].value_counts().sort_index())  # sorted so labels appear as 0, 1, 2
print("\nExample of final clean text (Combined Dataset):")
print(df_final.sample(10, random_state=42))

Cleanup function for sentiment140 and twitter neutral messages
Subsampling at 500000 rows...

Final verification after cleanup and subsampling:
Label distribution: 
 label
0    249375
1     31308
2    250625
Name: count, dtype: int64

Example of final clean text (Combined Dataset):
        label                                         clean_text
29380       2  morning everyone, or afternoon or good evening...
195844      2  @ hey, i see interesting musical tastes. thank...
253761      0  really wish i didn't get 8.30am phone calls ev...
209650      0  oops haha. delayed i am. i think mine cant hol...
475480      2                                        you're sexy
132117      2                                          cool pics
393225      0  is impressed by night at the museum 2 (: but i...
390997      2                    woody says i have a magic smile
120192      2         i am also good thank you up to much today?
303052      0             yeah i nearly broke my ankle yesterday
