# Preprocessing and Traditional Language Representation for Text Classification using Scikit-Learn

This notebook explores several text representation techniques for Natural Language Processing and Information Retrieval tasks. It preprocesses a dataset of tweets and then builds the following vector representations of them:


*   Bag of Words (binary and term-frequency)
*   TF-IDF
*   Bigram and trigram models
*   Latent Semantic Analysis (LSA) with Singular Value Decomposition (SVD)


For each representation, the notebook then uses cosine similarity to find the most similar pair of tweets, separately within the sexist tweets and within the non-sexist tweets. All implementations use Scikit-Learn's text processing tools.

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# The dataset is later opened from "./drive/MyDrive/...", which presumably resolves
# under this mount when the working directory is /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Some help, if you need it

In [None]:
!pip install pandas
!pip install nltk
!pip install scikit-learn

import nltk
nltk.download('gutenberg')
# from nltk.book import texts
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import csv
import re



[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


## Load the dataset

In [None]:
# Path of the dataset file handed to CSVReader below, which parses it as
# tab-separated text (delimiter='\t') despite the .csv extension.
filename = "./drive/MyDrive/EXIST2024_EN_examples.csv"
# Alternative location: the same file in the current working directory.
# filename = "EXIST2024_EN_examples.csv"

class CSVReader:
    """Load the tab-separated tweet dataset into a list of plain dictionaries.

    Every parsed row is stored as
    ``{"id": int, "text": str, "is_sexist": bool, "size": int}``.
    """

    def __init__(self, file_path):
        """Remember where the file lives; nothing is read until ``read_csv`` runs.

        Args:
            file_path: path of the tab-separated file with the columns
                ``id``, ``text``, ``label`` and ``size``.
        """
        self.file_path = file_path
        self.data = []

    def read_csv(self):
        """Parse the file and store its rows in ``self.data``.

        The stored rows are replaced on every call, so re-running the method
        (for example by re-executing a notebook cell) never duplicates tweets.

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if one of the expected columns is missing.
            ValueError: if ``id`` or ``size`` is not an integer.
        """
        # newline='' hands newline handling to the csv module, which is what its
        # documentation requires so that line breaks inside quoted fields survive.
        with open(self.file_path, mode='r', encoding='utf-8', newline='') as file:
            reader = csv.DictReader(file, delimiter='\t')
            rows = [
                {
                    "id": int(row["id"]),
                    "text": row["text"],
                    # The label column holds YES/NO; tolerate case and padding.
                    "is_sexist": row["label"].strip().upper() == "YES",
                    "size": int(row["size"]),
                }
                for row in reader
            ]

        # Assign only after a fully successful parse, so a failure half-way
        # through the file leaves the previously loaded data untouched.
        self.data = rows

    def get_data(self):
        """Return the list of row dictionaries loaded by the last ``read_csv`` call."""
        return self.data

# Load the whole dataset into memory once; the following cells obtain the
# tweets through reader.get_data().
reader = CSVReader(filename)
reader.read_csv()

## Preprocess dataset

In [None]:
# Matches an http:// or https:// link up to (not including) the next whitespace.
web_re = re.compile(r"https?:\/\/[^\s]+", re.UNICODE)


def preprocess(text):
    """Return ``text`` with links removed and every character lower-cased.

    Links are stripped first (the pattern is case-sensitive), then the
    remaining text is lower-cased. Whitespace around a removed link is kept.
    """
    without_links = web_re.sub("", text)
    return without_links.lower()

# Overwrite each tweet's "text" in place with its preprocessed form, so every
# later cell that calls reader.get_data() works on the cleaned text.
for tweet in reader.get_data():
    tweet["text"] = preprocess(tweet["text"])

In [None]:
def compute_similarity(MyCorpus, bow, sexist):
    """Print the two most similar distinct documents of a corpus.

    Args:
        MyCorpus: sequence of document texts, aligned row by row with ``bow``.
        bow: feature matrix (one row per document) in any form accepted by
            ``cosine_similarity``.
        sexist: truthy when the corpus holds the sexist tweets; only used to
            print the ``YES``/``NO`` label.

    Prints the label, the two documents and their cosine similarity (four
    decimals). Returns ``None``.

    A document is never paired with itself: with fewer than two documents a
    short notice is printed instead, and when every pair has similarity 0 the
    first pair is reported with ``0.0000``.
    """
    # Local import: the notebook's import cell does not bring numpy into scope.
    import numpy as np

    label = "YES" if sexist else "NO"
    print(f"label: {label}")

    if len(MyCorpus) < 2:
        print("Fewer than two tweets: no pair to compare.")
        return

    cosine_sim = np.array(cosine_similarity(bow), dtype=float)

    # Self-similarity sits on the diagonal; push it below every real value so
    # the search can only select two different documents.
    np.fill_diagonal(cosine_sim, -np.inf)

    # argmax returns the first maximum in row-major order, i.e. the same pair a
    # nested row/column scan keeping strictly greater values would select.
    first, second = np.unravel_index(np.argmax(cosine_sim), cosine_sim.shape)

    print(f"1: {MyCorpus[first]}\n2: {MyCorpus[second]}")
    print(f"Cosine Similarity: {cosine_sim[first, second]:.4f}")

In [None]:
print("Bag of Words (binary)\n=====================\n")

# Split the preprocessed tweets by class; both corpora are reused by later cells.
MyCorpus_sexist = [entry["text"] for entry in reader.get_data() if entry["is_sexist"]]
MyCorpus_nonsexist = [entry["text"] for entry in reader.get_data() if not entry["is_sexist"]]

# Non-sexist tweets: presence/absence vectors over their own vocabulary.
vectorizer_bin = CountVectorizer(binary=True)
X_bag_of_words_bin = vectorizer_bin.fit_transform(MyCorpus_nonsexist)
compute_similarity(MyCorpus_nonsexist, X_bag_of_words_bin, False)

print("--------------------------------")

# Sexist tweets: a fresh vectorizer, so the vocabulary comes from this class only.
vectorizer_bin = CountVectorizer(binary=True)
X_bag_of_words_bin = vectorizer_bin.fit_transform(MyCorpus_sexist)
compute_similarity(MyCorpus_sexist, X_bag_of_words_bin, True)

Bag of Words (binary)

label: NO
1: ♫ now playing : treat her like a lady (single version) by the temptations  
2: now playing:  the temptations - treat her like a lady   listen live: 
Cosine Similarity: 0.7628
--------------------------------
label: YES
1: @yayroger @victoriarossi @ionaguyf @metsdaddy2013 @byandrewwagner @themikebpeters dr. cox, does this shade of red make me look like a clown?no barbie. it makes you look like a prostitute who caters exclusively to clowns.
2: sharon: oh, loki, does this lipstick make me look like a clown?loki: no, barbie, no... it makes you look like a prostitute who caters exclusively *to* clowns.
Cosine Similarity: 0.7126


In [None]:
print("Bag of Words (Term-Frequency) without normalization\n=====================\n")

# Non-sexist tweets: raw term counts over their own vocabulary.
vectorizer_freq = CountVectorizer(binary=False)
X_bag_of_words_freq = vectorizer_freq.fit_transform(MyCorpus_nonsexist)
compute_similarity(MyCorpus_nonsexist, X_bag_of_words_freq, False)

print("--------------------------------")

# Sexist tweets: a fresh vectorizer, so the vocabulary comes from this class only.
vectorizer_freq = CountVectorizer(binary=False)
X_bag_of_words_freq = vectorizer_freq.fit_transform(MyCorpus_sexist)
compute_similarity(MyCorpus_sexist, X_bag_of_words_freq, True)

Bag of Words (Term-Frequency) without normalization

label: NO
1: @bleedthisway replay free woman breebylon &gt;&gt;&gt; flop this way
2: replay&gt;alice&gt;babylon&gt;free woman 
Cosine Similarity: 0.7778
--------------------------------
label: YES
1: @yayroger @victoriarossi @ionaguyf @metsdaddy2013 @byandrewwagner @themikebpeters dr. cox, does this shade of red make me look like a clown?no barbie. it makes you look like a prostitute who caters exclusively to clowns.
2: sharon: oh, loki, does this lipstick make me look like a clown?loki: no, barbie, no... it makes you look like a prostitute who caters exclusively *to* clowns.
Cosine Similarity: 0.7247


In [None]:
def compute_similarity2(MyCorpus, bow, sexist, representation):
    """Print the most similar pair of documents for one representation.

    Thin wrapper around ``compute_similarity`` (defined in an earlier cell),
    kept so that existing four-argument calls continue to work.

    Args:
        MyCorpus: sequence of document texts, aligned row by row with ``bow``.
        bow: feature matrix (one row per document) in any form accepted by
            ``cosine_similarity``.
        sexist: truthy when the corpus holds the sexist tweets; selects the
            printed ``YES``/``NO`` label.
        representation: free-text name of the representation. It is accepted
            for readability at the call site and does not affect the output.

    Returns ``None``; the result is printed.
    """
    # Delegate instead of keeping a second copy of the pair search, so both
    # entry points always behave identically.
    compute_similarity(MyCorpus, bow, sexist)

In [None]:
print("Bigrams of Words (Term-Frequency) without normalization\n=====================\n")

# Word-level vectorizer restricted to bigrams (ngram_range=(2, 2)), raw counts.
bigram_vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(2, 2),
    binary=False,
    stop_words=None,
    preprocessor=None,
)

# Non-sexist tweets. fit_transform re-learns the vocabulary on every call, so
# the same vectorizer object can be reused for the second class.
counts = bigram_vectorizer.fit_transform(MyCorpus_nonsexist)
compute_similarity2(MyCorpus_nonsexist, counts, False, "bigrams representation")

print("--------------------------------")

# Sexist tweets.
counts = bigram_vectorizer.fit_transform(MyCorpus_sexist)
compute_similarity2(MyCorpus_sexist, counts, True, "bigrams representation")


Bigrams of Words (Term-Frequency) without normalization

label: NO
1: earth angel (androgynous mind) 
2: average hacker fan vs average earth angel (androgynous mind) enthusiast
Cosine Similarity: 0.5774
--------------------------------
label: YES
1: @yayroger @victoriarossi @ionaguyf @metsdaddy2013 @byandrewwagner @themikebpeters dr. cox, does this shade of red make me look like a clown?no barbie. it makes you look like a prostitute who caters exclusively to clowns.
2: sharon: oh, loki, does this lipstick make me look like a clown?loki: no, barbie, no... it makes you look like a prostitute who caters exclusively *to* clowns.
Cosine Similarity: 0.6124


In [None]:
# analyzer='char_wb' builds n-grams of characters taken inside word boundaries,
# so with ngram_range=(3, 3) the features are character trigrams rather than
# three-word sequences. stop_words only applies to the 'word' analyzer.
trigram_vectorizer = CountVectorizer(analyzer='char_wb',
                        ngram_range=(3,3),binary = False, stop_words = None)

# Title corrected: it used to announce word trigrams, which is not what the
# vectorizer above computes.
print("Character trigrams (Term-Frequency) without normalization\n=====================\n")

# Non-sexist tweets. fit_transform re-learns the vocabulary on every call.
counts = trigram_vectorizer.fit_transform(MyCorpus_nonsexist)
compute_similarity2(MyCorpus_nonsexist, counts, False, "trigrams representation")
print("--------------------------------")

# Sexist tweets.
counts = trigram_vectorizer.fit_transform(MyCorpus_sexist)
compute_similarity2(MyCorpus_sexist, counts, True, "trigrams representation")

Trigram of Words (Term-Frequency) without normalization

label: NO
1: ♫ now playing : treat her like a lady (single version) by the temptations  
2: now playing:  the temptations - treat her like a lady   listen live: 
Cosine Similarity: 0.7100
--------------------------------
label: YES
1: @yayroger @victoriarossi @ionaguyf @metsdaddy2013 @byandrewwagner @themikebpeters dr. cox, does this shade of red make me look like a clown?no barbie. it makes you look like a prostitute who caters exclusively to clowns.
2: sharon: oh, loki, does this lipstick make me look like a clown?loki: no, barbie, no... it makes you look like a prostitute who caters exclusively *to* clowns.
Cosine Similarity: 0.6660


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
print("TF-IDF based on words with 'l2' normalization \n=====================\n")

# Word-level TF-IDF. norm="l2" is the library default and is spelled out only
# because the title mentions it. The LSA cells below reuse this vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=None,
    binary=False,
    use_idf=True,
    norm="l2",
    preprocessor=None,
)

# Non-sexist tweets. fit_transform re-learns vocabulary and IDF weights per class.
counts = tfidf_vectorizer.fit_transform(MyCorpus_nonsexist)
compute_similarity2(MyCorpus_nonsexist, counts, False, "")

print("--------------------------------")

# Sexist tweets.
counts = tfidf_vectorizer.fit_transform(MyCorpus_sexist)
compute_similarity2(MyCorpus_sexist, counts, True, "")


TF-IDF based on words with 'l2' normalization 

label: NO
1: @bleedthisway replay free woman breebylon &gt;&gt;&gt; flop this way
2: replay&gt;alice&gt;babylon&gt;free woman 
Cosine Similarity: 0.7384
--------------------------------
label: YES
1: @yayroger @victoriarossi @ionaguyf @metsdaddy2013 @byandrewwagner @themikebpeters dr. cox, does this shade of red make me look like a clown?no barbie. it makes you look like a prostitute who caters exclusively to clowns.
2: sharon: oh, loki, does this lipstick make me look like a clown?loki: no, barbie, no... it makes you look like a prostitute who caters exclusively *to* clowns.
Cosine Similarity: 0.5112


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

print("LSA based on TF-IDF of words (50 singular values)\n=====================\n")

# TruncatedSVD uses a randomized solver by default; fixing random_state makes
# the projection, and therefore the reported pair and score, reproducible
# across kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=42)

# Non-sexist tweets: TF-IDF matrix (vectorizer from the TF-IDF cell) reduced
# to 50 latent dimensions.
tfidfMatrix_nonsexist = tfidf_vectorizer.fit_transform(MyCorpus_nonsexist)
svdMatrix_nonsexist = svd.fit_transform(tfidfMatrix_nonsexist)
compute_similarity2(MyCorpus_nonsexist, svdMatrix_nonsexist, False, "LSA for 50 representation")

print("--------------------------------")

# Sexist tweets: both the vectorizer and the SVD are refitted on this class.
tfidfMatrix_sexist = tfidf_vectorizer.fit_transform(MyCorpus_sexist)
svdMatrix_sexist = svd.fit_transform(tfidfMatrix_sexist)
compute_similarity2(MyCorpus_sexist, svdMatrix_sexist, True, "LSA for 50 representation")


LSA based on TF-IDF of words (50 singular values)

label: NO
1: @mainetrendteam @mainedcm androgynous (mind).mainexlazada 1111supershow#lazadaxmaine @mainedcm #maineforlazada1111 #mainemendoza @mainedcm
2: earth angel (androgynous mind) 
Cosine Similarity: 0.9990
--------------------------------
label: YES
1: @twinklybunny perfect young body. i would like to fuck you.
2: @misskatie2021 i would like to fuck you 😂😂😂à french fan
Cosine Similarity: 0.9859


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

print("LSA based on TF-IDF of words (100 singular values)\n=====================\n")

# TruncatedSVD uses a randomized solver by default; fixing random_state makes
# the projection, and therefore the reported pair and score, reproducible
# across kernel restarts.
svd = TruncatedSVD(n_components=100, random_state=42)

# Non-sexist tweets: TF-IDF matrix (vectorizer from the TF-IDF cell) reduced
# to 100 latent dimensions.
tfidfMatrix_nonsexist = tfidf_vectorizer.fit_transform(MyCorpus_nonsexist)
svdMatrix_nonsexist = svd.fit_transform(tfidfMatrix_nonsexist)

compute_similarity2(MyCorpus_nonsexist, svdMatrix_nonsexist, False, "LSA for 100 representation")
print("--------------------------------")

# Sexist tweets: both the vectorizer and the SVD are refitted on this class.
tfidfMatrix_sexist = tfidf_vectorizer.fit_transform(MyCorpus_sexist)
svdMatrix_sexist = svd.fit_transform(tfidfMatrix_sexist)

compute_similarity2(MyCorpus_sexist, svdMatrix_sexist, True, "LSA for 100 representation")


LSA based on TF-IDF of words (100 singular values)

label: NO
1: @mainetrendteam @mainedcm androgynous (mind).mainexlazada 1111supershow#lazadaxmaine @mainedcm #maineforlazada1111 #mainemendoza @mainedcm
2: earth angel (androgynous mind) 
Cosine Similarity: 0.9985
--------------------------------
label: YES
1: @yayroger @victoriarossi @ionaguyf @metsdaddy2013 @byandrewwagner @themikebpeters dr. cox, does this shade of red make me look like a clown?no barbie. it makes you look like a prostitute who caters exclusively to clowns.
2: sharon: oh, loki, does this lipstick make me look like a clown?loki: no, barbie, no... it makes you look like a prostitute who caters exclusively *to* clowns.
Cosine Similarity: 0.9745
