# **English Text Preprocessing and Cleaning Pipeline**

### ***Imports***

In [48]:
from tqdm import tqdm
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from datasets import load_dataset
import google.generativeai as genai
from pandarallel import pandarallel

In [14]:
# Fetch the NLTK resources this notebook relies on: 'stopwords' backs
# stopwords.words('english') and 'wordnet' backs WordNetLemmatizer; 'punkt'
# is the model used by word_tokenize, which is imported above.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fares\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fares\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fares\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### ***Cleaning***

1. **Loading the dataset and keeping only the part that will be used: rows where the subtask is "Bias", restricted to the `label` and `english_mt` columns**

In [28]:
# Load the FIGNEWS-2024 dataset by its identifier; the returned object holds
# one dataset per split.
ds = load_dataset("CAMeL-Lab/FIGNEWS-2024")

In [29]:
# Turn every split into a pandas frame and stack them into a single frame
# with a fresh 0..n-1 index.
df = pd.concat(
    (ds[split_name].to_pandas() for split_name in ds),
    ignore_index=True,
)

df.head()

Unnamed: 0,batch,source_language,id,type,sheet_type,team_name,annotator_id,subtask,label,text,arabic_mt,english_mt,notes
0,B01,English,1,MAIN,MAIN,SQUad,2,Bias,Biased against others,Yemen's Houthis have waded into the Israel-Ham...,خاض الحوثيون في اليمن الحرب بين إسرائيل وحماس ...,Yemen's Houthis have waded into the Israel-Ham...,
1,B01,English,2,MAIN,MAIN,SQUad,2,Bias,Unbiased,Isreal - Hamas Conflict | Face to Face,إسرائيل - الصراع مع حماس | وجها لوجه,Isreal - Hamas Conflict | Face to Face,
2,B01,English,4,MAIN,MAIN,SQUad,2,Bias,Biased against others,Videos show how armed men from Gaza stormed a ...,أظهرت مقاطع فيديو كيف اقتحم مسلحون من غزة مهرج...,Videos show how armed men from Gaza stormed a ...,
3,B01,English,7,MAIN,MAIN,SQUad,2,Bias,Unclear,Protest in Aligarh Muslim University in suppor...,وقفة احتجاجية في جامعة عليكرة الإسلامية دعما ل...,Protest in Aligarh Muslim University in suppor...,
4,B01,English,8,MAIN,MAIN,SQUad,2,Bias,Biased against both Palestine and Israel,IDF releases audio recording about misfired ro...,الجيش الإسرائيلي ينشر تسجيلًا صوتيًا حول صاروخ...,IDF releases audio recording about misfired ro...,


In [30]:
# Keep only the rows annotated for the "Bias" subtask. The explicit .copy()
# makes the result an independent frame, so adding or overwriting columns on
# it later does not raise pandas' SettingWithCopyWarning.
df = df[df["subtask"] == "Bias"].copy()

df.head()

Unnamed: 0,batch,source_language,id,type,sheet_type,team_name,annotator_id,subtask,label,text,arabic_mt,english_mt,notes
0,B01,English,1,MAIN,MAIN,SQUad,2,Bias,Biased against others,Yemen's Houthis have waded into the Israel-Ham...,خاض الحوثيون في اليمن الحرب بين إسرائيل وحماس ...,Yemen's Houthis have waded into the Israel-Ham...,
1,B01,English,2,MAIN,MAIN,SQUad,2,Bias,Unbiased,Isreal - Hamas Conflict | Face to Face,إسرائيل - الصراع مع حماس | وجها لوجه,Isreal - Hamas Conflict | Face to Face,
2,B01,English,4,MAIN,MAIN,SQUad,2,Bias,Biased against others,Videos show how armed men from Gaza stormed a ...,أظهرت مقاطع فيديو كيف اقتحم مسلحون من غزة مهرج...,Videos show how armed men from Gaza stormed a ...,
3,B01,English,7,MAIN,MAIN,SQUad,2,Bias,Unclear,Protest in Aligarh Muslim University in suppor...,وقفة احتجاجية في جامعة عليكرة الإسلامية دعما ل...,Protest in Aligarh Muslim University in suppor...,
4,B01,English,8,MAIN,MAIN,SQUad,2,Bias,Biased against both Palestine and Israel,IDF releases audio recording about misfired ro...,الجيش الإسرائيلي ينشر تسجيلًا صوتيًا حول صاروخ...,IDF releases audio recording about misfired ro...,


In [31]:
# Keep just the English text and its label under readable column names.
# Selecting and renaming (instead of first writing new columns onto the
# filtered frame) avoids assigning into a slice, and the final .copy() gives
# an independent frame that later cells can add columns to without a
# SettingWithCopyWarning.
df = (
    df[["english_mt", "label"]]
    .rename(columns={"english_mt": "English Text", "label": "Bias"})
    .copy()
)

df.head()

Unnamed: 0,English Text,Bias
0,Yemen's Houthis have waded into the Israel-Ham...,Biased against others
1,Isreal - Hamas Conflict | Face to Face,Unbiased
2,Videos show how armed men from Gaza stormed a ...,Biased against others
3,Protest in Aligarh Muslim University in suppor...,Unclear
4,IDF releases audio recording about misfired ro...,Biased against both Palestine and Israel


In [35]:
# Show the full, untruncated text of the row labelled 0.
df.loc[0, 'English Text']

"Yemen's Houthis have waded into the Israel-Hamas war raging more than 1,000 miles from their seat of power in Sanaa, declaring they fired drones and missiles at Israel in attacks that highlight the regional risks of the conflict https://reut.rs/40jSi7A:=:https://www.reuters.com/world/middle-east/yemens-houthis-enter-mideast-fray-hardening-spillover-fears-2023-10-31/"

**2. Basic Analysis**

In [32]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88500 entries, 0 to 129799
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   English Text  88500 non-null  object
 1   Bias          88500 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [33]:
# Count / unique / most frequent value per column; comparing "count" with
# "unique" shows how many texts are repeated across rows.
df.describe()

Unnamed: 0,English Text,Bias
count,88500,88500
unique,14456,7
top,BREAKING...,Unbiased
freq,142,37862


**3. Showing Class Imbalance**

In [34]:
# Number of rows per label, in descending order.
df['Bias'].value_counts()

Bias
Unbiased                                    37862
Biased against Palestine                    25810
Biased against Israel                        9679
Biased against others                        5638
Unclear                                      5624
Not Applicable                               2481
Biased against both Palestine and Israel     1406
Name: count, dtype: int64

**4. Preprocessing**

In [None]:
def preprocess_text(text):
    """Lowercase ``text`` and strip URLs, @mentions, '#' signs and non-letters.

    Args:
        text: Raw text. Any non-string value (for example ``None`` or a
            pandas NaN float from a missing cell) is treated as missing.

    Returns:
        str: The cleaned text, containing only the letters ``a-z`` and
        whitespace. An empty string is returned for non-string input.
    """
    # Missing cells reach Series.apply as None/NaN, which have no .lower();
    # treat them as an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove URLs first, while ':' and '/' are still present to delimit them.
    # `http\S+` already matches https links, so no separate alternative is
    # needed, and no flag is required because the pattern has no anchors.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Drop whole @mentions, but only the '#' of a hashtag so its word is kept.
    text = re.sub(r'@\w+|#', '', text)
    # Delete everything except lowercase ASCII letters and whitespace.
    # Characters are deleted rather than replaced by a space, so a hyphenated
    # pair such as "israel-hamas" is joined into "israelhamas".
    text = re.sub(r'[^a-z\s]', '', text)

    return text

# Clean every document in place: the raw 'English Text' column is overwritten
# with its cleaned version.
# NOTE(review): the saved output under this cell still shows un-cleaned text
# (capitals, punctuation), so it appears to predate this code - re-run the
# cell to refresh it.
df['English Text'] = df['English Text'].map(preprocess_text)

print(df['English Text'])

0         Yemen's Houthis have waded into the Israel-Ham...
1                    Isreal - Hamas Conflict | Face to Face
2         Videos show how armed men from Gaza stormed a ...
3         Protest in Aligarh Muslim University in suppor...
4         IDF releases audio recording about misfired ro...
                                ...                        
129795    Gaza War: Israel said Hamas is disintegrating,...
129796    The power of 10 kg 'SEMTEX'... Murder of Hamas...
129797    Once again Hamas showed strength in Gaza, atta...
129798    Hamas Israel War: More than 20 Israeli soldier...
129799    The entire area is full of smoke after the Isr...
Name: English Text, Length: 88500, dtype: object


**5. Normalizing Text**

In [38]:
def normalize_text(text):
    """Return ``text`` with newlines turned into spaces and letters lowercased."""
    return text.replace('\n', ' ').lower()

# Store the normalized form of each cleaned document in its own column.
df['Normalized Text'] = df['English Text'].map(normalize_text)

print(df['Normalized Text'])

0         yemens houthis have waded into the israelhamas...
1                      isreal  hamas conflict  face to face
2         videos show how armed men from gaza stormed a ...
3         protest in aligarh muslim university in suppor...
4         idf releases audio recording about misfired ro...
                                ...                        
129795    gaza war israel said hamas is disintegrating f...
129796    the power of  kg semtex murder of hamas number...
129797    once again hamas showed strength in gaza attac...
129798    hamas israel war more than  israeli soldiers k...
129799    the entire area is full of smoke after the isr...
Name: Normalized Text, Length: 88500, dtype: object


**6. Tokenizing Text (Regex Tokenizer)**

In [41]:
# Tokenizer that emits each maximal run of word characters as one token.
tokenizer = RegexpTokenizer(r'\w+')


def tokenize_text(text):
    """Split ``text`` into a list of word tokens using the shared tokenizer."""
    tokens = tokenizer.tokenize(text)
    return tokens

# One token list per normalized document.
df['Tokenized Text'] = df['Normalized Text'].map(tokenize_text)

print(df['Tokenized Text'])

0         [yemens, houthis, have, waded, into, the, isra...
1                 [isreal, hamas, conflict, face, to, face]
2         [videos, show, how, armed, men, from, gaza, st...
3         [protest, in, aligarh, muslim, university, in,...
4         [idf, releases, audio, recording, about, misfi...
                                ...                        
129795    [gaza, war, israel, said, hamas, is, disintegr...
129796    [the, power, of, kg, semtex, murder, of, hamas...
129797    [once, again, hamas, showed, strength, in, gaz...
129798    [hamas, israel, war, more, than, israeli, sold...
129799    [the, entire, area, is, full, of, smoke, after...
Name: Tokenized Text, Length: 88500, dtype: object


**7. Removing Stop Words**

In [42]:
# English stop words held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens of ``tokens`` that are not stop words, in order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Token lists with the stop words filtered out.
df['No Stopwords'] = df['Tokenized Text'].map(remove_stopwords)

print(df['No Stopwords'])

0         [yemens, houthis, waded, israelhamas, war, rag...
1                     [isreal, hamas, conflict, face, face]
2         [videos, show, armed, men, gaza, stormed, musi...
3         [protest, aligarh, muslim, university, support...
4         [idf, releases, audio, recording, misfired, ro...
                                ...                        
129795    [gaza, war, israel, said, hamas, disintegratin...
129796    [power, kg, semtex, murder, hamas, number, sup...
129797    [hamas, showed, strength, gaza, attacked, idf,...
129798    [hamas, israel, war, israeli, soldiers, killed...
129799    [entire, area, full, smoke, israeli, attack, w...
Name: No Stopwords, Length: 88500, dtype: object


**8. Applying Stemming**

In [47]:
# Shared Porter stemmer instance.
ps = PorterStemmer()


def stem_tokens(tokens):
    """Return a list with the Porter stem of every token in ``tokens``."""
    return list(map(ps.stem, tokens))

# Stemmed variant of the stop-word-free tokens.
df['Stemmed Text'] = df['No Stopwords'].map(stem_tokens)

print(df['Stemmed Text'])

0         [yemen, houthi, wade, israelhama, war, rage, m...
1                      [isreal, hama, conflict, face, face]
2         [video, show, arm, men, gaza, storm, music, fe...
3         [protest, aligarh, muslim, univers, support, h...
4         [idf, releas, audio, record, misfir, rocket, c...
                                ...                        
129795    [gaza, war, israel, said, hama, disintegr, fie...
129796    [power, kg, semtex, murder, hama, number, supe...
129797    [hama, show, strength, gaza, attack, idf, armo...
129798    [hama, israel, war, isra, soldier, kill, gaza,...
129799    [entir, area, full, smoke, isra, attack, warco...
Name: Stemmed Text, Length: 88500, dtype: object


**9. Applying Lemmatization**

In [49]:
# Shared WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return a list with the lemma of every token in ``tokens``.

    ``lemmatize`` is called without a part-of-speech argument, so its
    default is used for every token.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatized variant, built from the same stop-word-free tokens as the
# stemmed column (not from the stems).
df['Lemmatized Text'] = df['No Stopwords'].map(lemmatize_tokens)

print(df['Lemmatized Text'])

0         [yemen, houthis, waded, israelhamas, war, ragi...
1                     [isreal, hamas, conflict, face, face]
2         [video, show, armed, men, gaza, stormed, music...
3         [protest, aligarh, muslim, university, support...
4         [idf, release, audio, recording, misfired, roc...
                                ...                        
129795    [gaza, war, israel, said, hamas, disintegratin...
129796    [power, kg, semtex, murder, hamas, number, sup...
129797    [hamas, showed, strength, gaza, attacked, idf,...
129798    [hamas, israel, war, israeli, soldier, killed,...
129799    [entire, area, full, smoke, israeli, attack, w...
Name: Lemmatized Text, Length: 88500, dtype: object


In [50]:
# Side-by-side view of every intermediate column produced so far.
df.head()

Unnamed: 0,English Text,Bias,Normalized Text,Tokenized Text,No Stopwords,Stemmed Text,Lemmatized Text
0,yemens houthis have waded into the israelhamas...,Biased against others,yemens houthis have waded into the israelhamas...,"[yemens, houthis, have, waded, into, the, isra...","[yemens, houthis, waded, israelhamas, war, rag...","[yemen, houthi, wade, israelhama, war, rage, m...","[yemen, houthis, waded, israelhamas, war, ragi..."
1,isreal hamas conflict face to face,Unbiased,isreal hamas conflict face to face,"[isreal, hamas, conflict, face, to, face]","[isreal, hamas, conflict, face, face]","[isreal, hama, conflict, face, face]","[isreal, hamas, conflict, face, face]"
2,videos show how armed men from gaza stormed a ...,Biased against others,videos show how armed men from gaza stormed a ...,"[videos, show, how, armed, men, from, gaza, st...","[videos, show, armed, men, gaza, stormed, musi...","[video, show, arm, men, gaza, storm, music, fe...","[video, show, armed, men, gaza, stormed, music..."
3,protest in aligarh muslim university in suppor...,Unclear,protest in aligarh muslim university in suppor...,"[protest, in, aligarh, muslim, university, in,...","[protest, aligarh, muslim, university, support...","[protest, aligarh, muslim, univers, support, h...","[protest, aligarh, muslim, university, support..."
4,idf releases audio recording about misfired ro...,Biased against both Palestine and Israel,idf releases audio recording about misfired ro...,"[idf, releases, audio, recording, about, misfi...","[idf, releases, audio, recording, misfired, ro...","[idf, releas, audio, record, misfir, rocket, c...","[idf, release, audio, recording, misfired, roc..."


### ***Extracting Text Embeddings***

In [46]:
# Set up pandarallel so pandas objects gain parallel_apply; the worker count
# is fixed at 8 here - adjust it to the cores available on your machine.
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [None]:
def get_embedding(text):
    """Return the embedding vector of ``text`` from the Google embedding API.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable
    rather than being written into the notebook, so the notebook can be
    shared or committed without leaking credentials.

    Args:
        text: The content to embed.

    Returns:
        The value stored under the ``"embedding"`` key of the API response.

    Raises:
        RuntimeError: If ``GOOGLE_API_KEY`` is not set or is empty.
    """
    # Imports are kept inside the function, as in the original; presumably so
    # the function is self-contained when run in pandarallel worker processes.
    import os

    import google.generativeai as genai

    # SECURITY: a key was previously hardcoded here. Treat that key as
    # compromised: revoke it and issue a new one.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY is not set; export it before computing embeddings."
        )

    genai.configure(api_key=api_key)
    response = genai.embed_content(
        model="text-embedding-004",
        content=text
    )
    return response["embedding"]

In [None]:
# The same text occurs on many rows (several annotators label each post), so
# embed every distinct text once and map the result back onto all rows. This
# makes far fewer API calls than embedding row by row.
unique_texts = df["English Text"].drop_duplicates()
unique_embeddings = unique_texts.parallel_apply(get_embedding)
embedding_by_text = dict(zip(unique_texts, unique_embeddings))

# Rows that share a text also share one embedding list object; copy it before
# mutating an individual row's embedding in place.
df["Text Embedding"] = df["English Text"].map(embedding_by_text.get)

# Persist the result so the expensive embedding step need not be repeated.
df.to_pickle("dataset_with_embeddings.pkl")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11063), Label(value='0 / 11063')))…

In [None]:
# Reload the pickled frame written by the embedding cell. Only unpickle files
# you created yourself: loading a pickle can execute arbitrary code.
# NOTE(review): the saved output under this cell lists only three columns and
# un-cleaned text, so it appears to come from an earlier run of the notebook -
# re-run to refresh it.
df_embeddings = pd.read_pickle("dataset_with_embeddings.pkl")

df_embeddings.head()

Unnamed: 0,English Text,Bias,Text Embedding
0,Yemen's Houthis have waded into the Israel-Ham...,Biased against others,"[0.04340715, 0.033973224, -0.056390334, -0.075..."
1,Isreal - Hamas Conflict | Face to Face,Unbiased,"[-0.026853735, -0.0381826, -0.026799945, -0.04..."
2,Videos show how armed men from Gaza stormed a ...,Biased against others,"[0.032202143, 0.008875384, 0.007970843, -0.033..."
3,Protest in Aligarh Muslim University in suppor...,Unclear,"[-0.046016745, -0.03402762, -0.075399436, -0.0..."
4,IDF releases audio recording about misfired ro...,Biased against both Palestine and Israel,"[0.05440593, -0.021562053, -0.069958545, -0.07..."
