In [1]:
import os
import abc
import gensim

In [2]:
class Utils(metaclass=abc.ABCMeta):
    def __init__(self) -> None:
        ()

    def create_folder(self, folder_path: str) -> None:
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

In [3]:
class BagOfWordsClassifier():
    def __init__(self):
        self.counts = {}
        self.counts_PCL = {}
        self.counts_not_PCL = {}
        self.PCL_word_count = 0
        self.no_PCL_word_count = 0
        self.PCL_document_count = 0
        self.no_PCL_document_count = 0

    def clean_text_tokenize(self, text):
        stop_words = ['this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
                      'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'don', "don't", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'shan', "shan't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

        text.lower()
        # removing " at start of sentences
        text = text.strip("\"")
        # replacing repetitions of punctations
        text = re.sub(r'\"+', '', text)

        # Tokenize links
        text = re.sub(r'https? : \S+', '[WEBSITE]', text)
        # removing referencing on usernames with @
        text = re.sub(r'@\S+', '', text)
        # removing smileys with : (like :),:D,:( etc)
        text = re.sub(r':\S+', '', text)
        # Remove punctation
        text = re.sub(r"[!.,;:?\'\"\´]", "", text)
        text = re.sub('(?<![\w])20[0-5][0-9]-?[0-9]*',
                    '[YEAR]', text)              # Year token
        text = re.sub('(?<![\w])1[0-9]{3}-?[0-9]*',
                    '[YEAR]', text)                 # Year token
        # replacing numbers with [NUM] tag  eg 1,000, 1.32, 5-7. Assert these numbers are not inside words (i.e. H1, )
        text = re.sub('(?<![\w])[0-9]+[.,]?[0-9]*(?![\w])', '[NUM]', text)
        text = re.sub('\[NUM\]-\[NUM\]', '[NUM]', text)
        # Again to delete account numbers lol 12-5223-231
        text = re.sub('\[NUM\]-\[NUM\]', '[NUM]', text)
        text = re.sub('(?<=\[NUM\])-(?=[a-zA-Z])', ' ', text)
        text = re.sub('[ ]*', ' ', text)
        text = re.sub('<h>', '.', text)

        porter = PorterStemmer()
        words = text.split()
        for i, word in enumerate(words):
            if word in stop_words:
                words.pop(i)
            else:
                words[i] = porter.stem(word)
        return words

    def train(self, train_DF):
        for i, row in train_DF.iterrows():
            text = row["text"]
            label = row["binary_label"]

            if label == 0:
                self.no_PCL_document_count += 1
            else:
                self.PCL_document_count += 1

            words = self.clean_text_tokenize(text)
            for word in words:
                self.counts[word] = 1 + \
                    (self.counts[word] if word in self.counts.keys() else 0)

                if label == 0:
                    self.no_PCL_word_count += 1
                    self.counts_not_PCL[word] = 1 + \
                        (self.counts_not_PCL[word]
                         if word in self.counts_not_PCL.keys() else 0)
                else:
                    self.PCL_word_count += 1
                    self.counts_PCL[word] = 1 + \
                        (self.counts_PCL[word]
                         if word in self.counts_PCL.keys() else 0)
                    
    def predict(self, sentences):

        prior = self.PCL_document_count / \
            (self.PCL_document_count + self.no_PCL_document_count)
        epsilon = 1  # epsilon smoothing
        if type(sentences) is str:
            sentences = [sentences]
        if type(sentences) is pd.DataFrame:
            i, sentences = sentences.iterrows()

        predictions = []
        for sentence in sentences:

            likelihood = 1
            for word in sentence:
                class_count = self.counts_PCL[word] if word in self.counts_PCL.keys(
                ) else 0
                likelihood *= (class_count+epsilon) / \
                    (len(self.counts) + self.PCL_word_count)

            prob_PCL = prior*likelihood

            likelihood = 1
            for word in sentence:
                class_count = self.counts_not_PCL[word] if word in self.counts_not_PCL.keys(
                ) else 0
                likelihood *= (class_count+epsilon) / \
                    (len(self.counts) + self.no_PCL_word_count)

            prob_not_PCL = (1-prior)*likelihood

            predictions.append(1 if prob_PCL > prob_not_PCL else 0)

        return predictions

In [4]:
import pandas as pd
import numpy as np
import os

df = pd.read_csv('./data/dontpatronizeme_pcl.csv')

df = df.dropna()

train_id = pd.read_csv("./data/train_semeval_parids-labels.csv")
dev_id = pd.read_csv("./data/dev_semeval_parids-labels.csv")


In [5]:
data_pcl_train = df[df["par_id"].isin(train_id["par_id"].tolist())]
data_pcl_train

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
0,1,24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0.0,0.0
1,2,21968160,migrant,gh,"In Libya today , there are countless number of...",0.0,0.0
2,3,16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0.0,0.0
3,4,7811231,disabled,nz,Council customers only signs would be displaye...,0.0,0.0
4,5,1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0.0,0.0
...,...,...,...,...,...,...,...
10407,10408,4542224,homeless,my,""""""" Most of them ( the homeless ) have the abi...",3.0,1.0
10423,10424,4665292,women,jm,""""""" I do n't believe in abortion , I think it ...",3.0,1.0
10444,10445,3923193,refugee,gb,More than 150 volunteers spent the night in ' ...,3.0,1.0
10453,10454,22338535,vulnerable,ie,""""""" We are challenged , I suggest , to turn th...",4.0,1.0


In [6]:
path = "./data/GoogleNews-vectors-negative300.bin.gz"
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

In [7]:
def word_emedding_average(text):
    words = text.split()
    embeddings = []
    for word in words:
        if word in model.key_to_index:
            vector = model[word]
            embeddings.append(vector)
    return np.mean(np.array(embeddings), axis=0)

In [None]:
#建了一个新的列 length_text，其内容是 data_pcl_train 中每个 text 列中文本的长度
data_pcl_train["length_text"] = data_pcl_train["text"].apply(lambda x: len(x))

# 创建了一个新的列 word_embedding，通过应用 word_emedding_average 函数于 text 列的每个元素上。
# 这个函数的作用是将每个文本转换为一个词嵌入向量（word embedding），通常是通过计算文本中所有词向量的平均值来获得
data_pcl_train["word_embedding"] = data_pcl_train["text"].apply(lambda x: word_emedding_average(x))

# # 将 word_embedding 列中的每个嵌入向量展开成 300 个独立的列（假设每个词嵌入向量的维度为 300
data_pcl_train[[i for i in range(300)]] = data_pcl_train['word_embedding'].apply(lambda x: pd.Series(x))

# 移除了 data_pcl_train 中不再需要的列，包括原始的 word_embedding 列（因为其内容已经被展开到新的列中了），以及 par_id、art_id、text 和 label 这几列
data_pcl_train.drop(['word_embedding','par_id','art_id','text','label'], axis=1, inplace=True)

# 将 country_code 和 keyword 列中的文本类别转换为数值代码
data_pcl_train["country_code"] = pd.Categorical(data_pcl_train["country_code"], categories=data_pcl_train["country_code"].unique()).codes
data_pcl_train["keyword"] = pd.Categorical(data_pcl_train["keyword"], categories=data_pcl_train["keyword"].unique()).codes

data_pcl_train

In [None]:
from sklearn.linear_model import LogisticRegression

Y_train = data_pcl_train["binary_label"].astype("float").to_numpy()
X_train = data_pcl_train.drop(['binary_label'], axis=1, inplace=False).astype("float").to_numpy()

logistic_regression = LogisticRegression(random_state=0).fit(X_train, Y_train)

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# 分离特征和标签
Y_train = data_pcl_train["binary_label"].astype("float").to_numpy()
X_train = data_pcl_train.drop(['binary_label'], axis=1, inplace=False).astype("float").to_numpy()

svm_pipeline = make_pipeline(StandardScaler(), SVC(random_state=42))
svm_pipeline.fit(X_train, Y_train)



In [11]:
def word_embedding_average_modified(text):
    if isinstance(text, str):
        # 如果 text 是字符串，执行原来的逻辑
        return word_emedding_average(text)
        print(aaaa)
    else:
        # 如果 text 不是字符串（比如 NaN），返回一个全零的向量
        return [0] * 300  # 假设词嵌入向量的维度为 300

In [12]:
data_pcl_dev = df[df["par_id"].isin(dev_id["par_id"].tolist())]

In [13]:
data_pcl_dev

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
106,107,16900972,homeless,ke,"""His present """" chambers """" may be quite humbl...",3.0,1.0
148,149,1387882,disabled,us,Krueger recently harnessed that creativity to ...,2.0,1.0
150,151,19974860,poor-families,in,10:41am - Parents of children who died must ge...,3.0,1.0
153,154,20663936,disabled,ng,When some people feel causing problem for some...,4.0,1.0
156,157,21712008,poor-families,ca,We are alarmed to learn of your recently circu...,4.0,1.0
...,...,...,...,...,...,...,...
10462,10463,4676355,refugee,pk,""""""" The Pakistani police came to our house and...",0.0,0.0
10463,10464,19612634,disabled,ie,"""When Marie O'Donoghue went looking for a spec...",0.0,0.0
10464,10465,14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",1.0,0.0
10465,10466,70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0.0,0.0


In [None]:
# 为 data_pcl_dev 中的每个文本条目计算字符长度，并将这些长度值存储在新的列 length_text 中
# data_pcl_dev["length_text"] = data_pcl_dev["text"].apply(lambda x: len(x))
data_pcl_dev["length_text"] = data_pcl_dev["text"].apply(lambda x: len(x) if isinstance(x, str) else 0)

# ，对每个文本条目应用 word_emedding_average 函数，生成一个词嵌入向量，并将这些向量存储在新的列 word_embedding 中
data_pcl_dev["word_embedding"] = data_pcl_dev["text"].apply(lambda x: word_embedding_average_modified(x))

# 将 word_embedding 列中的向量展开到 300 个独立的列中（假设每个词嵌入向量的维度为 300）
data_pcl_dev[[i for i in range(300)]] = data_pcl_dev['word_embedding'].apply(lambda x: pd.Series(x))

#  删除了 word_embedding（因为它的内容已经被展开到新的列中了）、par_id、art_id、text 和 label 这些列
data_pcl_dev.drop(['word_embedding','par_id','art_id','text','label'], axis=1, inplace=True)
data_pcl_dev["country_code"] = pd.Categorical(data_pcl_dev["country_code"], categories=data_pcl_dev["country_code"].unique()).codes
data_pcl_dev["keyword"] = pd.Categorical(data_pcl_dev["keyword"], categories=data_pcl_dev["keyword"].unique()).codes

In [15]:
data_pcl_dev

Unnamed: 0,keyword,country_code,binary_label,length_text,0,1,2,3,4,5,...,290,291,292,293,294,295,296,297,298,299
106,0,0,1.0,400,0.040616,0.053745,0.023178,0.041561,-0.088367,-0.019397,...,-0.099189,0.011189,-0.091196,0.033318,-0.042895,0.023637,0.004139,-0.033890,0.042187,-0.028945
148,1,1,1.0,296,0.029581,0.035252,0.010618,0.084094,-0.045408,0.003664,...,-0.077357,-0.016581,-0.119857,-0.035556,-0.050116,0.014409,0.041216,-0.043842,0.042561,-0.068234
150,2,2,1.0,138,0.031295,0.028444,0.036345,0.156867,-0.032007,-0.030412,...,-0.099141,0.000726,-0.069391,0.007742,0.045618,0.049137,-0.117510,-0.025179,0.081567,0.045866
153,1,3,1.0,496,0.063602,0.054059,0.019293,0.100153,-0.101033,0.012160,...,-0.052560,0.050858,-0.094168,-0.001755,-0.024271,0.010481,-0.011790,-0.051126,0.047470,-0.020584
156,2,4,1.0,601,0.030542,0.035941,-0.033920,0.073037,-0.067282,-0.003469,...,-0.042028,-0.015475,-0.095195,0.016387,-0.021685,-0.000462,-0.002560,-0.008348,0.067242,-0.031580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10462,5,13,0.0,172,0.025852,0.102847,0.078958,0.054771,-0.060840,-0.038958,...,-0.067443,0.030520,-0.068782,0.016903,-0.085384,0.060193,0.024922,-0.011949,0.086685,-0.015989
10463,1,7,0.0,203,0.074717,0.018960,0.059744,0.018926,-0.030449,-0.047268,...,-0.085388,-0.052651,-0.112657,0.062949,-0.039978,0.004479,0.005023,-0.034573,0.063049,0.005879
10464,4,6,0.0,348,0.021482,0.087925,0.024637,0.040627,-0.003054,0.010264,...,-0.093884,0.024587,-0.097566,-0.041327,-0.041133,0.005601,0.064697,-0.023188,0.043588,-0.010489
10465,9,10,0.0,258,-0.007844,0.078821,0.024941,0.076679,-0.073365,-0.037882,...,-0.113297,0.021043,-0.074065,0.000114,-0.050973,0.071761,0.008267,-0.050337,0.029896,-0.039081


In [16]:
from sklearn.metrics import f1_score

y_pred = svm_pipeline.predict(X_train)
print(f"F1-score on train:{f1_score(Y_train, y_pred, average='macro')}")


F1-score on train:0.6873309626619253


In [17]:
Y_test = data_pcl_dev["binary_label"].astype("float").to_numpy()
X_test = data_pcl_dev.drop(['binary_label'], axis=1, inplace=False).astype("float").to_numpy()

y_pred = svm_pipeline.predict(X_test)
print(f"F1-score on dev:{f1_score(Y_test, y_pred, average='macro')}")

F1-score on dev:0.5411372329508349


In [18]:
import pandas as pd
import numpy as np
import os
from nltk.stem import PorterStemmer
import re 

df = pd.read_csv("./data/dontpatronizeme_pcl.csv")
df = df.dropna()

train_id = pd.read_csv("./data/train_semeval_parids-labels.csv")
dev_id = pd.read_csv("./data/dev_semeval_parids-labels.csv")

data_pcl_train = df[df["par_id"].isin(train_id["par_id"].tolist())]
data_pcl_dev = df[df["par_id"].isin(dev_id["par_id"].tolist())]


In [19]:
print(df.shape)
print(train_id.shape)
print(dev_id.shape)

(10467, 7)
(8375, 2)
(2094, 2)


In [20]:
data_pcl_dev

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
106,107,16900972,homeless,ke,"""His present """" chambers """" may be quite humbl...",3.0,1.0
148,149,1387882,disabled,us,Krueger recently harnessed that creativity to ...,2.0,1.0
150,151,19974860,poor-families,in,10:41am - Parents of children who died must ge...,3.0,1.0
153,154,20663936,disabled,ng,When some people feel causing problem for some...,4.0,1.0
156,157,21712008,poor-families,ca,We are alarmed to learn of your recently circu...,4.0,1.0
...,...,...,...,...,...,...,...
10462,10463,4676355,refugee,pk,""""""" The Pakistani police came to our house and...",0.0,0.0
10463,10464,19612634,disabled,ie,"""When Marie O'Donoghue went looking for a spec...",0.0,0.0
10464,10465,14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",1.0,0.0
10465,10466,70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0.0,0.0


In [21]:
classifier_bow = BagOfWordsClassifier()
classifier_bow.train(data_pcl_train)

In [30]:
import pandas as pd
from sklearn.metrics import f1_score

y_pred = classifier_bow.predict(data_pcl_train['text'])
print(f"F1-score on train: {f1_score(data_pcl_train['binary_label'], y_pred, average='macro')}")

data_pcl_train['predicted_label'] = y_pred
misclassified = data_pcl_train[data_pcl_train['binary_label'] != data_pcl_train['predicted_label']]

# 显示分类错误的数据
print("Misclassified samples:")
print(misclassified)


F1-score on train: 0.4864682403042344
Misclassified samples:
       par_id    art_id     keyword country_code  \
8           9   3449225    homeless           ph   
12         13  13499386       women           pk   
16         17   8886184    hopeless           sg   
22         23  25154641    homeless           ca   
25         26   3659219       women           za   
...       ...       ...         ...          ...   
10406   10407   1811952     in-need           tz   
10407   10408   4542224    homeless           my   
10423   10424   4665292       women           jm   
10453   10454  22338535  vulnerable           ie   
10466   10467  20282330     in-need           ng   

                                                    text  label  binary_label  \
8      NUEVA ERA , Ilocos Norte - No family shall be ...    1.0           0.0   
12     """ Ghostbusters "" is a resurrection of the 1...    0.0           0.0   
16     For those few seconds , humanity is free of it...    0.0        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pcl_train['predicted_label'] = y_pred


In [31]:
data_pcl_train

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label,predicted_label
0,1,24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0.0,0.0,0
1,2,21968160,migrant,gh,"In Libya today , there are countless number of...",0.0,0.0,0
2,3,16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0.0,0.0,0
3,4,7811231,disabled,nz,Council customers only signs would be displaye...,0.0,0.0,0
4,5,1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0.0,0.0,0
...,...,...,...,...,...,...,...,...
10407,10408,4542224,homeless,my,""""""" Most of them ( the homeless ) have the abi...",3.0,1.0,0
10423,10424,4665292,women,jm,""""""" I do n't believe in abortion , I think it ...",3.0,1.0,0
10444,10445,3923193,refugee,gb,More than 150 volunteers spent the night in ' ...,3.0,1.0,1
10453,10454,22338535,vulnerable,ie,""""""" We are challenged , I suggest , to turn th...",4.0,1.0,0


In [27]:
y_pred = classifier_bow.predict(data_pcl_dev['text'])
print(len(y_pred))
print(f"F1-score on dev:{f1_score(Y_test, y_pred, average='macro')}")

2093
F1-score on dev:0.4710545826540191
