In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')


In [None]:
df_train=pd.read_csv('labels_racism.csv', sep="|")

In [None]:
def cleaned_df(df, field):
 df[field] = df[field].str.lower()
 df[field] = df[field].str.strip("\t")
 df[field] = df[field].str.replace("r[^A-Za-z()]", " ")
 df[field] = df[field].str.replace(r"[\'-`\"\n:;*!+=?¿_]", " ")
 df[field] = df[field].str.replace(r"\(.*\)","")
 df[field] = df[field].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
 df[field] = df[field].str.replace('https?:\/\/(www\.)?', '')
 df[field] = df[field].str.replace('<.*?>+', '')
 df[field] = df[field].str.replace('\n', '')
 df[field] = df[field].str.replace('\w*\d\w*', '')
 df[field] = df[field].str.replace(r'\W', ' ')

 return df

def remove_short_words(df, short_len=3):
    def get_corpus(text_column):
        words = []
        for i in text_column:
            for j in i.split():
                words.append(j.strip())
        return words
    
    df = df.copy()
    corpus = get_corpus(df['message'])
    counter = Counter(corpus)
    most_common = counter.most_common(5)
    most_common = dict(most_common)
    counter.most_common()[::-1]
    k=len(counter.most_common())
    l=[]
    for i in range(k):
  
        if len(counter.most_common()[i][0]) < short_len:
            l.append(counter.most_common()[i][0])
    
    df["message"] = df["message"].apply(lambda row: ' '.join([word for word in row.split() if word not in l]))
    return df

In [None]:
df_test=pd.read_csv('evaluation_sample.csv', sep="|")d
df_test= prepare_labels(df_test, is_train=False)
df_train= cleaned_df(df_train, "message")

In [None]:
df_train= remove_short_words(df_train, short_len=3)

Unnamed: 0,message,label
0,diariolibreusa nia aos aboga inmigrantes unido...,0
1,finales obra arte desmaterializa deci alejamos...,0
2,alerta machista sito vive lacra sabe cuanto in...,1
3,ahora vegano llevamos tres semanas aqu maana h...,0
4,claro queremos liga tene relaciones placentera...,0
...,...,...
5667,virus racista vuelven negros luego mueren,1
5668,hollywood exigi negros latinos lgtbi cuota obl...,1
5669,podemos exige espaa pida perdn negros mundo,0
5670,pruebas anticuerpos pueden falsos negativos ha...,0


In [None]:
stops=set(stopwords.words('spanish'))
df_train["message"] = df_train["message"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stops)]))
print(df_train["message"])

0                                                                                                                          diariolibreusa nia aos aboga po inmigrantes unidos diariolibre inmigrantes eleccioneseeuu
1                                                       finales obra arte desmaterializa deci alejamos objeto fsico artstico obras concentran pblico entorno reacciones tomaszarza precursores performance espaa zaj
2       alerta machista pa sito vive lacra sabe cuanto ms inmigracin ilegal delincuentes cuya religin defiende machismo mejo po condena ningn crimen inmigrantes consta nacionalidades falsofeminismo laestenpeligro
3                                             ahora vegano llevamos tres semanas aqu maana haba conocido verdadero horro trabajo munirhachemi trabajo precario industria crnica culpas animal situacin all luga jefe
4                                                       claro queremos liga tene relaciones placenteras variadas queremos costa incomodidad siempre 

In [None]:
sample=pd.read_csv('evaluation_sample.csv', sep="|")
x_sample= sample["message"]
y_sample= sample["label"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train["message"], df_train["label"], test_size=0.20, random_state=15)

In [None]:
classifier = MultinomialNB(alpha=2)
vectorizer=CountVectorizer(ngram_range=(1, 3))

model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

model.fit(X_train, y_train)
predsMultinomialNB = model.predict(X_test)

In [None]:
MultinomialNB_score=accuracy_score(predsMultinomialNB, y_test)
MultinomialNB_presision =precision_score(predsMultinomialNB, y_test, average="weighted")
MultinomialNB_f1= f1_score(predsMultinomialNB, y_test, average='weighted')
print(f"Accuracy:{MultinomialNB_score}")
print(f"Presisio: {MultinomialNB_presision}")
print(f"f1_score: {MultinomialNB_f1}")

Accuracy:0.762114537444934
Presisio: 0.8058765735167187
f1_score: 0.7625659091791477


In [None]:
classifier = MultinomialNB(alpha=2)
vectorizer=TfidfVectorizer(ngram_range=(1, 3))

model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

model.fit(X_train, y_train)
predsMultinomialNBVec = model.predict(X_test)

In [None]:
MultinomialNB_score=accuracy_score(predsMultinomialNBVec, y_test)
MultinomialNB_presision =precision_score(predsMultinomialNBVec, y_test, average="weighted")
MultinomialNB_f1= f1_score(predsMultinomialNBVec, y_test, average='weighted')
print(f"Accuracy:{MultinomialNB_score}")
print(f"Presisio: {MultinomialNB_presision}")
print(f"f1_score: {MultinomialNB_f1}")

Accuracy:0.7779735682819383
Presisio: 0.7793693602806829
f1_score: 0.7785156222285364


In [None]:
classifier = MultinomialNB(alpha=2)
vectorizer=TfidfVectorizer(ngram_range=(1, 3))

model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

model.fit(X_train, y_train)
predsMultinomialNBVecSample = model.predict(x_sample)

In [None]:
MultinomialNB_scoreSample=accuracy_score(predsMultinomialNBVecSample, y_sample)
MultinomialNB_presisionSample =precision_score(predsMultinomialNBVecSample, y_sample, average="weighted")
MultinomialNBSample_f1= f1_score(predsMultinomialNBVecSample, y_sample, average='weighted')
print(f"AccuracySampple:{MultinomialNB_scoreSample}")
print(f"PresisioSampple: {MultinomialNB_presisionSample}")
print(f"f1_scoreSampple: {MultinomialNBSample_f1}")

AccuracySampple:0.8135593220338984
PresisioSampple: 0.8203860640301318
f1_scoreSampple: 0.8148688263567507


In [None]:
df = pd.DataFrame()

df['predsMultinomialNBVec'] = predsMultinomialNBVec
df['predsMultinomialNB']= predsMultinomialNB
df['predsLogisticVec']= predsLogisticVec
df['real']= df_test["label"]

In [None]:
df

Unnamed: 0,predsMultinomialNBVec,predsMultinomialNB,predsLogisticVec,real
0,0,0,0,0
1,1,1,1,1
2,0,0,0,0
3,1,1,1,0
4,1,1,1,0
5,1,1,1,1
6,0,0,0,0
7,0,1,1,0
8,1,1,1,1
9,1,1,1,1
