In [None]:
import os

# When the kernel's working directory mentions "notebooks", move up one level —
# presumably so the relative `data/...` paths and the `src` package import used
# by this notebook resolve from the project root.
# NOTE(review): this is a substring test on the full cwd path, so any path that
# merely contains "notebooks" also triggers the `%cd ..` — confirm that is intended.
if "notebooks" in os.getcwd():
    %cd ..
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

from src.data import preprocessing

In [None]:
# Absolute, forward-slash path (as a plain string) to the `data/edos_raw`
# directory, resolved against the current working directory.
data_root = (Path("data") / "edos_raw").resolve().as_posix()

In [None]:
# Read the training file and the three dev-entry files from `data_root`.
# The generator yields the frames in the same order as the target names.
train_df, dev_task_a, dev_task_b, dev_task_c = (
    pd.read_csv(os.path.join(data_root, csv_name))
    for csv_name in (
        "train_all_tasks.csv",
        "dev_task_a_entries.csv",
        "dev_task_b_entries.csv",
        "dev_task_c_entries.csv",
    )
)

In [None]:
# Peek at the first rows of the training frame to check that it loaded as expected.
train_df.head()

In [None]:
# Text preprocessor from the project's `src.data.preprocessing` module,
# constructed with its default arguments.
preprocessor = preprocessing.TextPreprocessor()

In [None]:
# Small sample: the first ten raw texts of the training set.
test = train_df["text"].head(10)

In [None]:
# The raw `text` column as a NumPy array.
ar = train_df["text"].to_numpy()

In [None]:
# Print a handful of raw texts, one per line, as a quick sanity check.
# The loop item is what gets printed: printing `ar` itself would dump the whole
# array once per row. The slice keeps the cell output short enough to read.
for raw_text in ar[:10]:
    print(raw_text)

In [None]:
# Run the preprocessor on a single raw text (the first row) and display the result.
preprocessor.transform(ar[0])

In [None]:
%%time
# Preprocess the whole raw `text` column in one call; `%%time` reports how long
# the cell takes and has to remain the first line of the cell.
res = preprocessor.transform_series(train_df["text"])

In [None]:
# Display the output of `transform_series` for inspection.
res

In [None]:
# Store the preprocessed texts on the frame as the `processed_text` column.
# NOTE(review): assumes `res` lines up row-for-row with `train_df` — confirm
# what `transform_series` returns.
train_df["processed_text"] = res

In [None]:
# Per-column count of missing values.
train_df.isnull().sum()

In [None]:
# Concatenate every processed document into one space-separated string.
text = " ".join(train_df["processed_text"].tolist())

In [None]:
# Configure the cloud (white background, collocations switched off), then
# build it from the concatenated corpus string.
cloud_builder = WordCloud(background_color="white", collocations=False)
word_cloud = cloud_builder.generate(text)

In [None]:
# Render the word cloud as an image on its own axes, with the axis frame and
# ticks hidden.
wc_fig, wc_ax = plt.subplots()
wc_ax.imshow(word_cloud, interpolation="bilinear")
wc_ax.axis("off")
plt.show()

In [None]:
# Count plot of the `label_sexist` column, one bar per distinct value.
sns.histplot(data=train_df, x="label_sexist", discrete=True)

In [None]:
def _encode_sexist_labels(labels):
    """Encode the sexism labels as integers: "not sexist" -> 0, "sexist" -> 1.

    Values that are already 0 or 1 pass through unchanged, so running this
    cell more than once does not wipe the column out (a plain dict ``map``
    turns already-encoded values into NaN on the second run).

    Args:
        labels: pandas Series holding the label strings (or 0/1 values).

    Returns:
        A new integer Series with the same index as ``labels``.

    Raises:
        ValueError: if any value is neither a known label string nor 0/1
            (this includes missing values).
    """
    label_to_id = {"not sexist": 0, "sexist": 1}
    # Unknown values are returned as-is so they can be reported below.
    encoded = labels.map(lambda value: label_to_id.get(value, value))
    unexpected = encoded[~encoded.isin([0, 1])]
    if not unexpected.empty:
        raise ValueError(
            f"Unexpected label_sexist values: {unexpected.unique().tolist()}"
        )
    # Assign the cast result; calling astype without keeping it has no effect.
    return encoded.astype(int)


train_df["label_sexist"] = _encode_sexist_labels(train_df["label_sexist"])
train_df

In [None]:
# Show the training frame again to inspect its current contents.
train_df

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
texts_train, texts_test, y_train, y_test = train_test_split(
    train_df["processed_text"],
    train_df["label_sexist"],
    test_size=0.3,
    random_state=42,
)

# Fit TF-IDF on the training texts only, then apply the fitted vectorizer to
# the held-out texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [None]:
# Train a multinomial naive Bayes classifier on the TF-IDF features (the
# variable keeps its `gnb_` name, but the model is MultinomialNB), predict the
# held-out split and print the per-class metrics.
gnb_clf = MultinomialNB().fit(X_train, y_train)
y_pred = gnb_clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)