In [10]:
import re
import pandas as pd

import spacy
from sklearn.model_selection import train_test_split

# Load the cleaned dataset. "IsHate" is the label column; every other column
# is kept as a feature.
data = pd.read_csv("../data/data_clean_one_col.csv")

y = data["IsHate"]
X = data.drop(columns="IsHate")

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Load the spaCy English pipeline once at module level so that every call to
# the text-cleaning function reuses the same loaded model.
nlp = spacy.load("en_core_web_sm")

def preprocessing(text: str, pipeline=None) -> str:
    """Normalise a raw document and return its lemmatised content words.

    Steps, in order:
      1. lowercase the text;
      2. replace every run of non-ASCII characters with a single space;
      3. delete ASCII digits;
      4. collapse every whitespace run (spaces, tabs, newlines) to one space
         and trim the ends;
      5. run the spaCy pipeline and keep the lemma of each token that is not
         a stop word, punctuation, or whitespace.

    Whitespace is collapsed *after* digits and non-ASCII characters are
    removed, because those removals can themselves create double or leading
    spaces, which spaCy would otherwise emit as whitespace tokens.

    Args:
        text: Raw document. Any non-string value (e.g. ``None`` or the NaN
            pandas uses for empty CSV cells) is treated as an empty document.
        pipeline: Optional callable that maps a string to an iterable of
            spaCy-like tokens (``lemma_``, ``is_stop``, ``is_punct``,
            ``is_space``). Defaults to the module-level ``nlp`` pipeline.

    Returns:
        The kept lemmas joined by single spaces; ``""`` when nothing is left.
    """
    # Missing values carry no text; return early instead of failing on .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # non-ASCII run -> one space
    text = re.sub(r"[0-9]", "", text)           # drop digits
    # Must come last: the two substitutions above can leave repeated or
    # leading/trailing whitespace behind.
    text = re.sub(r"\s+", " ", text).strip()

    if pipeline is None:
        pipeline = nlp  # module-level spaCy model, resolved at call time

    doc = pipeline(text)
    clean_tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(clean_tokens)

In [11]:
# Build a new frame with the cleaned "Text" column and rebind X_train to it,
# rather than assigning a column in place on the frame returned by
# train_test_split (in-place assignment on such a frame can raise pandas'
# SettingWithCopyWarning). Row order and index are unchanged, so X_train
# stays aligned with y_train.
X_train = X_train.assign(Text=X_train["Text"].apply(preprocessing))
X_train.head(10)

Unnamed: 0,Text
29,blood street worried word get tired way mainst...
535,care cop somebody tell dumb ass girl unarmed w...
695,dumb fuck screw fellow citizen instead protest...
557,non lethal weapon control crowd lol love rogan...
836,yea beat white people rob black own store caus...
596,store owner pay cigar fn innocent ass kid stole
165,white child deserve future ten million non w...
918,fuck mo kansas city
495,michael brown home angel street devil altar bo...
824,tell taboo talk news medium will touch foot po...


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the cleaned training text, with the vocabulary capped
# at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training split only, then convert the sparse result to a dense
# NumPy array.
train_tfidf_sparse = vectorizer.fit_transform(X_train["Text"])
X_train_vect = train_tfidf_sparse.toarray()
X_train_vect

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])