In [1]:
import os
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [2]:
'''
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
'''

"\nnltk.download('punkt')\nnltk.download('punkt_tab')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [3]:
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', 'data')
file_path = os.path.join(data_dir, 'youtoxic_english_1000.csv')

In [4]:
df = pd.read_csv(file_path)

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
def preprocess_text(text):
    text = text.lower()

    text = re.sub(f"[{string.punctuation}]", "", text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

df['Text'] = df['Text'].apply(preprocess_text)

print(df.head())

              CommentId      VideoId  \
0  Ugg2KwwX0V8-aXgCoAEC  04kJtp6pVXI   
1  Ugg2s5AzSPioEXgCoAEC  04kJtp6pVXI   
2  Ugg3dWTOxryFfHgCoAEC  04kJtp6pVXI   
3  Ugg7Gd006w1MPngCoAEC  04kJtp6pVXI   
4  Ugg8FfTbbNF8IngCoAEC  04kJtp6pVXI   

                                                Text  IsToxic  IsAbusive  \
0  people would take step back make case wasnt an...    False      False   
1  law enforcement trained shoot apprehend traine...     True       True   
2  dont reckon black life matter banner held whit...     True       True   
3  large number people like police officer called...    False      False   
4  arab dude absolutely right shot 6 extra time s...    False      False   

   IsThreat  IsProvocative  IsObscene  IsHatespeech  IsRacist  IsNationalist  \
0     False          False      False         False     False          False   
1     False          False      False         False     False          False   
2     False          False       True         False     False 

In [7]:

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
hate_speech_columns = [col for col in df.columns if col.startswith('Is')]
df['IsHateSpeech'] = df[hate_speech_columns].any(axis=1).astype(int)
df.drop(columns=hate_speech_columns, inplace=True)
print(df)

                CommentId      VideoId  \
0    Ugg2KwwX0V8-aXgCoAEC  04kJtp6pVXI   
1    Ugg2s5AzSPioEXgCoAEC  04kJtp6pVXI   
2    Ugg3dWTOxryFfHgCoAEC  04kJtp6pVXI   
3    Ugg7Gd006w1MPngCoAEC  04kJtp6pVXI   
4    Ugg8FfTbbNF8IngCoAEC  04kJtp6pVXI   
..                    ...          ...   
995  Ugi5ADt10EdDz3gCoAEC  XRuCW80L9mA   
996  Ugifh2DMhBbDkHgCoAEC  XRuCW80L9mA   
997  Ugj_plbGBjjzYXgCoAEC  XRuCW80L9mA   
998  Ugj0bah1De8xy3gCoAEC  XRuCW80L9mA   
999  UgjBJKQSoQMQ6ngCoAEC  XRuCW80L9mA   

                                                  Text  IsHateSpeech  
0    people would take step back make case wasnt an...             0  
1    law enforcement trained shoot apprehend traine...             1  
2    dont reckon black life matter banner held whit...             1  
3    large number people like police officer called...             0  
4    arab dude absolutely right shot 6 extra time s...             0  
..                                                 ...           ... 

In [10]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['Text']) 

In [13]:
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())  # Convert the sparse matrix to DataFrame
X_df['IsHateSpeech'] = df['IsHateSpeech']

In [14]:
 X_df.to_csv(os.path.join(data_dir, 'preprocessed_data.csv'), index=False)

In [15]:
vectorizer_filename = os.path.join(data_dir, 'tfidf_vectorizer.joblib')

# Save the vectorizer to the specified directory
joblib.dump(vectorizer, vectorizer_filename)

['c:\\Users\\iryna\\Desktop\\NLP_Youtube_9\\notebooks\\..\\data\\tfidf_vectorizer.joblib']