In [239]:
import os
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [240]:
'''
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
'''

"\nnltk.download('punkt')\nnltk.download('punkt_tab')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [241]:
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', 'data')
file_path = os.path.join(data_dir, 'youtoxic_english_1000.csv')

In [242]:
df = pd.read_csv(file_path)

In [243]:
lemmatizer = WordNetLemmatizer()

In [244]:
def preprocess_text(text):
    text = text.lower()

    text = re.sub(f"[{string.punctuation}]", "", text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

df['Text'] = df['Text'].apply(preprocess_text)

In [245]:

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [246]:
lemmatizer = WordNetLemmatizer()

In [247]:
hate_speech_columns = ['IsHatespeech', 'IsRacist', 'IsNationalist', 'IsSexist', 'IsHomophobic', 'IsReligiousHate', 'IsRadicalism']
offensive_language_columns = ['IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative']

In [248]:
def map_category(df):
    category_mapping = {0: 'Neither', 1: 'Hate Speech', 2: 'Offensive Language'}

    # Create the IsHateSpeech column
    df['IsHateSpeech'] = df[hate_speech_columns].any(axis=1).astype(int)

    # Create the IsOffensive column
    df['IsOffensive'] = df[offensive_language_columns].any(axis=1).astype(int)

    for index, row in df.iterrows():
        if row['IsHateSpeech'] == 1:
            df.at[index, 'Category'] = 1
        elif row['IsOffensive'] == 1:
            df.at[index, 'Category'] = 2
        else:
            df.at[index, 'Category'] = 0
    
    df['Category'] = df['Category'].map(category_mapping)

    return df
# Call the map_category function to update the original DataFrame
df = map_category(df)

# Get the count of positive values in IsHateSpeech and IsOffensive columns
hate_speech_count = df['IsHateSpeech'].sum()
offensive_count = df['IsOffensive'].sum()

print(f"Number of positive hate speech samples: {hate_speech_count}")
print(f"Number of positive offensive language samples: {offensive_count}")

# Print the unique categories and their counts
print(df['Category'].value_counts())


Number of positive hate speech samples: 138
Number of positive offensive language samples: 462
Category
Neither               538
Offensive Language    324
Hate Speech           138
Name: count, dtype: int64


In [249]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['Text']) 

In [250]:
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()) 
X_df['IsHateSpeech'] = df['IsHateSpeech']
X_df['IsOffensive'] = df['IsOffensive']
X_df['Category'] = df['Category']
df.drop(columns=hate_speech_columns + offensive_language_columns + ['Text'] + ['IsOffensive'] + ['IsHateSpeech'], inplace=True)


In [251]:
output_file = os.path.join(data_dir, 'preprocessed_data_triclase.csv')
X_df.to_csv(output_file, index=False)

In [252]:
vectorizer_filename = os.path.join(data_dir, 'tfidf_vectorizer_triclase.joblib')

# Save the vectorizer to the specified directory
joblib.dump(vectorizer, vectorizer_filename)

['c:\\Users\\iryna\\Desktop\\NLP_Youtube_9\\notebooks\\..\\data\\tfidf_vectorizer_triclase.joblib']