In [38]:
import os
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [39]:
# One-time NLTK resource downloads, disabled by wrapping them in a string
# literal (the cell therefore just evaluates to that string).
# NOTE(review): presumably these resources back word_tokenize, stopwords and
# WordNetLemmatizer used further down; on a machine where they are missing,
# run the four nltk.download(...) calls once before the rest of the notebook.
'''
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
'''

"\nnltk.download('punkt')\nnltk.download('punkt_tab')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [40]:
# Resolve the dataset location relative to the directory the notebook is
# started from: <cwd>/../data/<dataset file>.
DATASET_FILENAME = 'youtoxic_english_1000.csv'

current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', 'data')
file_path = os.path.join(data_dir, DATASET_FILENAME)

In [41]:
# Load the comments dataset from file_path; later cells use its 'Text' column
# and the label columns whose names start with 'Is'.
df = pd.read_csv(file_path)

In [42]:
# Single shared lemmatizer instance; preprocess_text reads it as a global.
lemmatizer = WordNetLemmatizer()

In [43]:
# Translation table that deletes every character in string.punctuation.
# This replaces the regex character class f"[{string.punctuation}]": built
# without escaping, the "\" in string.punctuation escapes the following "]",
# so the backslash itself was never part of the class and was left in the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stop words, loaded on first use and reused by every later call.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise one comment into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, delete ASCII punctuation, tokenise with
    ``word_tokenize``, drop NLTK English stop words, lemmatise each remaining
    token with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as an empty comment.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` when nothing
        survives or the input is not a string.

    Notes
    -----
    NOTE(review): punctuation is removed before stop-word filtering, so
    contractions lose their apostrophe ("dont", "wasnt" appear in the sample
    output) and presumably no longer match the stop-word list. Left as is,
    because changing it would alter the produced vocabulary.
    """
    # Missing values reach this function as non-strings when used with
    # Series.apply; calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower().translate(_PUNCTUATION_TABLE)

    stop_words = _english_stop_words()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return ' '.join(tokens)

# Replace the raw comment text with its preprocessed form, element by element.
df['Text'] = df['Text'].map(preprocess_text)

# Plain-text preview of the first five rows.
print(df.head(5))

              CommentId      VideoId  \
0  Ugg2KwwX0V8-aXgCoAEC  04kJtp6pVXI   
1  Ugg2s5AzSPioEXgCoAEC  04kJtp6pVXI   
2  Ugg3dWTOxryFfHgCoAEC  04kJtp6pVXI   
3  Ugg7Gd006w1MPngCoAEC  04kJtp6pVXI   
4  Ugg8FfTbbNF8IngCoAEC  04kJtp6pVXI   

                                                Text  IsToxic  IsAbusive  \
0  people would take step back make case wasnt an...    False      False   
1  law enforcement trained shoot apprehend traine...     True       True   
2  dont reckon black life matter banner held whit...     True       True   
3  large number people like police officer called...    False      False   
4  arab dude absolutely right shot 6 extra time s...    False      False   

   IsThreat  IsProvocative  IsObscene  IsHatespeech  IsRacist  IsNationalist  \
0     False          False      False         False     False          False   
1     False          False      False         False     False          False   
2     False          False       True         False     False 

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
import joblib

In [55]:
# Label columns are the ones whose name starts with 'Is'.
target_columns = [col for col in df.columns if col.startswith('Is')]

# Fitted TF-IDF vectorizers, keyed by target column; filled in by the next cell.
vectorizers = {}

# A target with at most one distinct value gives a classifier nothing to learn.
constant_columns = [col for col in target_columns if df[col].nunique() <= 1]
for target_column in constant_columns:
    print(f"Removing column {target_column} due to lack of variability.")

# 'IsSexist' is excluded by hand. It is dropped exactly once and only if it is
# still present: the previous per-iteration drop raised KeyError as soon as a
# second constant column was found, or when 'IsSexist' itself was constant.
# NOTE(review): the reason for excluding it is not visible here — confirm.
manually_excluded = [
    col for col in ['IsSexist']
    if col in df.columns and col not in constant_columns
]
for target_column in manually_excluded:
    print(f"Removing column {target_column} (manually excluded).")

columns_to_drop = constant_columns + manually_excluded
df = df.drop(columns=columns_to_drop)

# Keep the target list in step with df so later cells never index a column
# that has just been removed.
target_columns = [col for col in target_columns if col not in columns_to_drop]

In [56]:
# Settings for the per-target split / vectorise / resample pipeline.
TEST_SIZE = 0.3
RANDOM_STATE = 42
MAX_TFIDF_FEATURES = 5000
MAX_MAJORITY_PCT = 90  # resampling is skipped when the majority class exceeds this share (%)


def _labelled_frame(matrix, feature_names, target_column, labels):
    """Return a dense TF-IDF frame with `labels` attached as `target_column`.

    Labels are attached by position. They still carry the index of the rows
    they were sampled from, while the feature frame gets a fresh 0..n-1 index;
    assigning the Series directly would align on index, pairing rows with the
    wrong labels and producing NaN wherever an index value is absent.
    """
    frame = pd.DataFrame(matrix.toarray(), columns=feature_names)
    frame[target_column] = pd.Series(labels).to_numpy()
    return frame


# Fresh accumulators on every run: the cell no longer relies on
# all_train_data / all_test_data existing from an earlier execution.
train_frames = []
test_frames = []

# Ignore target names whose column is no longer present in df.
active_targets = [col for col in target_columns if col in df.columns]

for target_column in active_targets:
    print(f"Processing target column: {target_column}")

    X = df['Text']
    y = df[target_column]

    class_counts = y.value_counts(normalize=True) * 100
    print(f"Class distribution for {target_column}:")
    print(class_counts)

    # Split and vectorise once; both the resampled and the non-resampled path
    # use the same stratified split and a vectorizer fitted on train only.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES, stop_words='english')
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    feature_names = tfidf.get_feature_names_out()

    if class_counts.max() > MAX_MAJORITY_PCT:
        print(f"Skipping resampling for {target_column} due to extreme imbalance.")
        X_train_final, y_train_final = X_train_tfidf, y_train
    else:
        # Only the training portion is resampled; the test set keeps its
        # original class distribution.
        smote_tomek = SMOTETomek(sampling_strategy='minority', random_state=RANDOM_STATE)
        X_train_final, y_train_final = smote_tomek.fit_resample(X_train_tfidf, y_train)

    train_data = _labelled_frame(X_train_final, feature_names, target_column, y_train_final)
    test_data = _labelled_frame(X_test_tfidf, feature_names, target_column, y_test)

    vectorizers[target_column] = tfidf
    train_frames.append(train_data)
    test_frames.append(test_data)

    print(f"Train and test data for {target_column} processed!")

# One side-by-side concatenation per split, same layout as before: each
# target contributes its own feature columns followed by its label column.
# NOTE(review): per-target frames can differ in row count and share feature
# names, so the wide result contains NaN padding and duplicate column names.
all_train_data = pd.concat(train_frames, axis=1) if train_frames else pd.DataFrame()
all_test_data = pd.concat(test_frames, axis=1) if test_frames else pd.DataFrame()

train_file_path = os.path.join(data_dir, 'dropped_train_data.csv')
test_file_path = os.path.join(data_dir, 'dropped_test_data.csv')

all_train_data.to_csv(train_file_path, index=False)
all_test_data.to_csv(test_file_path, index=False)

vectorizer_file_path = os.path.join(data_dir, 'all_tfidf_vectorizers.joblib')
joblib.dump(vectorizers, vectorizer_file_path)

print("All target columns have been processed, vectorized, oversampled, and saved!")
print(f"Train data saved to: {train_file_path}")
print(f"Test data saved to: {test_file_path}")
print(f"All TF-IDF vectorizers have been saved to: {vectorizer_file_path}")

Processing target column: IsToxic
Class distribution for IsToxic:
IsToxic
False    53.8
True     46.2
Name: proportion, dtype: float64
Train and test data for IsToxic processed!
Processing target column: IsAbusive
Class distribution for IsAbusive:
IsAbusive
False    64.7
True     35.3
Name: proportion, dtype: float64
Train and test data for IsAbusive processed!
Processing target column: IsThreat
Class distribution for IsThreat:
IsThreat
False    97.9
True      2.1
Name: proportion, dtype: float64
Skipping resampling for IsThreat due to extreme imbalance.
Train and test data for IsThreat processed!
Processing target column: IsProvocative
Class distribution for IsProvocative:
IsProvocative
False    83.9
True     16.1
Name: proportion, dtype: float64
Train and test data for IsProvocative processed!
Processing target column: IsObscene
Class distribution for IsObscene:
IsObscene
False    90.0
True     10.0
Name: proportion, dtype: float64
Train and test data for IsObscene processed!
Process