In [12]:
import os
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [13]:
'''
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
'''

"\nnltk.download('punkt')\nnltk.download('punkt_tab')\nnltk.download('stopwords')\nnltk.download('wordnet')\n"

In [14]:
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '..', 'data')
file_path = os.path.join(data_dir, 'youtoxic_english_1000.csv')

In [15]:
df = pd.read_csv(file_path)

In [16]:
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess_text(text):
    text = text.lower()

    text = re.sub(f"[{string.punctuation}]", "", text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

df['Text'] = df['Text'].apply(preprocess_text)

print(df.head())

              CommentId      VideoId  \
0  Ugg2KwwX0V8-aXgCoAEC  04kJtp6pVXI   
1  Ugg2s5AzSPioEXgCoAEC  04kJtp6pVXI   
2  Ugg3dWTOxryFfHgCoAEC  04kJtp6pVXI   
3  Ugg7Gd006w1MPngCoAEC  04kJtp6pVXI   
4  Ugg8FfTbbNF8IngCoAEC  04kJtp6pVXI   

                                                Text  IsToxic  IsAbusive  \
0  people would take step back make case wasnt an...    False      False   
1  law enforcement trained shoot apprehend traine...     True       True   
2  dont reckon black life matter banner held whit...     True       True   
3  large number people like police officer called...    False      False   
4  arab dude absolutely right shot 6 extra time s...    False      False   

   IsThreat  IsProvocative  IsObscene  IsHatespeech  IsRacist  IsNationalist  \
0     False          False      False         False     False          False   
1     False          False      False         False     False          False   
2     False          False       True         False     False 

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
import joblib

In [19]:
target_columns = [col for col in df.columns if col.startswith('Is')]
vectorizers = {}

In [None]:
all_train_data = pd.DataFrame()
all_test_data = pd.DataFrame()

for target_column in target_columns:
    print(f"Processing target column: {target_column}")

    X = df['Text']
    y = df[target_column]  

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Initialize TF-IDF Vectorizer
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

    # Fit and transform the training data, transform the test data
    X_train_tfidf = tfidf.fit_transform(X_train) 
    X_test_tfidf = tfidf.transform(X_test)  

    # Save the vectorizer for later use
    vectorizers[target_column] = tfidf

    # Apply ADASYN to balance the training data (oversample minority class)
    adasyn = ADASYN(sampling_strategy='minority', n_neighbors=3, random_state=42)
    X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_tfidf, y_train)

    # Convert the resampled data to a DataFrame for train and test sets
    train_data = pd.DataFrame(X_train_resampled.toarray(), columns=tfidf.get_feature_names_out())  # Convert sparse matrix to array
    train_data[target_column] = y_train_resampled

    test_data = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf.get_feature_names_out())  # Convert sparse matrix to array
    test_data[target_column] = y_test

    # Append the train and test data to the overall DataFrames
    all_train_data = pd.concat([all_train_data, train_data], axis=1)
    all_test_data = pd.concat([all_test_data, test_data], axis=1)

    print(f"Train and test data for {target_column} processed!")

# Save the concatenated train and test data for all target columns
train_file_path = os.path.join(data_dir, 'all_train_data.csv')
test_file_path = os.path.join(data_dir, 'all_test_data.csv')

# Save as CSV
all_train_data.to_csv(train_file_path, index=False)
all_test_data.to_csv(test_file_path, index=False)

# Save the vectorizers for all target columns
vectorizer_file_path = os.path.join(data_dir, 'all_tfidf_vectorizers.joblib')
joblib.dump(vectorizers, vectorizer_file_path)

print(f"All target columns have been processed, vectorized, oversampled, and saved!")
print(f"Train data saved to: {train_file_path}")
print(f"Test data saved to: {test_file_path}")
print(f"All TF-IDF vectorizers have been saved to: {vectorizer_file_path}")

Processing target column: IsToxic
Train and test data for IsToxic processed!
Processing target column: IsAbusive
Train and test data for IsAbusive processed!
Processing target column: IsThreat
Train and test data for IsThreat processed!
Processing target column: IsProvocative
Train and test data for IsProvocative processed!
Processing target column: IsObscene
Train and test data for IsObscene processed!
Processing target column: IsHatespeech
Train and test data for IsHatespeech processed!
Processing target column: IsRacist
Train and test data for IsRacist processed!
Processing target column: IsNationalist


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 6, n_samples_fit = 4, n_samples = 4