# Class weights method for data balancing

In [18]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight

In [19]:
# Load the prepared modelling dataset (path is relative to the notebook's working directory).
DATASET_PATH = 'final_dataset_for_model.csv'
df_t = pd.read_csv(DATASET_PATH)

In [20]:
# Encode the target column `labels` as integers and store the result
# next to the original column as `label_encoded`.
label_encoder = LabelEncoder()
label_encoder.fit(df_t['labels'])
df_t['label_encoded'] = label_encoder.transform(df_t['labels'])

In [21]:
# Model inputs: the message text plus the three columns that are
# integer-encoded further down; the target is the encoded label column.
feature_columns = ['anonymized_message', 'level', 'thread', 'class']
X = df_t[feature_columns]
y = df_t['label_encoded']

In [22]:
# Vectorize the message text into a sparse TF-IDF matrix (one row per message).
# NOTE(review): the vectorizer is fitted on all rows, i.e. before the
# train/test split performed later, so vocabulary/IDF statistics from the
# test rows influence the training features — consider fitting on the
# training rows only.
vectorizer = TfidfVectorizer()
message_text = X['anonymized_message']
X_text = vectorizer.fit_transform(message_text)

In [23]:
# Integer-encode each categorical feature with its OWN LabelEncoder.
# Reusing `label_encoder` here would refit it three times and overwrite the
# mapping it learned from the target `labels` column, so it could no longer
# translate predicted integers back to label names. `feature_encoders`
# keeps every column's fitted encoder available for later reuse.
categorical_columns = ['level', 'thread', 'class']
feature_encoders = {col: LabelEncoder() for col in categorical_columns}
X_categorical = pd.DataFrame({
    col: feature_encoders[col].fit_transform(X[col])
    for col in categorical_columns
})

In [24]:
from scipy.sparse import hstack

# Place the TF-IDF columns and the encoded categorical columns side by side
# in a single sparse feature matrix (rows stay aligned by position).
feature_blocks = [X_text, X_categorical]
X_combined = hstack(feature_blocks)

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified — with imbalanced labels,
# consider passing stratify=y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Calculate 'balanced' class weights from the training labels only.
# `train_classes` holds the sorted distinct labels found in y_train;
# compute_class_weight returns one weight per entry, in that same order.
# (np comes from the notebook's import cell.)
train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train,
)

In [27]:
# Create a dictionary mapping class labels to weights.
# Keys come from the labels actually present in y_train (the same sorted
# array the weights were computed for) instead of positional indices, so the
# mapping stays correct even if the labels are not exactly 0..k-1.
# .tolist() converts NumPy scalars to plain Python int/float for clean printing.
class_weight_dict = dict(zip(np.unique(y_train).tolist(), class_weights.tolist()))
print("Class Weights:", class_weight_dict)

Class Weights: {0: 1.7338732866864757, 1: 0.4623868922856705, 2: 1.379508884313277, 3: 1.8668214654282766}


In [28]:
# Fit a random forest that weights each class by the precomputed
# class_weight_dict; the seed fixes the forest's randomness.
model = RandomForestClassifier(
    class_weight=class_weight_dict,
    random_state=42,
)
model.fit(X_train, y_train)

In [29]:
# Evaluate the model: predict an encoded class label for every row of the
# held-out test matrix produced by the train/test split.
y_pred = model.predict(X_test)

In [30]:
# Per-class precision/recall/F1, followed by the confusion matrix
# (computed with true labels first, predictions second).
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(report_text)
print(confusion)

              precision    recall  f1-score   support

           0       0.48      0.50      0.49      5336
           1       0.96      0.48      0.64     19390
           2       0.92      0.61      0.73      6518
           3       0.25      0.85      0.39      4936

    accuracy                           0.56     36180
   macro avg       0.65      0.61      0.56     36180
weighted avg       0.78      0.56      0.60     36180

[[2644  142   42 2508]
 [1850 9271  256 8013]
 [ 497  104 3973 1944]
 [ 539  162   25 4210]]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the random forest:
# 3 * 4 * 3 * 3 * 2 = 216 combinations, each evaluated with 3-fold CV.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=[None, 'balanced'],
)

# Base estimator; the seed keeps every candidate forest reproducible.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination.
print("Best Parameters: ", grid_search.best_params_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
