In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
train_url = '/kaggle/input/please/train_shuffled.csv'
test_url = '/kaggle/input/please/test_shuffled.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/kaggle/input/bangla-stopwords/stopwords_bangla.xlsx',index_col=False)

In [4]:
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [6]:
import re
def preprocess(x):
    html_pattern = re.compile('<.*?>')
    x = html_pattern.sub(r'', x)
    x = " ".join([word for word in str(x).split() if word not in STOPWORDS])
    return x
df_train['Comment'] = df_train['Comment'].apply(lambda x: preprocess(x))
df_test['Comment'] = df_test['Comment'].apply(lambda x:preprocess(x))

In [7]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(df_train.Error)
Test_Y = Encoder.fit_transform(df_test.Error)

In [8]:
df_all  = pd.concat([df_train, df_test], ignore_index=True)
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(df_all['Comment'])
Train_X_Tfidf = Tfidf_vect.transform(df_train['Comment'])
Test_X_Tfidf = Tfidf_vect.transform(df_test['Comment'])

In [14]:
model = XGBClassifier(
    n_estimators=700,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=4,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

In [15]:
model.fit(Train_X_Tfidf, Train_Y)

In [17]:
y_pred = model.predict(Test_X_Tfidf)

In [18]:
print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.5576    0.3494    0.4296      1926
           1     0.6787    0.8321    0.7476      3181

    accuracy                         0.6501      5107
   macro avg     0.6181    0.5908    0.5886      5107
weighted avg     0.6330    0.6501    0.6277      5107



In [20]:
model = XGBClassifier(
    n_estimators=700,  # Number of boosting rounds (trees)
    learning_rate=0.01,  # Step size shrinkage to prevent overfitting
    max_depth=4,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.6403    0.0924    0.1615      1926
           1     0.6380    0.9686    0.7693      3181

    accuracy                         0.6381      5107
   macro avg     0.6392    0.5305    0.4654      5107
weighted avg     0.6389    0.6381    0.5401      5107



In [61]:
df_all  = pd.concat([df_train, df_test], ignore_index=True)
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(df_all['Comment'])
Train_X_Tfidf = Tfidf_vect.transform(df_train['Comment'])
Test_X_Tfidf = Tfidf_vect.transform(df_test['Comment'])

In [62]:
model = XGBClassifier(
    n_estimators=100,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=3,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.5222    0.0738    0.1294      1910
           1     0.6277    0.9585    0.7586      3112

    accuracy                         0.6221      5022
   macro avg     0.5750    0.5162    0.4440      5022
weighted avg     0.5876    0.6221    0.5193      5022



In [21]:
model = XGBClassifier(
    n_estimators=500,  # Number of boosting rounds (trees)
    learning_rate=0.05,  # Step size shrinkage to prevent overfitting
    max_depth=4,        # Maximum depth of each tree
    subsample=0.7,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.7,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.5905    0.2321    0.3332      1926
           1     0.6600    0.9025    0.7624      3181

    accuracy                         0.6497      5107
   macro avg     0.6252    0.5673    0.5478      5107
weighted avg     0.6338    0.6497    0.6006      5107



In [22]:

# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,      # Increased boosting rounds
    learning_rate=0.05,    # Lower learning rate
    max_depth=5,           # Increased maximum depth
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,         # L1 regularization term
    reg_lambda=0.1,        # L2 regularization term
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6036    0.1890    0.2879      1926
           1     0.6532    0.9249    0.7656      3181

    accuracy                         0.6473      5107
   macro avg     0.6284    0.5569    0.5268      5107
weighted avg     0.6345    0.6473    0.5855      5107



In [23]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,      # Increased boosting rounds
    learning_rate=0.01,    # Lower learning rate
    max_depth=4,           # Increased maximum depth
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,         # L1 regularization term
    reg_lambda=0.1,        # L2 regularization term
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6538    0.0353    0.0670      1926
           1     0.6286    0.9887    0.7686      3181

    accuracy                         0.6291      5107
   macro avg     0.6412    0.5120    0.4178      5107
weighted avg     0.6381    0.6291    0.5040      5107



In [24]:

# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=3,   # Experiment with different values
    gamma=0.05,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6003    0.1926    0.2917      1926
           1     0.6536    0.9224    0.7651      3181

    accuracy                         0.6472      5107
   macro avg     0.6270    0.5575    0.5284      5107
weighted avg     0.6335    0.6472    0.5865      5107



In [25]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=1,   # Experiment with different values
    gamma=0.1,            # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6030    0.1869    0.2854      1926
           1     0.6528    0.9255    0.7656      3181

    accuracy                         0.6470      5107
   macro avg     0.6279    0.5562    0.5255      5107
weighted avg     0.6340    0.6470    0.5845      5107



In [26]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.005,  # Lower learning rate
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=1,   # Experiment with different values
    gamma=0.15,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6768    0.0348    0.0662      1926
           1     0.6288    0.9899    0.7691      3181

    accuracy                         0.6297      5107
   macro avg     0.6528    0.5124    0.4176      5107
weighted avg     0.6469    0.6297    0.5040      5107



In [27]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=3,   # Experiment with different values
    gamma=0.05,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5895    0.2565    0.3575      1926
           1     0.6646    0.8919    0.7616      3181

    accuracy                         0.6522      5107
   macro avg     0.6270    0.5742    0.5595      5107
weighted avg     0.6363    0.6522    0.6092      5107



In [29]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=700,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=3,   # Experiment with different values
    gamma=0.1,            # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))


              precision    recall  f1-score   support

           0     0.5832    0.2965    0.3931      1926
           1     0.6718    0.8717    0.7588      3181

    accuracy                         0.6548      5107
   macro avg     0.6275    0.5841    0.5760      5107
weighted avg     0.6384    0.6548    0.6209      5107



In [69]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=5,   # Experiment with different values
    gamma=0.05,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))



              precision    recall  f1-score   support

           0     0.5344    0.0366    0.0686      1910
           1     0.6238    0.9804    0.7625      3112

    accuracy                         0.6215      5022
   macro avg     0.5791    0.5085    0.4155      5022
weighted avg     0.5898    0.6215    0.4986      5022



In [76]:
model = XGBClassifier(
    n_estimators=500,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=4,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5249    0.2644    0.3517      1910
           1     0.6539    0.8531    0.7404      3112

    accuracy                         0.6292      5022
   macro avg     0.5894    0.5588    0.5460      5022
weighted avg     0.6049    0.6292    0.5925      5022



In [80]:
model = XGBClassifier(
    n_estimators=500,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=6,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5347    0.3188    0.3995      1910
           1     0.6649    0.8297    0.7382      3112

    accuracy                         0.6354      5022
   macro avg     0.5998    0.5743    0.5689      5022
weighted avg     0.6154    0.6354    0.6094      5022



In [86]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=800,    # Increased boosting rounds
    learning_rate=0.05,  # Lower learning rate
    max_depth=7,         # Increased maximum depth
    subsample=0.85,      # Experiment with different values
    colsample_bytree=0.85,  # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))


              precision    recall  f1-score   support

           0     0.5310    0.3047    0.3872      1910
           1     0.6617    0.8348    0.7383      3112

    accuracy                         0.6332      5022
   macro avg     0.5964    0.5698    0.5628      5022
weighted avg     0.6120    0.6332    0.6048      5022



In [93]:
model = XGBClassifier(
    n_estimators=1000,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=6,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5189    0.3738    0.4346      1910
           1     0.6720    0.7873    0.7251      3112

    accuracy                         0.6300      5022
   macro avg     0.5954    0.5805    0.5798      5022
weighted avg     0.6138    0.6300    0.6146      5022



In [102]:
model = XGBClassifier(
    n_estimators=1000,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=5,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,     # Fraction of features used for fitting each tree   # Experiment with different values    
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5168    0.3628    0.4263      1910
           1     0.6694    0.7918    0.7255      3112

    accuracy                         0.6286      5022
   macro avg     0.5931    0.5773    0.5759      5022
weighted avg     0.6113    0.6286    0.6117      5022



In [104]:
model = XGBClassifier(
    n_estimators=500,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=7,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5278    0.3335    0.4087      1910
           1     0.6663    0.8168    0.7339      3112

    accuracy                         0.6330      5022
   macro avg     0.5970    0.5752    0.5713      5022
weighted avg     0.6136    0.6330    0.6103      5022

