In [51]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
train_url = '/kaggle/input/bangla-dataset/train_corr.csv'
test_url = '/kaggle/input/bangla-dataset/test_corr.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('/kaggle/input/bangla-stopwords/stopwords_bangla.xlsx',index_col=False)

In [52]:
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [53]:
import re
def preprocess(x):
    html_pattern = re.compile('<.*?>')
    x = html_pattern.sub(r'', x)
    x = " ".join([word for word in str(x).split() if word not in STOPWORDS])
    return x
df_train['Comment'] = df_train['Comment'].apply(lambda x: preprocess(x))
df_test['Comment'] = df_test['Comment'].apply(lambda x:preprocess(x))

In [54]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(df_train.Error)
Test_Y = Encoder.fit_transform(df_test.Error)

In [55]:
df_all  = pd.concat([df_train, df_test], ignore_index=True)
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(df_all['Comment'])
Train_X_Tfidf = Tfidf_vect.transform(df_train['Comment'])
Test_X_Tfidf = Tfidf_vect.transform(df_test['Comment'])

In [56]:
model = XGBClassifier(
    n_estimators=100,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=3,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

In [57]:
model.fit(Train_X_Tfidf, Train_Y)

In [59]:
y_pred = model.predict(Test_X_Tfidf)

In [21]:
print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.5222    0.0738    0.1294      1910
           1     0.6277    0.9585    0.7586      3112

    accuracy                         0.6221      5022
   macro avg     0.5750    0.5162    0.4440      5022
weighted avg     0.5876    0.6221    0.5193      5022



In [60]:
model = XGBClassifier(
    n_estimators=100,  # Number of boosting rounds (trees)
    learning_rate=0.01,  # Step size shrinkage to prevent overfitting
    max_depth=3,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      1910
           1     0.6197    1.0000    0.7652      3112

    accuracy                         0.6197      5022
   macro avg     0.3098    0.5000    0.3826      5022
weighted avg     0.3840    0.6197    0.4742      5022



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
df_all  = pd.concat([df_train, df_test], ignore_index=True)
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(df_all['Comment'])
Train_X_Tfidf = Tfidf_vect.transform(df_train['Comment'])
Test_X_Tfidf = Tfidf_vect.transform(df_test['Comment'])

In [62]:
model = XGBClassifier(
    n_estimators=100,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=3,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.5222    0.0738    0.1294      1910
           1     0.6277    0.9585    0.7586      3112

    accuracy                         0.6221      5022
   macro avg     0.5750    0.5162    0.4440      5022
weighted avg     0.5876    0.6221    0.5193      5022



In [63]:
model = XGBClassifier(
    n_estimators=200,  # Number of boosting rounds (trees)
    learning_rate=0.05,  # Step size shrinkage to prevent overfitting
    max_depth=4,        # Maximum depth of each tree
    subsample=0.9,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.9,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred,digits = 4))

              precision    recall  f1-score   support

           0     0.5408    0.0832    0.1443      1910
           1     0.6297    0.9566    0.7594      3112

    accuracy                         0.6245      5022
   macro avg     0.5852    0.5199    0.4519      5022
weighted avg     0.5959    0.6245    0.5255      5022



In [64]:

# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,      # Increased boosting rounds
    learning_rate=0.05,    # Lower learning rate
    max_depth=5,           # Increased maximum depth
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,         # L1 regularization term
    reg_lambda=0.1,        # L2 regularization term
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5458    0.1403    0.2232      1910
           1     0.6376    0.9283    0.7560      3112

    accuracy                         0.6286      5022
   macro avg     0.5917    0.5343    0.4896      5022
weighted avg     0.6027    0.6286    0.5534      5022



In [35]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,      # Increased boosting rounds
    learning_rate=0.01,    # Lower learning rate
    max_depth=4,           # Increased maximum depth
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,         # L1 regularization term
    reg_lambda=0.1,        # L2 regularization term
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5596    0.0319    0.0604      1910
           1     0.6237    0.9846    0.7636      3112

    accuracy                         0.6223      5022
   macro avg     0.5916    0.5083    0.4120      5022
weighted avg     0.5993    0.6223    0.4962      5022



In [32]:

# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=3,   # Experiment with different values
    gamma=0.05,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5214    0.1340    0.2132      1910
           1     0.6350    0.9245    0.7528      3112

    accuracy                         0.6239      5022
   macro avg     0.5782    0.5293    0.4830      5022
weighted avg     0.5918    0.6239    0.5476      5022



In [65]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=1,   # Experiment with different values
    gamma=0.1,            # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5491    0.1435    0.2275      1910
           1     0.6383    0.9277    0.7563      3112

    accuracy                         0.6294      5022
   macro avg     0.5937    0.5356    0.4919      5022
weighted avg     0.6044    0.6294    0.5551      5022



In [49]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.005,  # Lower learning rate
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=1,   # Experiment with different values
    gamma=0.15,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7083    0.0089    0.0176      1910
           1     0.6212    0.9978    0.7657      3112

    accuracy                         0.6217      5022
   macro avg     0.6648    0.5033    0.3917      5022
weighted avg     0.6544    0.6217    0.4812      5022



In [50]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=3,   # Experiment with different values
    gamma=0.05,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5214    0.1340    0.2132      1910
           1     0.6350    0.9245    0.7528      3112

    accuracy                         0.6239      5022
   macro avg     0.5782    0.5293    0.4830      5022
weighted avg     0.5918    0.6239    0.5476      5022



In [67]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=2,   # Experiment with different values
    gamma=0.1,            # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))


              precision    recall  f1-score   support

           0     0.5169    0.2084    0.2970      1910
           1     0.6444    0.8805    0.7442      3112

    accuracy                         0.6249      5022
   macro avg     0.5806    0.5444    0.5206      5022
weighted avg     0.5959    0.6249    0.5741      5022



In [69]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=5,   # Experiment with different values
    gamma=0.05,           # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))



              precision    recall  f1-score   support

           0     0.5344    0.0366    0.0686      1910
           1     0.6238    0.9804    0.7625      3112

    accuracy                         0.6215      5022
   macro avg     0.5791    0.5085    0.4155      5022
weighted avg     0.5898    0.6215    0.4986      5022



In [76]:
model = XGBClassifier(
    n_estimators=500,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=4,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5249    0.2644    0.3517      1910
           1     0.6539    0.8531    0.7404      3112

    accuracy                         0.6292      5022
   macro avg     0.5894    0.5588    0.5460      5022
weighted avg     0.6049    0.6292    0.5925      5022



In [80]:
model = XGBClassifier(
    n_estimators=500,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=6,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5347    0.3188    0.3995      1910
           1     0.6649    0.8297    0.7382      3112

    accuracy                         0.6354      5022
   macro avg     0.5998    0.5743    0.5689      5022
weighted avg     0.6154    0.6354    0.6094      5022



In [86]:
# Updated XGBoost parameters
model = XGBClassifier(
    n_estimators=800,    # Increased boosting rounds
    learning_rate=0.05,  # Lower learning rate
    max_depth=7,         # Increased maximum depth
    subsample=0.85,      # Experiment with different values
    colsample_bytree=0.85,  # Experiment with different values
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))


              precision    recall  f1-score   support

           0     0.5310    0.3047    0.3872      1910
           1     0.6617    0.8348    0.7383      3112

    accuracy                         0.6332      5022
   macro avg     0.5964    0.5698    0.5628      5022
weighted avg     0.6120    0.6332    0.6048      5022



In [93]:
model = XGBClassifier(
    n_estimators=1000,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=6,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5189    0.3738    0.4346      1910
           1     0.6720    0.7873    0.7251      3112

    accuracy                         0.6300      5022
   macro avg     0.5954    0.5805    0.5798      5022
weighted avg     0.6138    0.6300    0.6146      5022



In [102]:
model = XGBClassifier(
    n_estimators=1000,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=5,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,     # Fraction of features used for fitting each tree   # Experiment with different values    
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5168    0.3628    0.4263      1910
           1     0.6694    0.7918    0.7255      3112

    accuracy                         0.6286      5022
   macro avg     0.5931    0.5773    0.5759      5022
weighted avg     0.6113    0.6286    0.6117      5022



In [104]:
model = XGBClassifier(
    n_estimators=500,  # Number of boosting rounds (trees)
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=7,        # Maximum depth of each tree
    subsample=0.8,      # Fraction of samples used for fitting each tree
    colsample_bytree=0.8,  # Fraction of features used for fitting each tree
    random_state=42
)

model.fit(Train_X_Tfidf, Train_Y)
y_pred = model.predict(Test_X_Tfidf)

print(metrics.classification_report(Test_Y, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.5278    0.3335    0.4087      1910
           1     0.6663    0.8168    0.7339      3112

    accuracy                         0.6330      5022
   macro avg     0.5970    0.5752    0.5713      5022
weighted avg     0.6136    0.6330    0.6103      5022

