In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import svm

In [2]:
#load final dataset(all features put together)
final = pd.read_csv('Final.csv')

In [3]:
final.head()

Unnamed: 0,course_id,thread_id,no_of_posts,no_of_comments,no_uni_users,no_of_anonymous_msg,staff_replied,no of msgs,avg_num_words,max_words,avg_resp_time,first_post_day,msg_rate,u_chain,index_longest_post,num_views,votes,votes_Square,index_max_votes,forum_id
0,analysenumerique-001,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,2,4,0,10
1,analysenumerique-001,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
2,analysenumerique-001,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,1,1,0,10
3,analysenumerique-001,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10
4,analysenumerique-001,2,9.0,2.0,5.0,2.0,0.0,11.0,12.454545,31.0,-152871.727273,2,1359260000.0,True,4,277,0,0,0,10


In [4]:
#drop unwanted columns
final = final.drop(['thread_id', 'course_id', 'votes', 'votes_Square'], axis=1)

In [5]:
#dividing data set to testing and training sets
X = final.drop('forum_id', axis=1)
y = final['forum_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state = 42) #divides df into train and test sets

In [6]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(495179, 15)
(243895, 15)
(495179,)
(243895,)


### Default Model:

In [14]:
#creating a default XGBoost Classifier
xgbd = xgb.XGBClassifier()
xgbd.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Fine tuned Model:

In [31]:
#fine tuning the model
xgbm = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=12,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
xgbm.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Changes Made from default parameters:

- gamma from 0 to 1

- colsample_bytree from 1 to 0.5

- max_depth from 6 to 12

## Results:

### Default Parameters:

In [33]:
from sklearn.metrics import accuracy_score
y_pred = xgbd.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Default model Accuracy: %.2f%%" % (accuracy * 100.0))

Default model Accuracy: 83.31%


### Fine tuned Parameters:

In [34]:
y_predm = xgbm.predict(X_test)
predictionsm = [round(value) for value in y_predm]
# evaluate predictions
accuracym = accuracy_score(y_test, predictionsm)
print("Fine tuned model Accuracy: %.2f%%" % (accuracym * 100.0))

Fine tuned model Accuracy: 96.56%


### Conclusion

The model significantly improved by tweaking the max_depth parameter. It went from 83.31% to 96.56%. This parameter controls the number of different features used in each tree. This meant that adding more features being used in each tree improves the model accuracy.