In [37]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score 
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

# Part I. Classification on 20newsgroup Data

In [2]:
doc=pd.read_table(r'../data/20newsgroup/documents.txt',header=None)
wordlist=pd.read_table(r'../data/20newsgroup/wordlist.txt',header=None)[0].tolist()
target=pd.read_table(r'../data/20newsgroup/newsgroups.txt',header=None)[0].to_numpy()

In [3]:
data=np.zeros((16242,100),dtype=int)
for index, row in doc.iterrows():
    i,j = row[0],row[1]
    data[i-1,j-1] = 1

In [4]:
datadf=pd.DataFrame(data,columns=wordlist)
datadf

Unnamed: 0,aids,baseball,bible,bmw,cancer,car,card,case,children,christian,...,university,version,video,vitamin,war,water,win,windows,won,world
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16239,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
16240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
X,Xtest,Y,Ytest=train_test_split(datadf,target,test_size=0.1,random_state=5054)

In [9]:
Y

array([4, 4, 1, ..., 3, 4, 1], dtype=int64)

## 1.Build a random forest for this dataset 
and report the 5-fold cross validation value of the misclassification error. 
Note that you need to train the model by yourself, i.e., how many predictors 
are chosen in each tree and how many trees are used. There is no benchmark. Stop tuning when 
you feel appropriate. Report the best CV error, the corresponding confusion matrix and tuning 
parameters. What are the ten most important keywords based on variable importance?

In [20]:
parameters = {'n_estimators': np.arange(
    20, 100, 10), 'max_features': np.arange(0, 10, 1), 'random_state': 5054}
RF = RandomizedSearchCV(RandomForestClassifier(),
                        param_distributions=parameters, cv=5)
RF.fit(X, Y)
print(RF.best_params_)

{'n_estimators': 80, 'max_features': 3}


In [35]:
RF_best = RandomForestClassifier(**RF.best_params_,oob_score=True,random_state=50)
RF_best.fit(X,Y)
top10=[datadf.columns.values[i] for i in (np.argsort(RF_best.feature_importances_)[-1:-11:-1])]
print("best CV error:",1-cross_val_score(RF_best,X,Y,cv=5).mean()) 
print("confusion matrix:\n",confusion_matrix(RF_best.predict(Xtest),Ytest))
print("tuning parameters:",RF.best_params_)
print("ten most important keywords:\n", top10)

best CV error: 0.18745261998218754
confusion matrix:
 [[418  26  53  23]
 [ 16 265  22  18]
 [ 22  19 145  28]
 [ 25  32  35 478]]
tuning parameters: {'n_estimators': 80, 'max_features': 3}
ten most important keywords:
 ['car', 'windows', 'god', 'christian', 'government', 'team', 'jews', 'religion', 'graphics', 'space']


## 2. Build a boosting tree for this dataset 
and report the 5-fold cross validation value of the 
misclassification error. Similarly, report the best CV error,  the corresponding confusion matrix and 
tuning parameters. 

In [39]:
Y0 = Y-1
Y0test = Ytest-1

booster = GridSearchCV(
    xgb.XGBClassifier(max_depth=5, booster='gbtree', objective='multi:softmax'),
    param_grid={'n_estimators': np.arange(20, 100, 10), 'learning_rate': (0.1, 0.01)},
    cv=5)
booster.fit(X, Y0)
print(booster.best_params_)

KeyboardInterrupt: 

In [None]:
booster_best=xgb.train({**booster.best_params_,'booster':'gbtree','objective':'multi:softmax','max_depth':5,'num_class': 4},xgb.DMatrix(X, label=Y0),100)
print(confusion_matrix(booster_best.predict(xgb.DMatrix(Xtest)),Y0test))