In [1]:
from config import *
from utils import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

#matrices and vectorization
from sklearn.metrics import f1_score, accuracy_score, classification_report, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

#models 
from sklearn.svm import SVC
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
# from sklearn import column_transformer 
from sklearn.pipeline import Pipeline
import pickle

In [2]:

# Read the seed from the preprocessing config and apply it to NumPy's global RNG.
seed = pre_process_config['seed']
# 10-fold stratified splitter, shuffled with the same seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
np.random.seed(seed)

In [3]:
# Load the dataset CSV from the path configured under list_dir['data_dir'].
df = pd.read_csv(list_dir['data_dir'], encoding='utf-8')
# Inputs are the 'Title_description' column; targets are the columns listed in label_['label_list'].
input_data, targets = df['Title_description'], df[label_['label_list']]



## Data preprocessing steps 
 1. Shuffle the data 
 2. Remove numbers 
 3. Remove punctuation
 4. Remove stop words
 5. Remove parentheses 


In [4]:

# Shuffle inputs and targets with the imported shuffle_data helper
# (presumably keeps the two aligned — confirm in its definition).
shuffled_pair = shuffle_data(input_data, targets)
input_data, targets = shuffled_pair


In [5]:
# Peek at one sample (key 55) before the cleaning steps are applied.
input_data[55]

'বোনাস দিতে পারবে আমরা টেকনোলজিস: পুঁজিবাজারে আইটি খাতে তালিকাভুক্ত কোম্পানি আমরা টেকনোলজিসের ঘোষিত বোনাস লভ্যাংশ প্রদানে সম্মতি দিয়েছে নিয়ন্ত্রক সংস্থা বাংলাদেশ সিকিউরিটিজ অ্যান্ড এক্সচেঞ্জ কমিশন (বিএসইসি)।ঢাকা স্টক এক্সচেঞ্জ (ডিএসই) সূত্রে এ তথ্য জানা গেছে।তথ্য মতে, আমরা টেকনোলজিসের ২০২২ সালের ৩০ জুনে সমাপ্ত হিসাব বছরের আর্থিক প্রতিবেদন পর্যালোচনা করে শেয়ারহোল্ডারদের জন্য ৬ শতাংশ বোনাস লভ্যাংশ ঘোষণা করে।আইন অনুযায়ী বিএসইসির অনুমোদন ছাড়া কোনো কোম্পানি বোনাস শেয়ার ইস্যু করতে পারবে না। তাই বোনাস শেয়ার ঘোষণার পর এই কোম্পানিটি তা বিতরণে নিয়ন্ত্রক সংস্থার সম্মতির জন্য আবেদন করে। নিয়ন্ত্রক সংস্থা কোম্পানিটিকে বোনাস লভ্যাংশ বিতরণে সম্মতি দিয়েছে।'

In [6]:
# The config value is split on single spaces to obtain the individual stop words.
list_stopwords = pre_process_config['stop_word']
main_stopwords = list_stopwords.split(sep=" ")

In [7]:
#remove stopwords
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `main_stopwords` removed.

    Tokens are produced by ``str.split()`` and the survivors are re-joined with
    single spaces, so runs of whitespace collapse as a side effect.

    Parameters
    ----------
    text : str
        Document to filter.

    Returns
    -------
    str
        The filtered document.
    """
    # Build a set once per call so each token lookup is O(1) instead of a
    # linear scan over the whole stop-word list.
    stopword_set = set(main_stopwords)
    kept_words = [word for word in text.split() if word not in stopword_set]
    return ' '.join(kept_words)

# Text-cleaning pipeline: every helper is applied to each document, strictly in
# the order listed here (later steps see the output of earlier ones).
cleaning_steps = (
    lemmatize_text,
    remove_prone,
    remove_numbers,
    remove_english_words,
    remove_stopwords,
    remove_perenthesis,
    remove_dash,
    remove_extra_spaces,
    remove_3rdbreket,
    remove_symbols,
)
for cleaning_step in cleaning_steps:
    input_data = input_data.apply(cleaning_step)




In [8]:
def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    `scores` must expose ``.mean()`` and ``.std()`` (e.g. the array returned by
    ``cross_val_score``). Nothing is returned.
    """
    report_lines = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in report_lines:
        print(label, value)

In [9]:

# Two-stage hold-out: 20% of the data becomes the test split, then 20% of the
# remainder becomes the validation split (both use random_state=0).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    input_data, targets, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=0
)

# Fit TF-IDF on the training texts only, then reuse the fitted vectorizer to
# transform the test and validation texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test, X_val = vectorizer.transform(X_test), vectorizer.transform(X_val)




# Model selection with cross-validation 

In [None]:

# Baseline: one-vs-rest linear SVM scored with 10-fold cross-validation.
# LinearSVC is accessed through the already-imported `svm` module: no visible
# import line brings the bare name `LinearSVC` into scope, so the original call
# raised a NameError on a fresh kernel.
clf = OneVsRestClassifier(svm.LinearSVC(random_state=0))
#make 10 fold cross validation 
display_scores(cross_val_score(clf, X_train, y_train, cv=10, scoring="accuracy"))

'''
    Scores: [0.79620853 0.81042654 0.79810427 0.79336493 0.81119545 0.79791271
    0.79886148 0.79316888 0.80929791 0.82827324]
    Mean: 0.80368139428222
    Standard deviation: 0.010481708200214835 
'''

In [10]:
#svm classifier
# One-vs-rest SVC with a linear kernel and C=5. gamma is kept from the original
# call; it is a coefficient for the rbf/poly/sigmoid kernels and is not used by
# kernel='linear'.
clf = OneVsRestClassifier(svm.SVC(kernel='linear', C=5, gamma=1))
#make 10 fold cross validation
# display_scores() only prints and returns None, so its result is not assigned
# back to `clf` (the original assignment replaced the estimator with None).
display_scores(cross_val_score(clf, X_train, y_train, cv=10, scoring="accuracy"))

'''
    Scores: [0.80473934 0.81516588 0.80379147 0.80663507 0.82637571 0.80929791
    0.80550285 0.80265655 0.81499051 0.84345351]
    Mean: 0.8132608793402699
    Standard deviation: 0.012185016433768013
    
'''


Scores: [0.87677725 0.86255924 0.87582938 0.86635071 0.88614801 0.89184061
 0.85673624 0.89278937 0.86907021 0.88709677]
Mean: 0.8765197802098978
Standard deviation: 0.01204050243526496


'\n    Scores: [0.80473934 0.81516588 0.80379147 0.80663507 0.82637571 0.80929791\n    0.80550285 0.80265655 0.81499051 0.84345351]\n    Mean: 0.8132608793402699\n    Standard deviation: 0.012185016433768013\n    \n'

In [9]:
#dt classifier 
# One-vs-rest decision tree with a fixed random_state for repeatable splits.
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
#10 fold cross validation
# display_scores() only prints and returns None, so its result is not assigned
# back to `clf` (the original assignment replaced the estimator with None).
display_scores(cross_val_score(clf, X_train, y_train, cv=10, scoring="accuracy"))
'''
    Scores: [0.83033175 0.81706161 0.82748815 0.8        0.82352941 0.83206831
    0.80645161 0.80075901 0.80929791 0.84535104]
    Mean: 0.8192338822090524
    Standard deviation: 0.014264177653053867

'''


Scores: [0.88815166 0.90521327 0.8957346  0.90616114 0.90702087 0.89278937
 0.88709677 0.89658444 0.88804554 0.88994307]
Mean: 0.8956740739408439
Standard deviation: 0.007483810624972298


'\n    Scores: [0.83033175 0.81706161 0.82748815 0.8        0.82352941 0.83206831\n    0.80645161 0.80075901 0.80929791 0.84535104]\n    Mean: 0.8192338822090524\n    Standard deviation: 0.014264177653053867\n\n'

In [10]:
#logistic regression classifier 
# Wrap a seeded LogisticRegression in one-vs-rest.
base_model = LogisticRegression(random_state=0)
clf = OneVsRestClassifier(base_model)
# Accuracy over 10 cross-validation folds of the training split.
fold_scores = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=10)
display_scores(fold_scores)

'''
    Scores: [0.75165877 0.75924171 0.75734597 0.73080569 0.74383302 0.73529412
    0.73339658 0.73529412 0.72106262 0.76375712]
    Mean: 0.7431689703858917
    Standard deviation: 0.01347878794175505

'''

Scores: [0.78862559 0.7971564  0.78862559 0.78957346 0.80645161 0.82163188
 0.77324478 0.81214421 0.80550285 0.81878558]
Mean: 0.8001741953469969
Standard deviation: 0.014599020633185453


'\n    Scores: [0.75165877 0.75924171 0.75734597 0.73080569 0.74383302 0.73529412\n    0.73339658 0.73529412 0.72106262 0.76375712]\n    Mean: 0.7431689703858917\n    Standard deviation: 0.01347878794175505\n\n'

In [None]:
#naive bayes classifier 
# Multinomial naive Bayes (default settings) wrapped in one-vs-rest.
base_model = MultinomialNB()
clf = OneVsRestClassifier(base_model)
# Accuracy over 10 cross-validation folds of the training split.
fold_scores = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=10)
display_scores(fold_scores)

'''
    Scores: [0.50521327 0.5478673  0.53175355 0.50047393 0.54079696 0.5370019
    0.51612903 0.51328273 0.49240987 0.53605313]
    Mean: 0.5220981681160463
    Standard deviation: 0.018086532757600096

'''


In [None]:
#knn classifier 
# 3-nearest-neighbours model wrapped in one-vs-rest.
base_model = KNeighborsClassifier(n_neighbors=3)
clf = OneVsRestClassifier(base_model)
# Accuracy over 10 cross-validation folds of the training split.
fold_scores = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=10)
display_scores(fold_scores)

'''
    Scores: [0.72890995 0.75165877 0.72511848 0.71279621 0.74762808 0.72296015
    0.72390892 0.71157495 0.71157495 0.75142315]
    Mean: 0.7287553621050928
    Standard deviation: 0.01519843741387436

'''


In [None]:
#gradient boosting classifier 
# Seeded gradient boosting model wrapped in one-vs-rest.
base_model = GradientBoostingClassifier(random_state=0)
clf = OneVsRestClassifier(base_model)
# Accuracy over 10 cross-validation folds of the training split.
fold_scores = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=10)
display_scores(fold_scores)


'''
    Scores: [0.83696682 0.83127962 0.82559242 0.82369668 0.82827324 0.83965844
    0.81024668 0.81119545 0.81688805 0.85009488]
    Mean: 0.8273892281266579
    Standard deviation: 0.01207050517222523
'''



In [None]:
#make adaboost classifier 
# Seeded AdaBoost model wrapped in one-vs-rest.
base_model = AdaBoostClassifier(random_state=0)
clf = OneVsRestClassifier(base_model)
# Accuracy over 10 cross-validation folds of the training split.
fold_scores = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=10)
display_scores(fold_scores)
'''
    Scores: [0.80947867 0.7943128  0.78672986 0.77156398 0.80455408 0.79222011
    0.79411765 0.78842505 0.79601518 0.80740038]
    Mean: 0.7944817755874709
    Standard deviation: 0.01059659213112092

'''

#### summary 

##### Models evaluated 
1. Logistic Regression, 2. XGBoost, 3. SVM, 4. Naive Bayes, 5. KNN, 6. Decision Tree, 7. AdaBoost, 8. Gradient Boosting, 9. Voting Classifier 

Models were evaluated on accuracy. 

Best-performing models (accuracy >= 0.8): 1. Gradient Boosting, 2. SVM, 3. Decision Tree, 4. AdaBoost 
 


# Hyperparameter tuning 


In [None]:
# Decision tree hyper-parameter search: exhaustive grid, 10-fold CV.
from sklearn.model_selection import GridSearchCV
clf = DecisionTreeClassifier(random_state=0)
param_grid = [
    {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': list(range(2, 21)),  # every depth from 2 through 20
    }
]
#make grid search
CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
CV_clf.fit(X_train, y_train)


#best parameters 
#print(CV_clf.best_params_) -->DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=0)


In [11]:
# Refit a one-vs-rest decision tree with the settings recorded from the grid search.
tuned_tree = DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=0)
dt_clf_best = OneVsRestClassifier(tuned_tree)
dt_clf_best.fit(X_train, y_train)

OneVsRestClassifier(estimator=DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=20,
                                                     random_state=0))

In [14]:
#store the best model ckpt 
import joblib
# Checkpoint location for the tuned decision tree model.
dt_ckpt_path = r'C:\Users\Amzad\Desktop\news_classifcation\project\logs\model_ckpts\dt_clf_best.pkl'
joblib.dump(dt_clf_best, dt_ckpt_path)


['C:\\Users\\Amzad\\Desktop\\news_classifcation\\project\\logs\\model_ckpts\\dt_clf_best.pkl']

In [None]:
#make predictions
# Predict on the test split and label the output columns like `targets`.
test_predictions = dt_clf_best.predict(X_test)
y_pred = pd.DataFrame(test_predictions, columns=targets.columns)
y_pred.head()

#save predictions

In [None]:
# Grid search over RBF-kernel SVC settings for the one-vs-rest SVM.
svm_clf = OneVsRestClassifier(SVC())
param_grid = {
    'estimator__C': [0.1, 1, 10, 100, 1000],
    'estimator__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'estimator__kernel': ['rbf'],
    'estimator__class_weight': ['balanced', None],
}

#make grid search
# 5-fold search; n_jobs=-1 asks for all available processors.
svm_clf_best = GridSearchCV(estimator=svm_clf, param_grid=param_grid, cv=5, n_jobs=-1)
svm_clf_best.fit(X_train, y_train)


'''
{'estimator__C': 10, 'estimator__gamma': 1, 'estimator__kernel': 'rbf'}
0.8566952151392379
OneVsRestClassifier(estimator=SVC(C=10, gamma=1))

'''

In [15]:
#train with best parameters 
# One-vs-rest SVC using the C / gamma values recorded from the grid search.
tuned_svc = SVC(C=10, gamma=1)
svm_clf_best = OneVsRestClassifier(estimator=tuned_svc)
svm_clf_best.fit(X_train, y_train)

OneVsRestClassifier(estimator=SVC(C=10, gamma=1))

In [16]:
# Checkpoint location for the tuned SVM model.
svm_ckpt_path = r'C:\Users\Amzad\Desktop\news_classifcation\project\logs\model_ckpts\svm_clf_best.pkl'
joblib.dump(svm_clf_best, svm_ckpt_path)

['C:\\Users\\Amzad\\Desktop\\news_classifcation\\project\\logs\\model_ckpts\\svm_clf_best.pkl']

In [None]:
from sklearn.model_selection import GridSearchCV 
# One-vs-rest AdaBoost; the seed is fixed on the base estimator.
ada_clf = OneVsRestClassifier(AdaBoostClassifier(random_state=0))
# random_state is intentionally NOT part of the grid: it is a seed, not a
# hyper-parameter. Searching ten seeds multiplied the 24-combination grid by 10
# and picked a "best" model partly on seed noise.
# NOTE(review): newer scikit-learn releases deprecate the 'SAMME.R' algorithm —
# confirm against the installed version.
param_grid = [{'estimator__n_estimators': [50, 100, 200, 300],
               'estimator__learning_rate': [0.1, 0.2, 0.3],
               'estimator__algorithm': ['SAMME', 'SAMME.R'],
               }]

#make grid search

gs_clf = GridSearchCV(ada_clf, param_grid, cv=4, n_jobs=-1)
gs_clf.fit(X_train, y_train)
''' best parameters
OneVsRestClassifier(estimator=AdaBoostClassifier(learning_rate=0.3,
                                                 n_estimators=300,
                                                 random_state=0))

'''




In [17]:
#train with best parameters 
# One-vs-rest AdaBoost using the settings recorded from the grid search.
tuned_ada = AdaBoostClassifier(learning_rate=0.3, n_estimators=300, random_state=0)
gs_clf = OneVsRestClassifier(tuned_ada)
gs_clf.fit(X_train, y_train)



OneVsRestClassifier(estimator=AdaBoostClassifier(learning_rate=0.3,
                                                 n_estimators=300,
                                                 random_state=0))

In [18]:
# Checkpoint location for the tuned AdaBoost model (held in `gs_clf`).
ada_ckpt_path = r'C:\Users\Amzad\Desktop\news_classifcation\project\logs\model_ckpts\gs_clf_best.pkl'
joblib.dump(gs_clf, ada_ckpt_path)

['C:\\Users\\Amzad\\Desktop\\news_classifcation\\project\\logs\\model_ckpts\\gs_clf_best.pkl']

In [None]:
#Make gradient boosting classifier with best parameters
'''
this model takes a long time to train, so tune it once with small set of parameters, and then never tune it again

'''
gb_clf = OneVsRestClassifier(GradientBoostingClassifier(random_state=0))
param_grid = [{
    'estimator__n_estimators': [300, 400, 500],
    'estimator__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
}]

#make grid search
# The search object gets its own name. The original bound it to `gs_clf`, which
# already holds the fitted AdaBoost model that the voting cells further down
# pass in as their 'ada' member; running the notebook top to bottom silently
# swapped that model for this gradient boosting search.
gb_grid_search = GridSearchCV(gb_clf, param_grid, cv=5, n_jobs=-1)
gb_grid_search.fit(X_train, y_train)

#never tune it again 
#best parameters are {'estimator__max_depth': 10, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 500}
# NOTE(review): min_samples_leaf / min_samples_split are listed in the recorded
# result but are not in the grid above — presumably from an earlier, wider grid.


In [19]:
#train with best parameters 
# One-vs-rest gradient boosting using the settings recorded from the grid search.
tuned_gb = GradientBoostingClassifier(
    random_state=0,
    max_depth=10,
    n_estimators=500,
    min_samples_leaf=1,
    min_samples_split=2,
)
gb_clf_best = OneVsRestClassifier(tuned_gb)
gb_clf_best.fit(X_train, y_train)

OneVsRestClassifier(estimator=GradientBoostingClassifier(max_depth=10,
                                                         n_estimators=500,
                                                         random_state=0))

In [20]:
# Checkpoint location for the tuned gradient boosting model.
gb_ckpt_path = r'C:\Users\Amzad\Desktop\news_classifcation\project\logs\model_ckpts\gb_clf_best.pkl'
joblib.dump(gb_clf_best, gb_ckpt_path)

['C:\\Users\\Amzad\\Desktop\\news_classifcation\\project\\logs\\model_ckpts\\gb_clf_best.pkl']

In [None]:
#make voting classifier with best parameters 
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier

# Soft voting averages each member's predict_proba() output, but SVC only
# exposes predict_proba when constructed with probability=True. The tuned SVM
# was built without it, so the ensemble would fit and then fail on predict.
# Vote with an unfitted copy of the tuned SVM that has probability enabled;
# svm_clf_best itself is left untouched. (VotingClassifier refits clones of
# its members during fit, so passing an unfitted copy loses nothing.)
svm_for_voting = clone(svm_clf_best).set_params(estimator__probability=True)

voting_members = [
    ('svm', svm_for_voting),
    ('dt', dt_clf_best),
    ('gb', gb_clf_best),
    ('ada', gs_clf),
]
voting_clf = OneVsRestClassifier(VotingClassifier(estimators=voting_members, voting='soft'))
voting_clf.fit(X_train, y_train)
#make predictions on validation set 

In [None]:
# dump the model to ml_model_ckpt_dir 
import pickle
# The path must be a raw string: in a normal literal the backslash-U of
# "C:\Users" starts a unicode escape and the whole cell fails to compile.
voting_ckpt_path = r'C:\Users\Amzad\Desktop\news_classifcation\project\logs\model_ckpts\voting_clf.pkl'
# The context manager flushes and closes the file handle even if pickling fails.
with open(voting_ckpt_path, 'wb') as ckpt_file:
    pickle.dump(voting_clf, ckpt_file)

In [None]:
from utils import voting_with_model 

# The four tuned models handed to voting_with_model (a fresh list per call).
voting_members = (svm_clf_best, dt_clf_best, gb_clf_best, gs_clf)

#make predictions on validation set
# Spot check on the row at position 45 of the validation split.
single_vote = voting_with_model(list(voting_members), X_val[45])
y_pred = pd.DataFrame(single_vote, columns=targets.columns)

#compare with actual labels
print(y_val.iloc[45])
print(y_pred)

#make predictions on validation set
# Same call on the whole validation split; this overwrites the single-row y_pred.
validation_votes = voting_with_model(list(voting_members), X_val)
y_pred = pd.DataFrame(validation_votes, columns=targets.columns)

#evaluate the model 


In [None]:
#test on test set    
# Run the four tuned models through voting_with_model on the test split.
test_votes = voting_with_model([svm_clf_best, dt_clf_best, gb_clf_best, gs_clf], X_test)
y_pred = pd.DataFrame(test_votes, columns=targets.columns)
y_pred.head()

#print the accuracy function 
