In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

from preproces import text_clean

import pickle
import os

In [2]:
df = pd.read_csv('C:/My folders/Deva paul/AI/SEM 5/356 NLP/NLP Project/New folder/FNS2023_Datasets/English/training/out/train_annotated_label_corrected.csv')
df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary
0,10023,Highlights,37,60,True
1,10023,The Company at a glance,97,88,True
2,10023,Our strategy for growth,185,37,False
3,10023,Chairman’s statement,222,160,True
4,10023,Chief Executive’s report,382,187,True


In [3]:
df.isna().sum()

file_id                  0
toc_section              0
toc_section_pos          0
toc_section_len          0
is_section_in_summary    0
dtype: int64

In [8]:
import nltk

# Download the 'omw-1.4' resource
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Deva\AppData\Roaming\nltk_data...


True

In [9]:
df['toc_section_cleaned'] = df.toc_section.apply(text_clean)
df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned
0,10023,Highlights,37,60,True,highlight
1,10023,The Company at a glance,97,88,True,compani glanc
2,10023,Our strategy for growth,185,37,False,strategi growth
3,10023,Chairman’s statement,222,160,True,chairman statement
4,10023,Chief Executive’s report,382,187,True,chief execut report


In [12]:
df[df['toc_section'] == 'Highlights']['is_section_in_summary'].value_counts()

True    880
Name: is_section_in_summary, dtype: int64

In [11]:
df[df['toc_section_cleaned'] == 'highlight']['is_section_in_summary'].value_counts()

True     1022
False      47
Name: is_section_in_summary, dtype: int64

In [13]:
toc_section_values = df.toc_section_cleaned.unique()
print('Num of sections', len(toc_section_values))
num = 0
for toc_section_value in toc_section_values:
    value_counts = df[df['toc_section_cleaned'] == toc_section_value]['is_section_in_summary'].value_counts()
    try:
        total_counts = value_counts.at[False] + value_counts.at[True]
        majority_value = True
        if value_counts.at[False] > value_counts.at[True]:
            majority_value = False
        
        per_majority = value_counts.at[majority_value]/total_counts
        
        if per_majority >= 0.7:
            df.loc[df['toc_section_cleaned'] == toc_section_value, 'is_section_in_summary'] = majority_value
        
    except:
        pass
    if num%500 ==0:
        print('Processed: ',toc_section_value, ' ',  num+1)
    num = num + 1

Num of sections 8943
Processed:  highlight   1
Processed:  inform   501
Processed:  note form part financi statement   1001
Processed:  st tement tot al recognis   1501
Processed:  balanc sheet sabmil plc   2001
Processed:  unaudit statement net commerci oil ga   2501
Processed:  oper efcienc   3001
Processed:  acquisit complet year   3501
Processed:  lambert smith hampton   4001
Processed:  technolog updat   4501
Processed:  corpor govern continu   5001
Processed:  bodycot   5501
Processed:  c orpor govern   6001
Processed:  ask forc climat relat   6501
Processed:  resolut   7001
Processed:  commit   7501
Processed:  proven robust busi model   8001
Processed:  complementari channelsservic   8501


In [14]:
label_encoder = LabelEncoder()
df.is_section_in_summary = label_encoder.fit_transform(df.is_section_in_summary)
df.head()

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned
0,10023,Highlights,37,60,1,highlight
1,10023,The Company at a glance,97,88,1,compani glanc
2,10023,Our strategy for growth,185,37,0,strategi growth
3,10023,Chairman’s statement,222,160,1,chairman statement
4,10023,Chief Executive’s report,382,187,1,chief execut report


In [19]:
feature_label_list=['toc_section_cleaned', 'toc_section_pos', 'toc_section_len', 'is_section_in_summary']
feature_list=['toc_section_cleaned', 'toc_section_pos', 'toc_section_len']
categorical_cols = ['toc_section_pos', 'toc_section_len']

In [20]:
os.mkdir('model')

In [21]:
pickle.dump(label_encoder, open('model/label_encoder.pkl', 'wb'))

In [22]:
df = df[feature_label_list]
df.head()

Unnamed: 0,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary
0,highlight,37,60,1
1,compani glanc,97,88,1
2,strategi growth,185,37,0
3,chairman statement,222,160,1
4,chief execut report,382,187,1


In [23]:
df.toc_section_cleaned.fillna('missing', inplace=True)

In [24]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df.is_section_in_summary)

In [26]:
train_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

In [28]:
X_train = train_df[feature_list]
y_train = train_df.is_section_in_summary
X_test = test_df[feature_list]
y_test = test_df.is_section_in_summary

In [29]:
tfidf_vectorizer_toc_section = TfidfVectorizer(decode_error='replace', tokenizer=None, encoding='utf-8', dtype=np.float32,
                                    smooth_idf=True, min_df=2, max_features=200000, binary=True, 
                                    stop_words=None, sublinear_tf=False, norm='l2', analyzer='word', max_df=0.5,
                                    lowercase=True, use_idf=False, ngram_range=(1,2))

In [30]:
X_train_tfidf = tfidf_vectorizer_toc_section.fit_transform(X_train.toc_section_cleaned)
X_train_tfidf_columns = [i + '_' + 'toc' for i in tfidf_vectorizer_toc_section.get_feature_names()]
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=X_train_tfidf_columns)



In [33]:
X_train_tfidf

<55120x4545 sparse matrix of type '<class 'numpy.float32'>'
	with 228112 stored elements in Compressed Sparse Row format>

In [34]:
X_train_new = X_train_tfidf_df.copy()
X_train_new['toc_section_pos'] = X_train.toc_section_pos
X_train_new['toc_section_len'] = X_train.toc_section_len
X_train_new.shape

(55120, 4547)

In [35]:
X_train_new

Unnamed: 0,abacu_toc,abacu group_toc,abbrevi_toc,abcam_toc,abcam stori_toc,aberdeen_toc,abridg_toc,abridg group_toc,abridg miner_toc,ac_toc,...,yell_toc,yet_toc,yet effect_toc,yougov_toc,yougov glanc_toc,yougov plc_toc,yule_toc,yule catto_toc,toc_section_pos,toc_section_len
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8960,51
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3054,1468
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,864,443
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3036,19
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,269,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2915,667
55116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1113,149
55117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2437,160
55118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1634,304


In [36]:
pickle.dump(tfidf_vectorizer_toc_section, open('model/tfidf_vectorizer_toc_section.pkl', 'wb'))

In [37]:
clf = LogisticRegression(C=10.0, fit_intercept=True, intercept_scaling=1.0, penalty='l2')
clf.fit(X_train_new, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=10.0, intercept_scaling=1.0)

In [38]:
pickle.dump(clf, open('model/section_classification_model.pkl', 'wb'))

In [39]:
X_test_tfidf = tfidf_vectorizer_toc_section.transform(X_test.toc_section_cleaned)
X_test_tfidf_columns = [i + '_' + 'toc' for i in tfidf_vectorizer_toc_section.get_feature_names()]
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=X_test_tfidf_columns)



In [40]:
X_test_new = X_test_tfidf_df.copy()
X_test_new['toc_section_pos'] = X_test.toc_section_pos
X_test_new['toc_section_len'] = X_test.toc_section_len
y_pred = clf.predict(X_test_new)

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95     11199
           1       0.89      0.66      0.76      2582

    accuracy                           0.92     13781
   macro avg       0.91      0.82      0.86     13781
weighted avg       0.92      0.92      0.92     13781



In [42]:
y_pred_tain = clf.predict(X_train_new)
train_df_predicted = train_df.copy()
train_df_predicted['pred'] = y_pred_tain
y_pred_prob_train = clf.predict_proba(X_train_new)
y_pred_prob_train_df = pd.DataFrame(y_pred_prob_train,columns=label_encoder.classes_)
train_df_predicted['False'] = y_pred_prob_train_df[False]
train_df_predicted['True'] = y_pred_prob_train_df[True]
train_df_predicted

Unnamed: 0,index,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary,pred,False,True
0,43161,group statement cash flow,8960,51,0,0,0.989329,0.010671
1,14871,financi statement,3054,1468,1,0,0.979047,0.020953
2,9197,consolid prot,864,443,0,0,0.970450,0.029550
3,1750,group statement,3036,19,0,0,0.914691,0.085309
4,56608,busi glanc,269,69,1,1,0.273944,0.726056
...,...,...,...,...,...,...,...,...
55115,32495,remuner report,2915,667,0,0,0.984776,0.015224
55116,44938,manag servic,1113,149,1,0,0.730850,0.269150
55117,68603,corpor govern statement,2437,160,0,0,0.983832,0.016168
55118,24630,director report,1634,304,0,0,0.998262,0.001738


In [43]:
y_pred_test = clf.predict(X_test_new)
test_df_predicted = test_df.copy()
test_df_predicted['pred'] = y_pred_test
y_pred_prob_test = clf.predict_proba(X_test_new)
y_pred_prob_test_df = pd.DataFrame(y_pred_prob_test,columns=label_encoder.classes_)
test_df_predicted['False'] = y_pred_prob_test_df[False]
test_df_predicted['True'] = y_pred_prob_test_df[True]
test_df_predicted

Unnamed: 0,index,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary,pred,False,True
0,12217,divers,212,36,0,0,0.757538,0.242462
1,40264,ceo statement,441,54,1,0,0.803030,0.196970
2,3450,year brief,87,68,1,0,0.727340,0.272660
3,29888,financi review,1886,803,0,0,0.981610,0.018390
4,60865,sustain,1121,355,0,0,0.665905,0.334095
...,...,...,...,...,...,...,...,...
13776,68265,group statement cash flow,1415,147,0,0,0.970304,0.029696
13777,57752,director remuner report,959,24,0,0,0.995837,0.004163
13778,21140,north america,1303,310,1,0,0.718165,0.281835
13779,15227,board director,862,37,0,0,0.993679,0.006321


In [44]:
df_predicted = pd.concat([train_df_predicted, test_df_predicted], axis=0)
df_predicted

Unnamed: 0,index,toc_section_cleaned,toc_section_pos,toc_section_len,is_section_in_summary,pred,False,True
0,43161,group statement cash flow,8960,51,0,0,0.989329,0.010671
1,14871,financi statement,3054,1468,1,0,0.979047,0.020953
2,9197,consolid prot,864,443,0,0,0.970450,0.029550
3,1750,group statement,3036,19,0,0,0.914691,0.085309
4,56608,busi glanc,269,69,1,1,0.273944,0.726056
...,...,...,...,...,...,...,...,...
13776,68265,group statement cash flow,1415,147,0,0,0.970304,0.029696
13777,57752,director remuner report,959,24,0,0,0.995837,0.004163
13778,21140,north america,1303,310,1,0,0.718165,0.281835
13779,15227,board director,862,37,0,0,0.993679,0.006321


In [45]:
os.mkdir('out')
pickle.dump(df_predicted, open('out/training_df_predicted.pkl', 'wb'))

In [46]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

In [47]:
y_test_pred_proba = clf.predict_proba(X_test_new)
print('Logistic test roc-auc: {}'.format(roc_auc_score(y_test, y_test_pred_proba[:,1])))
fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba[:,1])
accuracy_ls = []
for thre in thresholds:
    y_pred = np.where(y_test_pred_proba[:,1]>thre,1,0)
    accuracy_ls.append(accuracy_score(y_test, y_pred, normalize=True))

accuracy_ls = pd.concat([pd.Series(thresholds), pd.Series(accuracy_ls)], axis=1)

accuracy_ls.columns = ['threshold', 'accuracy']
accuracy_ls.sort_values(by='accuracy', ascending=False, inplace=True)
accuracy_ls.head()

Logistic test roc-auc: 0.9391880250456687


Unnamed: 0,threshold,accuracy
190,0.707842,0.924679
189,0.708136,0.924679
187,0.708497,0.924679
186,0.709048,0.924679
188,0.708163,0.924606


In [48]:
scores = cross_val_score(clf, X_train_new, y_train, cv=5, verbose=1, n_jobs=-1)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


nan accuracy with a standard deviation of nan


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished
3 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1508, in fit
    X, y = self._validate_data(
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\valida