In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# ml models decistion tree
from sklearn.tree import DecisionTreeClassifier

# One of the Ensemle model -- Random Forset which is ensemple of decision trees
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier

# import logistic regression, SVC  for ensembling
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# split the data into test and train
from sklearn.model_selection import train_test_split

# packages for metric for evalaution of the models
from sklearn.metrics import classification_report , accuracy_score
from sklearn import metrics
# to save the models so that we don't need to train the models every time we need to do prediction
from sklearn.externals import joblib



In [2]:
df_numeric = pd.read_csv(r'datasets\1b-crest-after-numeric-preprocessing.csv')
df_text = pd.read_csv(r'datasets\1a-crest-after-text-cleaning.csv')

In [3]:
df_numeric.columns

Index(['collection', 'document_number', 'release_decision',
       'document_page_count', 'sequence_number', 'publication_date',
       'content_type', 'collection_labels', 'publication_day',
       'publication_month', 'publication_year', 'sub_department',
       'sub_department_num', 'check_digit', 'release_decision_num',
       'content_type_num'],
      dtype='object')

In [5]:
df_numeric.head(5)

Unnamed: 0,collection,document_number,release_decision,document_page_count,sequence_number,publication_date,content_type,collection_labels,publication_day,publication_month,publication_year,sub_department,sub_department_num,check_digit,release_decision_num,content_type_num
0,General_CIA_Records,CIA-RDP88G01332R001301470016-9,RIPPUB,3.0,16,1986-10-31,MEMO,0,31,10,1986,RDP,1,9,0.508693,0.064947
1,General_CIA_Records,CIA-RDP94B00280R001200040002-0,RIPPUB,10.0,2,1983-06-22,MEMO,0,22,6,1983,RDP,1,0,0.508693,0.064947
2,General_CIA_Records,CIA-RDP80-00810A002500690001-1,RIPPUB,4.0,1,1953-11-03,REPORT,0,3,11,1953,RDP,1,1,0.508693,0.159347
3,General_CIA_Records,CIA-RDP82-00457R008500360004-9,RIPPUB,2.0,4,1951-08-13,REPORT,0,13,8,1951,RDP,1,9,0.508693,0.159347
4,General_CIA_Records,CIA-RDP70-00211R000100070047-1,RIPPUB,2.0,47,1953-01-05,REPORT,0,5,1,1953,RDP,1,1,0.508693,0.159347


In [4]:
df_text.columns

Index(['title', 'collection', 'collection_labels', 'title_word_count',
       'avg_word_len'],
      dtype='object')

In [6]:
# stack the DataFrames on top of each other/ Merge datasets row -wise 
df1 = df_numeric[['document_page_count', 'sequence_number', 'collection_labels', 'publication_day', 'publication_month', 'publication_year',
       'check_digit', 'release_decision_num', 'content_type_num', 'sub_department_num']]

df2 = df_text[['title_word_count', 'avg_word_len']]

# concat column wise so axis would 1
df = pd.concat([df1, df2], axis=1)
df.shape

(75000, 12)

In [7]:
df.to_csv(r'datasets\5a_CREST_text_and_numeric_features.csv' , index=False)

In [8]:
df.columns

Index(['document_page_count', 'sequence_number', 'collection_labels',
       'publication_day', 'publication_month', 'publication_year',
       'check_digit', 'release_decision_num', 'content_type_num',
       'sub_department_num', 'title_word_count', 'avg_word_len'],
      dtype='object')

Unnamed: 0,document_page_count,sequence_number,collection_labels,publication_day,publication_month,publication_year,check_digit,release_decision_num,content_type_num,sub_department_num,title_word_count,avg_word_len
74995,3.0,13,4,26,10,1993,1,0.508693,0.009067,1,29,6.413793
74996,22.0,1,4,5,1,1982,9,0.059973,0.159347,1,28,8.0
74997,4.0,9,4,3,12,1993,1,0.508693,0.02,1,19,6.631579
74998,3.0,66,4,31,10,1974,6,0.019733,0.057867,0,42,6.357143
74999,44.0,10,4,4,10,1976,2,0.508693,0.089453,1,32,6.9375


In [38]:
    # use resample method from scikit-learn
    from sklearn.utils import resample
   
df_for_production = resample(df, replace=True,    # sample with replacement
                                 n_samples=5,     # to match number of values in each class
                                 random_state=123) # reproducible results

df_for_production.to_csv(r'datasets\CREST_5_samples_for_production.csv' , index=False)

In [11]:
X = df[['document_page_count', 'sequence_number', 'publication_day', 'publication_month', 'publication_year',
       'check_digit', 'release_decision_num', 'content_type_num',
       'sub_department_num', 'title_word_count', 'avg_word_len']]

y = df[['collection_labels']]

In [12]:
X.head(2)

Unnamed: 0,document_page_count,sequence_number,publication_day,publication_month,publication_year,check_digit,release_decision_num,content_type_num,sub_department_num,title_word_count,avg_word_len
0,3.0,16,31,10,1986,9,0.508693,0.064947,1,46,6.434783
1,10.0,2,22,6,1983,0,0.508693,0.064947,1,23,8.782609


In [14]:
y.head(2)

Unnamed: 0,collection_labels
0,0
1,0


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size = 0.2 , random_state = 4)

In [17]:
# let's try how ensemling ( Random forest wit 10 decision trees ) performs 
random_forest = RandomForestClassifier(n_estimators=10)
random_forest.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
# save the  random model
filepath = r'saved-models-joblib\trained-text-and-numeric-features\random_forest_model'
joblib.dump(random_forest ,filepath )
# load model for prediction
random_forest = joblib.load(filepath)

In [20]:
print(" Random Forest Accuracy = ")
print(random_forest.score(X_test, y_test))
print("\n")
y_pred_rf = random_forest.predict(X_test)
print(classification_report(y_test, y_pred_rf, target_names=df_numeric['collection'].unique()))

 Random Forest Accuracy = 
0.9695333333333334


                           precision    recall  f1-score   support

      General_CIA_Records       0.93      0.93      0.93      2995
              NGA_Records       0.94      0.95      0.94      2984
     Scientific_Abstracts       1.00      1.00      1.00      3011
Consolidated_Translations       1.00      1.00      1.00      2990
                     Misc       0.98      0.97      0.98      3020

                micro avg       0.97      0.97      0.97     15000
                macro avg       0.97      0.97      0.97     15000
             weighted avg       0.97      0.97      0.97     15000



In [21]:
# Random Forest is performing better than Decision tree.Ensemling improved precision and F1-score.
# let's try bagging classifier
# We will give 20 Decisions Trees, each with 50% of training dataset and 100% features
bg = BaggingClassifier( DecisionTreeClassifier() , max_samples = 0.5 , max_features= 1.0 , n_estimators=20 )
bg.fit(X_train , y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=20, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [22]:
# save the baggign classifier model
filepath = r'saved-models-joblib\trained-text-and-numeric-features\BaggingClassifier_model'
joblib.dump(bg ,filepath )
# load model for prediction
bg = joblib.load(filepath)

In [24]:
print(" Bagging Classifier Accuracy = ")
print(bg.score(X_test, y_test))
print("\n")

y_pred = bg.predict(X_test)
print(classification_report(y_test, y_pred, target_names=df_numeric['collection'].unique()))

 Bagging Classifier Accuracy = 
0.9712


                           precision    recall  f1-score   support

      General_CIA_Records       0.94      0.92      0.93      2995
              NGA_Records       0.93      0.96      0.95      2984
     Scientific_Abstracts       1.00      1.00      1.00      3011
Consolidated_Translations       1.00      1.00      1.00      2990
                     Misc       0.98      0.98      0.98      3020

                micro avg       0.97      0.97      0.97     15000
                macro avg       0.97      0.97      0.97     15000
             weighted avg       0.97      0.97      0.97     15000



In [25]:
# let's see the performance of BoostingClassifier -ADABoost
# 10 decision trees , learning rate will shrink the contribution of each individual learner 
adaboost = AdaBoostClassifier( DecisionTreeClassifier() , n_estimators= 10 , learning_rate= 1 )
adaboost.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1, n_estimators=10, random_state=None)

In [26]:
# save the adaboost classifier model
filepath = r'saved-models-joblib\trained-text-and-numeric-features\adaboost_model'
joblib.dump(adaboost ,filepath )
# load model for prediction
adaboost = joblib.load(filepath)

In [42]:
print("ADA Boost model Accuracy = ")
print(adaboost.score(X_test , y_test))

print("\n")

y_pred_adaboost = adaboost.predict(X_test)
print(classification_report(y_test , y_pred_adaboost, target_names=df_numeric['collection'].unique()))

ADA Boost model Accuracy = 
0.9626


                           precision    recall  f1-score   support

      General_CIA_Records       0.92      0.90      0.91      2995
              NGA_Records       0.92      0.93      0.93      2984
     Scientific_Abstracts       1.00      1.00      1.00      3011
Consolidated_Translations       1.00      1.00      1.00      2990
                     Misc       0.97      0.98      0.97      3020

                micro avg       0.96      0.96      0.96     15000
                macro avg       0.96      0.96      0.96     15000
             weighted avg       0.96      0.96      0.96     15000



In [44]:
# let's have our own ensembles method

# votoing hard means votes on the labels not on the probabilities
ensemble_vc = VotingClassifier( estimators=[ ('adaboost', adaboost) , 
                                            ('random_forest', random_forest) , 
                                            ('bg', bg)] , voting='hard' )

In [45]:
ensemble_vc.fit(X_train , y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier(estimators=[('adaboost', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_sa...imators=20, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [46]:
# save the custom ensembling classifier model 
filepath = r'saved-models-joblib\trained-text-and-numeric-features\ensemble_voting_classifier_model'
joblib.dump(ensemble_vc ,filepath )
# load model for prediction
ensemble_vc = joblib.load(filepath)

In [47]:
print("Custom Voting Classifier Accuracy = ")
print(ensemble_vc.score(X_test, y_test))
print("\n")

# print the metrices for our ensemmbled model
y_pred_ensemble_vc = ensemble_vc.predict(X_test)
print( classification_report(y_test , y_pred_ensemble_vc, target_names=df_numeric['collection'].unique()))

Custom Voting Classifier Accuracy = 
0.9733333333333334


                           precision    recall  f1-score   support

      General_CIA_Records       0.94      0.93      0.94      2995
              NGA_Records       0.94      0.96      0.95      2984
     Scientific_Abstracts       1.00      1.00      1.00      3011
Consolidated_Translations       1.00      1.00      1.00      2990
                     Misc       0.98      0.98      0.98      3020

                micro avg       0.97      0.97      0.97     15000
                macro avg       0.97      0.97      0.97     15000
             weighted avg       0.97      0.97      0.97     15000

