In [1]:
# in this file we are goona work on the text data, extract features from text and build models
# Now our data cleaning has almost been done. It's time to extract more features :
# 1. n-grams
# 2. tf-idf
# 3. bag of words


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# let's do some moedelling
from sklearn.model_selection import train_test_split, cross_val_score

# packages for metric for evalaution of the models
from sklearn import metrics
from sklearn.metrics import classification_report , accuracy_score, confusion_matrix


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# As tf–idf is very often used for text features, there is also another class called TfidfVectorizer that combines 
# all the options of CountVectorizer and TfidfTransformer in a single model.
# so we can calculate tf-idf and the CountVectorizer in one go with below module
from sklearn.feature_extraction.text import TfidfVectorizer

# use multiple models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

import seaborn as sns



In [2]:
df = pd.read_csv(r'datasets\1a-crest-after-text-cleaning.csv')

In [3]:
df.head(5)

Unnamed: 0,title,collection,collection_labels,title_word_count,avg_word_len
0,briefing office administration fy executive hu...,General_CIA_Records,0,46,6.434783
1,classified committee request richard,General_CIA_Records,0,23,8.782609
2,shipyard present polish school information 2 g...,General_CIA_Records,0,37,7.513514
3,military military military supply depot,General_CIA_Records,0,20,8.2
4,material material material system material mat...,General_CIA_Records,0,19,8.263158


In [9]:
# calculating tf-idf using scikitlearn
# sublinear_df  - is set to True to use a logarithmic form for frequency.
# min_df - is the minimum numbers of documents a word must be present in to be kept.
# norm - is set to l2, to ensure all our feature vectors have a euclidian norm of 1.
# ngram_range -  is set to (1, 2) to indicate that we want to consider both unigrams and bigrams.
# stop_words - is set to "english" to remove all common pronouns ("a", "the", ...) to reduce the number of noisy features.


tfidf = TfidfVectorizer(sublinear_tf=True, min_df=50, norm='l2', encoding='latin-1', ngram_range=(1, 3))

features = tfidf.fit_transform(df.title.values.astype('str')).toarray()
labels = df.collection_labels

In [10]:
features.shape

(75000, 2975)

In [11]:
features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# convert features ndarray into pandas dataframe so that it can be merged with 
#other features like avg_word_count and avg_word_len
features_df = pd.DataFrame(features)

In [26]:
features_df.tail(7)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2965,2966,2967,2968,2969,2970,2971,2972,2973,2974
74993,0.142613,0.0,0.0,0.0,0.0,0.214982,0.21821,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df_word = df[['title_word_count' , 'avg_word_len']]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df_word, labels, random_state = 0)

In [16]:
# let's try how ensemling ( Random forest wit 10 decision trees ) performs 
random_forest = RandomForestClassifier(n_estimators=10)
random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
print("Accuracy = ")
print(random_forest.score(X_test, y_test))
print("\n")
y_pred_rf = random_forest.predict(X_test)
print(classification_report(y_test, y_pred_rf, target_names=df['collection'].unique()))

Accuracy = 
0.6025066666666666


                           precision    recall  f1-score   support

      General_CIA_Records       0.41      0.39      0.40      3652
              NGA_Records       0.45      0.45      0.45      3667
     Scientific_Abstracts       0.60      0.65      0.63      3816
Consolidated_Translations       0.84      0.79      0.81      3808
                     Misc       0.69      0.72      0.71      3807

                micro avg       0.60      0.60      0.60     18750
                macro avg       0.60      0.60      0.60     18750
             weighted avg       0.60      0.60      0.60     18750



In [18]:
df_word.shape

(75000, 2)

In [19]:
type(df_word)

pandas.core.frame.DataFrame

In [20]:
df_word.head(5)

Unnamed: 0,title_word_count,avg_word_len
0,46,6.434783
1,23,8.782609
2,37,7.513514
3,20,8.2
4,19,8.263158


In [21]:
final_features = pd.concat([features_df, df_word], axis=1)

In [22]:
final_features.shape

(75000, 2977)

In [23]:
final_features.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2967,2968,2969,2970,2971,2972,2973,2974,title_word_count,avg_word_len
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46,6.434783
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,8.782609
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37,7.513514
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20,8.2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,8.263158


In [24]:
X_train, X_test, y_train, y_test = train_test_split(final_features, labels, random_state = 0)

In [None]:
# Multinomial Naive Bayes
#clf = MultinomialNB().fit(X_train_tfidf, y_train)
#y_pred = clf.predict(X_test)
#print(classification_report(y_test, y_pred))


In [26]:
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# define cross validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()


model_name
LinearSVC                 0.856560
LogisticRegression        0.851667
MultinomialNB             0.798640
RandomForestClassifier    0.614640
Name: accuracy, dtype: float64

In [27]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.591933
1,RandomForestClassifier,1,0.596267
2,RandomForestClassifier,2,0.643267
3,RandomForestClassifier,3,0.657733
4,RandomForestClassifier,4,0.584
5,LinearSVC,0,0.853
6,LinearSVC,1,0.8588
7,LinearSVC,2,0.859467
8,LinearSVC,3,0.859
9,LinearSVC,4,0.852533


In [None]:
# since LinearSVM is working pretty goood
# let's try different kernals and see which one is the best
models = [
    #LinearSVC(),
    SVC(kernel='poly' , degree= 2),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
]

# define cross validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df.groupby('model_name').accuracy.mean()


In [None]:
cv_df

In [None]:
# it can be seen that LinearSVC and Logistic Regression working better than MultinomialNB and RandomForestClassifier
# let's try boosting and ensembling with top 3 performers

In [7]:
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, 
                                                                                 df.index, test_size=0.25, random_state=0)


In [None]:
# let's have our own ensembles method
# a combination 

# votoing hard means votes on the labels not on the probabilities
ensemble_vc = VotingClassifier( estimators=[ ('adaboost', adaboost) , 
                                            ('random_forest', random_forest) , 
                                            ('bg', bg)] , voting='hard' )

In [None]:
svm_poly_2 = SVC(kernel='poly' , degree= 2)
svm_poly_2.fit(X_train, y_train)

In [None]:
# trying different kinds of SVM, 
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(X_train, y_train)

In [None]:
# trying different kinds of SVM, 
svm_sigmoid = SVC(kernel='sigmoid')
svm_sigmoid.fit(X_train, y_train)

In [9]:
# chosing Linear SVC and working on it
linear_svm = LinearSVC()
linear_svm.fit(X_train, y_train)
#fig, ax = plt.subplots(figsize=(10,10))
#sns.heatmap(conf_mat, annot=True, fmt='d',
#            xticklabels=collection_labels_df.collection.values, yticklabels=collection_labels_df.collection.values)
#plt.ylabel('Actual')
#plt.xlabel('Predicted')
#plt.show()

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [10]:
y_pred = linear_svm.predict(X_test)

#conf_mat = confusion_matrix(y_test, y_pred)


print(metrics.classification_report(y_test, y_pred, 
                                    target_names=df['collection'].unique()))

                           precision    recall  f1-score   support

      General_CIA_Records       0.73      0.79      0.76      3652
              NGA_Records       0.90      0.89      0.89      3667
     Scientific_Abstracts       0.90      0.87      0.88      3816
Consolidated_Translations       0.86      0.89      0.88      3808
                     Misc       0.92      0.86      0.89      3807

              avg / total       0.86      0.86      0.86     18750

