In [32]:
import numpy as np
import pandas as pd
import re
import spacy
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy `en_core_web_lg` pipeline once; `extract_features` calls it to
# obtain POS tags, named entities and sentence boundaries for each document.
nlp = spacy.load('en_core_web_lg')


In [28]:
def extract_features(text):
  """Compute count-based linguistic features for a single document.

  The text is parsed with the module-level spaCy pipeline ``nlp``; URLs are
  counted with a regular expression on the raw string.

  Args:
    text: Document text. ``None`` or a float NaN (what pandas yields for an
      empty CSV cell) is treated as a document without content and produces
      an all-zero record without invoking ``nlp``. Any other non-string value
      is converted with ``str()`` before parsing.

  Returns:
    dict with the keys ``num_tokens``, ``num_nouns``, ``num_verbs``,
    ``num_adjs``, ``num_entities``, ``num_person``, ``num_org``, ``num_gpe``,
    ``num_sentences``, ``avg_sentence_length`` and ``num_urls`` (in that
    order). ``avg_sentence_length`` is the mean number of tokens per sentence,
    or 0 when the document has no sentences.
  """
  feature_names = (
    "num_tokens", "num_nouns", "num_verbs", "num_adjs", "num_entities",
    "num_person", "num_org", "num_gpe", "num_sentences",
    "avg_sentence_length", "num_urls",
  )

  # Missing values would otherwise crash inside nlp() / re.findall().
  if text is None or (isinstance(text, float) and np.isnan(text)):
    return dict.fromkeys(feature_names, 0)
  if not isinstance(text, str):
    text = str(text)

  doc = nlp(text)

  # doc.sents is a generator: materialise it once and reuse it for both the
  # sentence count and the average sentence length.
  sentences = list(doc.sents)
  num_sentences = len(sentences)
  avg_sentence_length = np.mean([len(sent) for sent in sentences]) if num_sentences > 0 else 0

  return {
    "num_tokens": len(doc),
    "num_nouns": sum(1 for token in doc if token.pos_ == 'NOUN'),
    "num_verbs": sum(1 for token in doc if token.pos_ == 'VERB'),
    "num_adjs": sum(1 for token in doc if token.pos_ == 'ADJ'),
    "num_entities": len(doc.ents),
    "num_person": sum(1 for ent in doc.ents if ent.label_ == 'PERSON'),
    "num_org": sum(1 for ent in doc.ents if ent.label_ == 'ORG'),
    "num_gpe": sum(1 for ent in doc.ents if ent.label_ == 'GPE'),
    "num_sentences": num_sentences,
    "avg_sentence_length": avg_sentence_length,
    "num_urls": len(re.findall(r'http\S+|www\S+', text)),
  }

In [49]:
# Corpus table: source of the `bclass` / `mclass` target columns.
df_model = pd.read_csv('../data/data_2_after_tokenization_lemmatization.csv')
y_bclass, y_mclass = df_model['bclass'], df_model['mclass']

# Pre-computed unigram feature matrix
# (presumably row-aligned with df_model — TODO confirm).
X = pd.read_csv('../data/X_features_unigrams.csv')

In [50]:
# Stratified 70/30 hold-out split of the unigram features for the `bclass` target.
X_train, X_test, y_train_bclass, y_test_bclass = train_test_split(
    X,
    y_bclass,
    test_size=0.3,
    stratify=y_bclass,
    random_state=197,
)

In [51]:
# Unigram model: standardise -> PCA -> logistic regression.
# PCA is seeded (same seed as the train/test split) because its default 'auto'
# solver may select the randomized SVD, which would otherwise make the
# cross-validation scores and the selected n_components vary between runs.
logreg_pca_clf = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA(random_state=197)),
                           ('logreg', LogisticRegression())])

# Search over the number of retained components and the inverse regularisation
# strength C; penalty, solver and iteration budget are held fixed.
logreg_pca_clf_param_grid = {
  'pca__n_components': list(range(165, 176)),
  'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100],
  'logreg__penalty': ['l2'],
  'logreg__solver': ['lbfgs'],
  'logreg__max_iter': [2000]
}

# 5-fold cross-validation scored by ROC AUC, using all available cores.
logreg_pca_clf_grid_search = GridSearchCV(logreg_pca_clf,
                                          logreg_pca_clf_param_grid,
                                          cv=5,
                                          scoring='roc_auc',
                                          n_jobs=-1)

logreg_pca_clf_grid_search.fit(X_train, y_train_bclass)

In [52]:
# Hyper-parameter combination selected by the unigram grid search.
logreg_pca_clf_grid_search.best_params_

{'logreg__C': 0.001,
 'logreg__max_iter': 2000,
 'logreg__penalty': 'l2',
 'logreg__solver': 'lbfgs',
 'pca__n_components': 174}

In [53]:
# Mean 5-fold cross-validated ROC AUC of the selected unigram configuration.
logreg_pca_clf_grid_search.best_score_

np.float64(0.711857153184414)

In [36]:
# Pre-computed bigram feature matrix
# (presumably row-aligned with df_model / y_bclass — TODO confirm).
X_bigrams = pd.read_csv('../data/X_features_bigrams.csv')

In [37]:
# Stratified 70/30 hold-out split of the bigram features, using the same
# test_size, stratification target and random_state as the unigram split.
# Note: this re-assigns y_train_bclass / y_test_bclass.
X_train_bigrams, X_test_bigrams, y_train_bclass, y_test_bclass = train_test_split(
    X_bigrams,
    y_bclass,
    test_size=0.3,
    stratify=y_bclass,
    random_state=197,
)

In [41]:
# Bigram model: standardise -> PCA -> logistic regression.
# PCA is seeded (same seed as the train/test split) because its default 'auto'
# solver may select the randomized SVD, which would otherwise make the
# cross-validation scores and the selected n_components vary between runs.
logreg_pca_bigrams = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA(random_state=197)),
                           ('logreg', LogisticRegression())])

# Only the number of PCA components is searched here; C is fixed at 0.001,
# the value selected by the unigram grid search.
logreg_pca_bigrams_param_grid = {
  'pca__n_components': [420, 430, 440, 450],
  'logreg__C': [0.001],
  'logreg__penalty': ['l2'],
  'logreg__solver': ['lbfgs'],
  'logreg__max_iter': [2000]
}

# 5-fold cross-validation scored by ROC AUC, using all available cores.
logreg_pca_bigrams_grid_search = GridSearchCV(logreg_pca_bigrams,
                                          logreg_pca_bigrams_param_grid,
                                          cv=5,
                                          scoring='roc_auc',
                                          n_jobs=-1)

logreg_pca_bigrams_grid_search.fit(X_train_bigrams, y_train_bclass)

In [42]:
# Mean 5-fold cross-validated ROC AUC of the selected bigram configuration.
logreg_pca_bigrams_grid_search.best_score_

np.float64(0.6988458639845219)

In [43]:
# Run the spaCy-based extractor on every document; the result is a Series
# holding one feature dict per row of df_model.
df_numerical_features = df_model['contents'].map(extract_features)

In [48]:
# Expand the Series of per-document feature dicts into one column per feature.
# The isinstance guard keeps this cell idempotent: re-running it after the
# conversion has already happened is a no-op instead of an AttributeError.
if isinstance(df_numerical_features, pd.Series):
    df_numerical_features = pd.DataFrame(df_numerical_features.tolist(),
                                         index=df_numerical_features.index)
df_numerical_features.columns

Index(['num_tokens', 'num_nouns', 'num_verbs', 'num_adjs', 'num_entities',
       'num_person', 'num_org', 'num_gpe', 'num_sentences',
       'avg_sentence_length', 'num_urls'],
      dtype='object')

In [61]:
# get the log-odds of 174 features after pca
logreg_unigrams_best = logreg_pca_clf_grid_search.best_estimator_
log_odds = logreg_unigrams_best.decision_function(X)
df_log_odds = pd.DataFrame(log_odds, columns=['log_odds'])

# get pca of unigrams
pca_unigram = PCA(n_components=174)
X_unigram_pca = pca_unigram.fit_transform(X)
df_pca_unigrams = pd.DataFrame(X_unigram_pca, columns=[f'pca_unigram_{i}' for i in range(X_unigram_pca.shape[1])])

# get pca of bigrams
pca_bigrams = PCA(n_components=420)
X_bigrams_pca = pca_bigrams.fit_transform(X_bigrams)
df_pca_bigrams = pd.DataFrame(X_bigrams_pca, columns=[f'pca_bigrams_{i}' for i in range(X_bigrams_pca.shape[1])])

# concat all features
X_final = pd.concat([df_log_odds, df_numerical_features, df_pca_unigrams, df_pca_bigrams], axis=1)

In [62]:
# Preview the first rows of the combined feature matrix.
X_final.head()

Unnamed: 0,log_odds,num_tokens,num_nouns,num_verbs,num_adjs,num_entities,num_person,num_org,num_gpe,num_sentences,...,pca_bigrams_410,pca_bigrams_411,pca_bigrams_412,pca_bigrams_413,pca_bigrams_414,pca_bigrams_415,pca_bigrams_416,pca_bigrams_417,pca_bigrams_418,pca_bigrams_419
0,1.485371,1352,204,132,54,122,45,19,13,46,...,0.008315,-0.011338,-0.017934,0.008206,-0.011437,-0.018298,0.003874,0.002859,-0.009669,0.00121
1,1.242634,264,51,43,17,13,4,3,0,9,...,-0.002698,0.006969,0.00066,-0.003778,0.000522,0.005129,0.001642,-0.002138,-0.00076,-0.001706
2,0.897688,149,18,11,9,21,0,1,11,6,...,0.014958,-0.009567,0.004271,0.008919,-0.004607,0.005887,-0.000525,-0.015707,-0.002877,-0.004151
3,2.461021,631,162,76,30,23,4,8,3,17,...,0.001984,-0.006148,0.008776,0.009879,0.003081,0.008282,0.007659,-0.002245,-0.003031,0.00253
4,1.979074,298,66,31,13,14,3,1,5,8,...,-0.007754,-0.010576,-0.033742,-0.001999,0.010815,0.007058,0.01238,-0.009616,-0.002946,-0.011945


In [63]:
# Persist the combined feature matrix; the index is not written to the file.
X_final.to_csv('../data/X_features_final.csv', index=False)