In [32]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import regex
from scripts.jaxon_scripts.preprocessing import parse_data, extract_paragraphs, extract_headers
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

nlp = spacy.load('en_core_web_lg')
nlp.max_length = 1500000



In [4]:
def extract_headers_paragraphs(df):
    df["soup"] = df["text_tmp"].apply(lambda x: BeautifulSoup(x, "html.parser"))
    df["headers"] = df["soup"].apply(extract_headers)
    df["paragraphs"] = df["soup"].apply(extract_paragraphs)
    df["text_clean"] = df.apply(
        lambda row: f"{''.join(row['headers'])} \n {''.join(row['paragraphs'])} \n {row['text_clean']}",
        axis=1,
    )
    df = df.drop(columns=['headers', 'paragraphs', 'soup', 'Unnamed: 0'])
    return df

def clean_text(text):
  text = text.lower() # lower all the texts

  # text = re.sub(r'http\S+|www.\S+', '', text) # remove all the links

  doc = nlp(text)
  cleaned_tokens = []

  for token in doc:
    if not token.is_punct and not token.is_stop: # remove punctuation and stop words
      cleaned_tokens.append(token.lemma_) # lemmatization

  return ' '.join(cleaned_tokens)

def get_doc_vector(text):
  doc = nlp(text)
  return doc.vector

def tfidf_unigram_vectorization(df):
  df['doc_vector'] = df['text_clean'].apply(get_doc_vector) # use spacy to vectorize contents
  df_doc_vectors = pd.DataFrame(df['doc_vector'].tolist())
  df_doc_vectors.reset_index(drop=True, inplace=True)

  tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
  tfidf_matrix = tfidf.fit_transform(df['text_clean']) # vectorize the texts
  df_tfidf = pd.DataFrame(tfidf_matrix.toarray(),
                          columns=tfidf.get_feature_names_out()) # construct a new df for features extracted using tfidf
  df_tfidf.reset_index(drop=True, inplace=True)

  df_unigram = pd.concat([df_tfidf, df_doc_vectors], axis=1) # combine spacy vectors and tfidf vectors

  return df_unigram

def tfidf_bigrams_vectorization(df):
  # construct Tf Idf model with maximum 5000 features, capture only bigrams
  tfidf = TfidfVectorizer(ngram_range=(2, 2), max_features=10000)
  tfidf_matrix = tfidf.fit_transform(df['text_clean'])
  df_bigram = pd.DataFrame(tfidf_matrix.toarray(),
                          columns=tfidf.get_feature_names_out()) # construct a new df for features extracted using bigram
  
  return df_bigram

def extract_features(text):
  doc = nlp(text)

  num_tokens = len(doc)
  num_nouns = sum(1 for token in doc if token.pos_ == 'NOUN')
  num_verbs = sum(1 for token in doc if token.pos_ == 'VERB')
  num_adjs = sum(1 for token in doc if token.pos_ == 'ADJ')
  num_entities = len(doc.ents)
  num_person = sum(1 for ent in doc.ents if ent.label_ == 'PERSON')
  num_org = sum(1 for ent in doc.ents if ent.label_ == 'ORG')
  num_gpe = sum(1 for ent in doc.ents if ent.label_ == 'GPE')
  num_sentences = len(list(doc.sents))
  avg_sentence_length = np.mean([len(sent) for sent in doc.sents]) if num_sentences > 0 else 0
  num_urls = len(re.findall(r'http\S+|www\S+', text))

  return {
        "num_tokens": num_tokens,
        "num_nouns": num_nouns,
        "num_verbs": num_verbs,
        "num_adjs": num_adjs,
        "num_entities": num_entities,
        "num_person": num_person,
        "num_org": num_org,
        "num_gpe": num_gpe,
        "num_sentences": num_sentences,
        "avg_sentence_length": avg_sentence_length,
        "num_urls": num_urls
    }

In [17]:
### Preprocessing Script
df_raw = pd.read_csv('../data/claims_clean.csv') # read data

# df_clean = parse_data(df_raw) # extract contents from html
df_clean = extract_headers_paragraphs(df_raw) # extract headers and paragraphs


In [18]:
### Tokenization & Lemmatization
df_clean['text_clean'] = df_clean['text_clean'].apply(clean_text) 

In [9]:
### Vectorization
df_unigrams = tfidf_unigram_vectorization(df_clean) # tfidf: Unigrams
df_bigrams = tfidf_unigram_vectorization(df_clean) # tfidf: Bigrams

In [23]:
y_bclass = df_raw['bclass']
y_mclass = df_raw['mclass']
df_unigrams.columns = df_unigrams.columns.astype(str)
df_bigrams.columns = df_bigrams.columns.astype(str)

In [41]:
X_train_unigrams, X_test_unigrams, y_train_bclass, y_test_bclass = train_test_split(df_unigrams, y_bclass, stratify=y_bclass, test_size=0.3, random_state=197)

### Logistic Principle Components Regression: Unigrams
logreg_pca_clf = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA()),
                           ('logreg', LogisticRegression(solver='saga',
                                                         penalty='l1',
                                                         max_iter=5000,
                                                         random_state=197))])

logreg_grid_cv = GridSearchCV(logreg_pca_clf, 
                              {'pca__n_components': [50, 100, 200, 300, 500],
                               'logreg__C': [0.01, 0.1, 1, 10, 100]}, 
                              cv=5, 
                              scoring='f1_weighted',
                              n_jobs=-1)

logreg_grid_cv.fit(X_train_unigrams, y_train_bclass)

# get the log-odds of 174 features after pca
logreg_unigrams_best = logreg_grid_cv.best_estimator_
log_odds = logreg_unigrams_best.decision_function(df_unigrams)
df_log_odds = pd.DataFrame(log_odds, columns=['log_odds'])

# get pca of unigrams
pca_unigram = PCA(logreg_grid_cv.best_params_['pca__n_components'])
X_unigram_pca = pca_unigram.fit_transform(df_unigrams)
df_pca_unigrams = pd.DataFrame(X_unigram_pca, columns=[f'pca_unigram_{i}' for i in range(X_unigram_pca.shape[1])])



In [26]:
# df_pca_unigrams.to_csv('../data/X_pca_unigrams.csv', index=False)

In [42]:
X_train_bigrams, X_test_bigrams, y_train_bclass, y_test_bclass = train_test_split(df_bigrams, y_bclass, stratify=y_bclass, test_size=0.3, random_state=197)

### Logistic Principle Components Regression: Unigrams
logreg_pca_clf = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA()),
                           ('logreg', LogisticRegression(solver='saga',
                                                         penalty='l1',
                                                         max_iter=5000,
                                                         random_state=197))])

logreg_grid_cv = GridSearchCV(logreg_pca_clf, 
                              {'pca__n_components': [50, 100, 200, 300, 500],
                               'logreg__C': [0.01, 0.1, 1, 10, 100]}, 
                              cv=5, 
                              scoring='f1_weighted', 
                              n_jobs=-1)

logreg_grid_cv.fit(X_train_bigrams, y_train_bclass)

# get pca of bigrams
pca_bigrams = PCA(logreg_grid_cv.best_params_['pca__n_components'])
X_bigrams_pca = pca_bigrams.fit_transform(df_bigrams)
df_pca_bigrams = pd.DataFrame(X_bigrams_pca, columns=[f'pca_bigrams_{i}' for i in range(X_bigrams_pca.shape[1])])



In [43]:
df_POS = df_clean['text_clean'].apply(extract_features)

### Concatenate all features
df_features = pd.concat([df_POS, df_pca_unigrams, df_pca_bigrams, df_log_odds], axis=1)
df_features = df_features.drop(columns=['text_clean'])

X_train, X_test, y_train_bclass, y_test_bclass = train_test_split(df_features, y_bclass, stratify=y_bclass, test_size=0.3, random_state=197)

### Logistic Principle Components Regression
logreg_pca_clf = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA()),
                           ('logreg', LogisticRegression(solver='saga',
                                                         penalty='l2',
                                                         max_iter=5000,
                                                         random_state=197))])

logreg_grid_cv = GridSearchCV(logreg_pca_clf, 
                              {'pca__n_components': [50, 100, 150, 200, 250, 300, 400, 450, 500],
                               'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100]}, 
                              cv=5, 
                              scoring='f1_weighted',
                              n_jobs=-1)

In [44]:
X_train.head()

Unnamed: 0,pca_unigram_0,pca_unigram_1,pca_unigram_2,pca_unigram_3,pca_unigram_4,pca_unigram_5,pca_unigram_6,pca_unigram_7,pca_unigram_8,pca_unigram_9,pca_unigram_10,pca_unigram_11,pca_unigram_12,pca_unigram_13,pca_unigram_14,pca_unigram_15,pca_unigram_16,pca_unigram_17,pca_unigram_18,pca_unigram_19,pca_unigram_20,pca_unigram_21,pca_unigram_22,pca_unigram_23,pca_unigram_24,pca_unigram_25,pca_unigram_26,pca_unigram_27,pca_unigram_28,pca_unigram_29,pca_unigram_30,pca_unigram_31,pca_unigram_32,pca_unigram_33,pca_unigram_34,pca_unigram_35,pca_unigram_36,pca_unigram_37,pca_unigram_38,pca_unigram_39,...,pca_bigrams_461,pca_bigrams_462,pca_bigrams_463,pca_bigrams_464,pca_bigrams_465,pca_bigrams_466,pca_bigrams_467,pca_bigrams_468,pca_bigrams_469,pca_bigrams_470,pca_bigrams_471,pca_bigrams_472,pca_bigrams_473,pca_bigrams_474,pca_bigrams_475,pca_bigrams_476,pca_bigrams_477,pca_bigrams_478,pca_bigrams_479,pca_bigrams_480,pca_bigrams_481,pca_bigrams_482,pca_bigrams_483,pca_bigrams_484,pca_bigrams_485,pca_bigrams_486,pca_bigrams_487,pca_bigrams_488,pca_bigrams_489,pca_bigrams_490,pca_bigrams_491,pca_bigrams_492,pca_bigrams_493,pca_bigrams_494,pca_bigrams_495,pca_bigrams_496,pca_bigrams_497,pca_bigrams_498,pca_bigrams_499,log_odds
1399,0.14377,0.021761,0.334757,-0.052748,-0.359876,-0.059881,-0.08674,-0.037649,-0.076151,-0.20816,-0.031777,-0.068252,-0.101144,-0.042978,-0.039656,0.058458,0.085597,-0.002647,0.032444,-0.01403,0.002535,0.183291,-0.228761,-0.048227,0.061764,-0.098722,-0.035638,0.043198,-0.00975,0.003044,-0.064662,0.022681,0.028559,0.020491,-0.016353,-0.09898,-0.001438,-0.025997,0.027246,-0.008381,...,-0.048137,0.02226,0.013549,-0.021695,-0.008624,-0.010209,0.009333,0.056848,0.04836,0.037704,0.00779,-0.018789,-0.056626,-0.00734,0.029107,-0.030569,0.012756,0.041291,0.019403,-0.046639,0.04323,-0.00755,0.006101,0.004048,-0.02157,-0.053426,-0.045199,0.003957,0.048429,-0.004697,-0.031505,0.079777,-0.048608,-0.024121,0.004775,-0.00454,-0.055017,0.068907,-0.038325,1.12289
810,1.014193,0.650175,-0.810739,-0.432737,0.510579,0.257711,0.41972,-0.704564,-0.283096,-0.424603,-0.423608,0.299818,-0.250524,-0.084016,-0.092922,-0.06617,-0.029683,-0.003192,0.130163,-0.009336,0.034981,0.074429,-0.023362,-0.056013,-0.033992,-0.030065,0.058296,-0.103729,0.097854,-0.16888,0.052644,-0.037727,0.07584,-0.02523,0.132827,0.000537,0.00524,-0.042019,-0.08612,-0.044886,...,-0.005956,0.002102,0.002938,-0.001294,0.008062,0.001423,-0.001874,-0.008534,0.011205,-0.012575,-0.000836,-0.000679,0.004524,-0.007291,0.006446,-0.016154,-0.008952,0.001529,0.001268,0.004541,-0.012551,0.004804,-0.00496,0.001227,0.010224,-0.007528,0.002595,0.008054,-0.008248,0.004127,0.008297,-0.00901,-0.007588,-0.00512,-0.009848,-0.003437,0.008591,0.003189,0.005574,0.8751
380,0.438736,-0.36498,-1.450496,-0.742925,-0.985758,-0.2192,0.407037,-0.080868,-0.135363,0.862196,0.368198,0.304471,-0.029362,-0.222132,-0.025991,0.009432,0.010211,0.002344,0.079738,-0.03512,0.002059,0.135361,-0.022616,-0.026983,-0.013801,0.064022,0.066022,-0.040885,-0.036912,-0.041513,-0.029415,-0.003515,-0.029061,0.008069,-0.03261,-0.016834,0.030944,0.045358,-0.005487,-0.04348,...,-0.004809,0.005374,-0.000275,0.004142,0.003745,-0.003281,-8.4e-05,-0.004203,0.006696,-0.004691,0.003353,0.001479,0.001305,0.004289,0.002082,0.005142,-0.001223,0.005228,0.002353,0.006159,0.001222,0.000597,-0.000635,-0.001974,-0.001172,0.004867,0.003061,-0.001298,0.002687,0.001828,0.00148,0.004393,-0.002702,0.003731,0.001171,-0.005278,-0.001968,-0.00231,0.001164,1.368829
1857,0.158307,-0.109818,-0.580149,-0.119075,0.03625,0.067032,-0.032986,-0.157882,-0.014882,-0.339794,0.121,-0.086459,0.217297,-0.069518,0.096395,0.198042,0.142816,0.000709,-0.004595,0.186183,-0.009914,0.025301,0.124835,-0.051517,-0.016324,0.046386,-0.121323,-0.102268,-0.045364,0.095417,-0.123275,-0.122605,-0.142969,0.054802,0.01028,0.134133,0.051421,-0.025244,-0.062142,0.037074,...,-0.016848,0.009103,0.061097,0.021255,-0.000286,-0.011258,0.035603,0.03674,-0.022521,-0.002727,-0.005435,-0.012802,-0.017915,0.036843,0.03495,-0.040837,-0.000242,-0.028165,0.028161,-0.006323,-0.006903,0.00242,-0.00837,-0.025598,-0.028656,0.003266,0.043498,0.010977,0.039331,-0.062793,0.010537,-0.054107,0.008672,-0.032118,0.021021,0.022251,-0.03171,0.011282,0.018712,-3.571587
2030,-0.718041,-0.536753,0.145944,-0.419451,0.139133,-0.012717,-0.102847,0.039802,0.011742,-0.038327,-0.061968,0.078876,-0.001293,0.011729,-0.014526,-0.004415,0.109976,0.005098,0.070303,-0.077762,-0.017733,0.046149,0.035013,0.051368,-0.078824,-0.070865,-0.048537,0.051728,-0.07174,-0.163874,0.034691,-0.118822,-0.039557,0.00359,-0.000787,0.012097,0.055169,-0.066366,0.038315,-0.017981,...,-0.02463,0.01393,-0.020626,-0.010655,0.015173,0.03363,-0.026449,-0.069083,0.077801,-0.023208,0.028772,-0.006143,-0.052434,0.001471,0.001354,0.000578,0.007455,-0.000798,-0.006194,0.032569,0.015205,-0.005064,0.038318,0.031505,0.00515,-0.00624,-0.043739,0.028796,-0.022969,-0.014047,0.019583,-0.011141,0.042284,0.008368,-0.004024,0.005147,-0.031691,-0.001058,-0.012301,2.244645


In [45]:
logreg_grid_cv.fit(X_train, y_train_bclass)

In [52]:
best_logreg_bclass = logreg_grid_cv.best_estimator_
print(f"Weighted Training F1 Score is {logreg_grid_cv.best_score_}")

Weighted Training F1 Score is 0.8068913450949079


In [None]:

y_bclass_pred = best_logreg_bclass.predict(X_test)

In [54]:
print(f"Weighted Testing F1 Score is {f1_score(y_test_bclass, y_bclass_pred, average='weighted')}")

Weighted Testing F1 Score is 0.8074038144744611
