In [7]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import regex
from preprocessing import parse_data, extract_paragraphs, extract_headers
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Load the large English spaCy pipeline once; clean_text, get_doc_vector and
# extract_features all call this module-level `nlp` object.
nlp = spacy.load('en_core_web_lg')
# Raise the maximum text length accepted by nlp().
# NOTE(review): presumably some documents exceed spaCy's default limit — confirm.
nlp.max_length = 1500000



In [4]:
def extract_headers_paragraphs(df):
    """Prepend header and paragraph text parsed from raw HTML to ``text_clean``.

    Each ``text_tmp`` value is parsed with BeautifulSoup's ``html.parser``.
    ``extract_headers`` and ``extract_paragraphs`` are applied to the parsed
    document, each result is joined without a separator, and the two strings
    are placed in front of the row's existing ``text_clean`` value (the three
    pieces are separated by space-newline-space).

    The caller's frame is not modified: the work is done on a copy and the
    parsed documents are kept in local lists rather than in helper columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``text_tmp`` and ``text_clean``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the augmented ``text_clean`` column and without
        the columns ``headers``, ``paragraphs``, ``soup`` and ``Unnamed: 0``
        (each is dropped only if present).
    """
    out = df.copy()

    # Parse every document exactly once and reuse it for both extractors.
    soups = [BeautifulSoup(html, "html.parser") for html in out["text_tmp"]]
    headers = [extract_headers(soup) for soup in soups]
    paragraphs = [extract_paragraphs(soup) for soup in soups]

    out["text_clean"] = [
        f"{''.join(h)} \n {''.join(p)} \n {body}"
        for h, p, body in zip(headers, paragraphs, out["text_clean"])
    ]

    # errors="ignore": 'Unnamed: 0' only exists when the CSV was written with
    # its index, so its absence should not abort preprocessing.
    return out.drop(columns=['headers', 'paragraphs', 'soup', 'Unnamed: 0'],
                    errors="ignore")

def clean_text(text):
  """Normalise ``text`` to a space-joined string of lemmas.

  The text is lower-cased and run through the module-level spaCy pipeline;
  tokens flagged as punctuation or as stop words are discarded and the
  lemma of every remaining token is kept, in document order.
  """
  # URL stripping is currently switched off:
  # text = re.sub(r'http\S+|www.\S+', '', text)
  parsed = nlp(text.lower())

  lemmas = [tok.lemma_ for tok in parsed if not (tok.is_punct or tok.is_stop)]
  return ' '.join(lemmas)

def get_doc_vector(text):
  """Run ``text`` through the spaCy pipeline and return the resulting ``Doc.vector``."""
  return nlp(text).vector

def tfidf_unigram_vectorization(df):
  """Build unigram TF-IDF features and append spaCy document vectors.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``text_clean`` column of strings.

  Returns
  -------
  pandas.DataFrame
      One row per input row, on a fresh 0..n-1 index. The first columns are
      the TF-IDF weights (named after the vocabulary terms, at most 10000 of
      them); the remaining columns are the document-vector dimensions, named
      with integers starting at 0.

  Notes
  -----
  The input frame is not modified; the document vectors are kept in a local
  frame instead of being written back as a ``doc_vector`` column.
  """
  # One spaCy vector per document, expanded into one column per dimension.
  df_doc_vectors = pd.DataFrame([get_doc_vector(text) for text in df['text_clean']])

  tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
  tfidf_matrix = tfidf.fit_transform(df['text_clean'])
  df_tfidf = pd.DataFrame(tfidf_matrix.toarray(),
                          columns=tfidf.get_feature_names_out())

  # Both frames carry a default RangeIndex, so the concat aligns rows by position.
  # NOTE(review): column labels mix strings (terms) and integers (vector
  # dimensions); if labels are later cast to str, a numeric term such as "10"
  # could collide with vector dimension 10 — confirm this cannot happen here.
  return pd.concat([df_tfidf, df_doc_vectors], axis=1)

def tfidf_bigrams_vectorization(df):
  """Return dense bigram TF-IDF features for ``df['text_clean']``.

  A ``TfidfVectorizer`` restricted to bigrams (``ngram_range=(2, 2)``) and
  to at most 10000 features is fitted on the column; the result is a
  DataFrame whose columns are named after the bigram vocabulary.
  """
  vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(2, 2))
  weights = vectorizer.fit_transform(df['text_clean']).toarray()

  return pd.DataFrame(weights, columns=vectorizer.get_feature_names_out())

def extract_features(text):
  """Compute simple count-based linguistic features for one document.

  The text is processed with the module-level spaCy pipeline and the
  following values are returned in a dict:

  * ``num_tokens``: number of tokens in the document
  * ``num_nouns`` / ``num_verbs`` / ``num_adjs``: tokens whose coarse
    part-of-speech tag is ``NOUN`` / ``VERB`` / ``ADJ``
  * ``num_entities``: number of named entities
  * ``num_person`` / ``num_org`` / ``num_gpe``: entities labelled
    ``PERSON`` / ``ORG`` / ``GPE``
  * ``num_sentences``: number of sentences
  * ``avg_sentence_length``: mean sentence length in tokens (0.0 when the
    document has no sentences)
  * ``num_urls``: number of substrings starting with ``http`` or ``www``

  The mean is computed with plain Python arithmetic, so the function does
  not depend on numpy being imported by the notebook.
  """
  doc = nlp(text)

  num_tokens = len(doc)
  num_nouns = sum(1 for token in doc if token.pos_ == 'NOUN')
  num_verbs = sum(1 for token in doc if token.pos_ == 'VERB')
  num_adjs = sum(1 for token in doc if token.pos_ == 'ADJ')
  num_entities = len(doc.ents)
  num_person = sum(1 for ent in doc.ents if ent.label_ == 'PERSON')
  num_org = sum(1 for ent in doc.ents if ent.label_ == 'ORG')
  num_gpe = sum(1 for ent in doc.ents if ent.label_ == 'GPE')

  # Walk the sentence iterator once and reuse the lengths for both statistics.
  sentence_lengths = [len(sent) for sent in doc.sents]
  num_sentences = len(sentence_lengths)
  avg_sentence_length = sum(sentence_lengths) / num_sentences if num_sentences > 0 else 0.0

  num_urls = len(re.findall(r'http\S+|www\S+', text))

  return {
        "num_tokens": num_tokens,
        "num_nouns": num_nouns,
        "num_verbs": num_verbs,
        "num_adjs": num_adjs,
        "num_entities": num_entities,
        "num_person": num_person,
        "num_org": num_org,
        "num_gpe": num_gpe,
        "num_sentences": num_sentences,
        "avg_sentence_length": avg_sentence_length,
        "num_urls": num_urls
    }

In [17]:
### Preprocessing Script
# Load the claims table; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('../data/claims_clean.csv') # read data

# Disabled alternative that would extract the contents with parse_data instead:
# df_clean = parse_data(df_raw) # extract contents from html
# Merge the header and paragraph text parsed from each row's HTML into text_clean.
df_clean = extract_headers_paragraphs(df_raw) # extract headers and paragraphs


In [18]:
### Tokenization & Lemmatization
# Overwrites text_clean with the output of clean_text (lower-cased lemmas
# without stop words or punctuation). Because the column is replaced in place,
# re-running this cell cleans text that has already been cleaned.
df_clean['text_clean'] = df_clean['text_clean'].apply(clean_text)

In [9]:
### Vectorization
# Unigram TF-IDF weights combined with spaCy document vectors.
df_unigrams = tfidf_unigram_vectorization(df_clean) # tfidf: Unigrams
# Bigram TF-IDF weights; this must call the bigram vectorizer, otherwise
# df_bigrams is just a second copy of the unigram features.
df_bigrams = tfidf_bigrams_vectorization(df_clean) # tfidf: Bigrams

In [23]:
# Binary and multi-class targets are read straight from the raw frame.
# NOTE(review): assumes the feature frames keep df_raw's row order — confirm.
y_bclass, y_mclass = df_raw['bclass'], df_raw['mclass']

# Cast every column label to str so that each feature frame has labels of a
# single type before it is handed to scikit-learn.
for _feature_frame in (df_unigrams, df_bigrams):
    _feature_frame.columns = _feature_frame.columns.astype(str)

In [24]:
# Hold out 30% of the rows, stratified on the binary label.
X_train_unigrams, X_test_unigrams, y_train_bclass, y_test_bclass = train_test_split(df_unigrams, y_bclass, stratify=y_bclass, test_size=0.3, random_state=197)

### Logistic Principal Components Regression: Unigrams
# PCA is seeded like the split and the classifier: its randomized SVD solver
# is otherwise non-deterministic, which would make the search irreproducible.
logreg_pca_clf = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA(random_state=197)),
                           ('logreg', LogisticRegression(solver='saga',
                                                         penalty='l2',
                                                         max_iter=5000,
                                                         random_state=197))])

# 5-fold search over the number of components and the inverse regularisation
# strength, scored by weighted F1.
logreg_grid_cv = GridSearchCV(logreg_pca_clf,
                              {'pca__n_components': [50, 100, 200, 300, 500],
                               'logreg__C': [0.01, 0.1, 1, 10, 100]},
                              cv=5,
                              scoring='f1_weighted',
                              n_jobs=-1)

logreg_grid_cv.fit(X_train_unigrams, y_train_bclass)

# Log-odds (decision_function output) of the best pipeline for every row of
# df_unigrams, i.e. for training and held-out rows alike.
logreg_unigrams_best = logreg_grid_cv.best_estimator_
log_odds = logreg_unigrams_best.decision_function(df_unigrams)
df_log_odds = pd.DataFrame(log_odds, columns=['log_odds'])

# PCA scores of the unigram features with the selected number of components.
# NOTE(review): this PCA is refit on all rows of the unscaled df_unigrams,
# whereas the pipeline's PCA only saw standardized training rows — confirm
# that this difference is intended.
pca_unigram = PCA(n_components=logreg_grid_cv.best_params_['pca__n_components'],
                  random_state=197)
X_unigram_pca = pca_unigram.fit_transform(df_unigrams)
df_pca_unigrams = pd.DataFrame(X_unigram_pca, columns=[f'pca_unigram_{i}' for i in range(X_unigram_pca.shape[1])])



In [27]:
# Show the number of PCA components selected by the grid search currently
# bound to logreg_grid_cv.
# NOTE(review): logreg_grid_cv is reassigned by other cells, so the value
# displayed depends on which of them was executed last.
logreg_grid_cv.best_params_['pca__n_components']

50

In [26]:
# Persist the unigram PCA scores; index=False keeps the row index out of the file.
df_pca_unigrams.to_csv('../data/X_pca_unigrams.csv', index=False)

In [25]:
# Same 70/30 stratified split (same seed) as for the unigram features.
X_train_bigrams, X_test_bigrams, y_train_bclass, y_test_bclass = train_test_split(df_bigrams, y_bclass, stratify=y_bclass, test_size=0.3, random_state=197)

### Logistic Principal Components Regression: Bigrams
# PCA is seeded like the split and the classifier: its randomized SVD solver
# is otherwise non-deterministic, which would make the search irreproducible.
logreg_pca_clf = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA(random_state=197)),
                           ('logreg', LogisticRegression(solver='saga',
                                                         penalty='l2',
                                                         max_iter=5000,
                                                         random_state=197))])

# 5-fold search over the number of components and the inverse regularisation
# strength, scored by weighted F1.
logreg_grid_cv = GridSearchCV(logreg_pca_clf,
                              {'pca__n_components': [50, 100, 200, 300, 500],
                               'logreg__C': [0.01, 0.1, 1, 10, 100]},
                              cv=5,
                              scoring='f1_weighted',
                              n_jobs=-1)

logreg_grid_cv.fit(X_train_bigrams, y_train_bclass)

# PCA scores of the bigram features with the selected number of components.
# NOTE(review): this PCA is refit on all rows of the unscaled df_bigrams,
# whereas the pipeline's PCA only saw standardized training rows — confirm
# that this difference is intended.
pca_bigrams = PCA(n_components=logreg_grid_cv.best_params_['pca__n_components'],
                  random_state=197)
X_bigrams_pca = pca_bigrams.fit_transform(df_bigrams)
df_pca_bigrams = pd.DataFrame(X_bigrams_pca, columns=[f'pca_bigrams_{i}' for i in range(X_bigrams_pca.shape[1])])



In [None]:
# extract_features returns one dict per document. Expand the dicts into one
# numeric column per feature; a Series of dicts would otherwise be
# concatenated as a single object column that StandardScaler cannot process.
# Building the frame from a list also gives it the same 0..n-1 index as the
# PCA and log-odds frames.
# NOTE(review): the features are computed on text_clean after the cleaning
# cell has lower-cased it and removed punctuation and stop words, which
# likely distorts the POS, entity and sentence counts — consider using the
# uncleaned text.
df_POS = pd.DataFrame(df_clean['text_clean'].apply(extract_features).tolist())

### Concatenate all features
df_features = pd.concat([df_POS, df_pca_unigrams, df_pca_bigrams, df_log_odds], axis=1)

X_train, X_test, y_train_bclass, y_test_bclass = train_test_split(df_features, y_bclass, stratify=y_bclass, test_size=0.3, random_state=197)

### Logistic Principal Components Regression
# PCA is seeded like the split and the classifier so the search is reproducible.
logreg_pca_clf = Pipeline([('scalar', StandardScaler()),
                           ('pca', PCA(random_state=197)),
                           ('logreg', LogisticRegression(solver='saga',
                                                         penalty='l2',
                                                         max_iter=5000,
                                                         random_state=197))])

# NOTE(review): candidate n_components values larger than the number of
# columns in df_features cannot be fitted by PCA — confirm the grid matches
# the actual feature count.
logreg_grid_cv = GridSearchCV(logreg_pca_clf,
                              {'pca__n_components': [50, 100, 150, 200, 250, 300, 400, 450, 500],
                               'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100]},
                              cv=5,
                              scoring='f1_weighted',
                              n_jobs=-1)

logreg_grid_cv.fit(X_train, y_train_bclass)


In [None]:
# Evaluate the best pipeline from the grid search on the held-out rows.
best_logreg_bclass = logreg_grid_cv.best_estimator_
y_bclass_pred = best_logreg_bclass.predict(X_test)
# Predictions exist only for X_test, so they must be scored against
# y_test_bclass; the full y_bclass has a different length.
print(f"Weighted F1 Score is {f1_score(y_test_bclass, y_bclass_pred, average='weighted')}")
