In [34]:
import pandas as pd
import numpy as np
import spacy
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

In [35]:
# df_clean = pd.read_csv('../data/claims_clean.csv')
# df_raw = pd.read_csv('../data/claims_raw.csv')

In [36]:
# df_raw.isnull().sum()

In [37]:
# df_clean.isnull().sum()

# HTML Texts Extraction

In [38]:
# df_raw['soup'] = df_raw['text_tmp'].apply(lambda x: BeautifulSoup(x, 'html.parser'))

In [39]:
# def extract_headers(soup):
#   headers = []
#   for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
#     for header in soup.find_all(header_tag):
#       headers.append(header.get_text(strip=True))
#   headers = '. '.join(headers)
#   return headers

# def extract_paragraphs(soup):
#   contents = []
#   for paragraph in soup.find_all('p'):
#     contents.append(paragraph.get_text(strip=True))
#   contents = '. '.join(contents)
#   return contents
  

In [40]:
# df_raw['headers'] = df_raw['soup'].apply(extract_headers)
# df_raw['paragraphs'] = df_raw['soup'].apply(extract_paragraphs)

In [41]:
# df_temp = pd.concat((df_clean['text_clean'], df_raw[['headers', 'paragraphs']]), axis=1) # concat h, p, and clean texts
# df_temp = df_temp.apply(lambda row: '\n'.join(row.values.astype(str)), axis=1) # join them to a single cell
# df_model = pd.concat((df_temp, df_raw[['mclass', 'bclass']]), axis = 1) # add y to the df

In [42]:
# df_model.rename(columns={0: 'contents'}, inplace=True)
# df_model.head(5)

# Cleaning the Texts Columns

In [43]:
df_model = pd.read_csv('../data/data_clean.csv')

In [44]:
nlp = spacy.load('en_core_web_lg')

In [45]:
def clean_text(text):
  text = text.lower() # lower all the texts

  # text = re.sub(r'http\S+|www.\S+', '', text) # remove all the links

  doc = nlp(text)
  cleaned_tokens = []

  for token in doc:
    if not token.is_punct and not token.is_stop: # remove punctuation and stop words
      cleaned_tokens.append(token.lemma_) # lemmatization

  return ' '.join(cleaned_tokens)

### Takes long time

In [46]:
df_model['cleaned_text'] = df_model['contents'].apply(clean_text)

KeyboardInterrupt: 

In [None]:
df_model.head(5)

Unnamed: 0.1,Unnamed: 0,contents,mclass,bclass
0,0,national obituary search click on the item you...,Possible Fatality,Relevant claim content
1,1,the following official arrest record for jonat...,Potentially unlawful activity,Relevant claim content
2,2,did someone you know get arrested in miami dad...,N/A: No relevant content.,N/A: No relevant content.
3,3,the information on this website is taken from ...,Potentially unlawful activity,Relevant claim content
4,4,name clayton thomas location memphis tennessee...,Potentially unlawful activity,Relevant claim content


# Vectorize the tokens

In [None]:
def get_doc_vector(text):
  doc = nlp(text)
  return doc.vector

### Take long time

In [None]:
df_model['doc_vector'] = df_model['cleaned_text'].apply(get_doc_vector)

KeyError: 'cleaned_text'

# TF-IDF

In [None]:
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df_model['cleaned_text'])
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(),
                        columns=tfidf.get_feature_names_out())

KeyError: 'cleaned_text'

In [None]:
df_doc_vectors = pd.DataFrame(df_model['doc_vector'].tolist())

df_tfidf.reset_index(drop=True, inplace=True)
df_doc_vectors.reset_index(drop=True, inplace=True)

df_combined_vectors = pd.concat([df_tfidf, df_doc_vectors], axis=1)

# Extract Linguistic Features

In [None]:
def extract_features(text):
  doc = nlp(text)

  num_tokens = len(doc)
  num_nouns = sum(1 for token in doc if token.pos_ == 'NOUN')
  num_verbs = sum(1 for token in doc if token.pos_ == 'VERB')
  num_adjs = sum(1 for token in doc if token.pos_ == 'ADJ')
  num_entities = len(doc.ents)
  num_person = sum(1 for ent in doc.ents if ent.label_ == 'PERSON')
  num_org = sum(1 for ent in doc.ents if ent.label_ == 'ORG')
  num_gpe = sum(1 for ent in doc.ents if ent.label_ == 'GPE')
  num_sentences = len(list(doc.sents))
  avg_sentence_length = np.mean([len(sent) for sent in doc.sents]) if num_sentences > 0 else 0
  num_urls = len(re.findall(r'http\S+|www\S+', text))

  return {
        "num_tokens": num_tokens,
        "num_nouns": num_nouns,
        "num_verbs": num_verbs,
        "num_adjs": num_adjs,
        "num_entities": num_entities,
        "num_person": num_person,
        "num_org": num_org,
        "num_gpe": num_gpe,
        "num_sentences": num_sentences,
        "avg_sentence_length": avg_sentence_length,
        "num_urls": num_urls
    }



In [None]:
df_linguistic_features = pd.DataFrame(df_model['cleaned_text'].apply(extract_features).tolist())
df_linguistic_features.head(5)

NameError: name 'df_model' is not defined