In [None]:
import os
# Run the notebook from the project root so the relative "resources/..." paths
# used by later cells resolve. The location can be overridden per machine via the
# REDDIT_TITLES_ROOT environment variable; when it is unset, the original
# hard-coded path is used, so existing behaviour is unchanged.
PROJECT_ROOT = os.environ.get("REDDIT_TITLES_ROOT", r"D:\PythonApps\exercise_reddit_titles")
os.chdir(PROJECT_ROOT)
os.getcwd()

# Imports

In [None]:
import pandas as pd
import json
import pprint
import numpy as np
from IPython.core.display import HTML
from IPython.display import display
from tqdm import tqdm

import spacy
from transformers import pipeline

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# DataFrame

In [None]:
# Load the raw dataset; the first CSV column is used as the row index.
df = pd.read_csv("resources/raw_dataset.csv", index_col=0)
# Preview the first rows.
df.head()

In [None]:
# Concise summary of the raw dataset (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics of the raw dataset (pandas defaults to the numeric columns).
df.describe()

## Functions to extract features

In [None]:
def syntactical_extraction(text: str, spacy_model):
    """Count the linguistic annotations that `spacy_model` produces for `text`.

    Args:
        text: The string to analyse.
        spacy_model: A callable that takes the text and returns a document
            exposing `.ents` (items with `.label_`) and iterating over tokens
            that carry `.pos_`, `.tag_` and `.lemma_`.

    Returns:
        A dict mapping prefixed feature names to occurrence counts:
        `ner_<label>` for entities, then `pos_<pos>`, `tag_<tag>` and
        `lemma_<lemma>` for tokens. Keys are inserted group by group in that
        order, each group in order of first appearance.
    """
    doc = spacy_model(text)
    counts = {}

    def tally(prefix, values):
        # Increment the counter of every "<prefix>_<value>" feature encountered.
        for value in values:
            feature = f"{prefix}_{value}"
            counts[feature] = counts.get(feature, 0) + 1

    # Each group is tallied in full before the next one starts, so the key
    # order of the returned dict stays ner -> pos -> tag -> lemma.
    tally("ner", (entity.label_ for entity in doc.ents))
    tally("pos", (tok.pos_ for tok in doc))
    tally("tag", (tok.tag_ for tok in doc))
    tally("lemma", (tok.lemma_ for tok in doc))

    return counts

In [None]:
def sentiment_classification(text: str, model):
    """Return the label of the first prediction `model` yields for `text`.

    Args:
        text: The string to classify.
        model: A callable that takes the text and returns an indexable
            sequence of mappings, each carrying a "label" entry.

    Returns:
        The "label" value of the first returned prediction.
    """
    predictions = model(text)
    first_prediction = predictions[0]
    return first_prediction["label"]

## Run functions and save records

In [None]:
# Model identifiers are kept in one place so they are easy to spot and swap.
SPACY_MODEL_NAME = "en_core_web_sm"
SENTIMENT_MODEL_NAME = "tabularisai/multilingual-sentiment-analysis"

# Load both models once; they are reused for every title processed later on.
spacy_model = spacy.load(SPACY_MODEL_NAME)
sentiment_model = pipeline(
    "text-classification",
    model=SENTIMENT_MODEL_NAME,
)

In [None]:
# 0 - Raw records: one dict per title holding the text, its score, the
# syntactic feature counts and the predicted sentiment label.
records = []
n = len(df)
# Walk the two columns directly instead of materialising a full row Series
# with df.iloc[i] twice per title.
for text, score in tqdm(zip(df["title"], df["score"]), total=n, desc="Raw records"):
    record = {
        "text": text,
        "score": int(score),
        **syntactical_extraction(text, spacy_model),
    }
    record["sentiment"] = sentiment_classification(text, sentiment_model)
    records.append(record)


# 1 - Unify keys: collect every key seen in any record exactly once.
# dict.fromkeys de-duplicates while keeping first-seen order, so the next step
# iterates over the distinct keys only rather than over every key of every
# record (which made the unification pass quadratic).
all_keys = list(dict.fromkeys(
    key
    for record in tqdm(records, desc="Collecting keys")
    for key in record
))


# 2 - Add null keys: a feature that a title never produced gets a count of 0;
# values already present are left untouched.
for record in tqdm(records, desc="Keys unification"):
    for key in all_keys:
        record.setdefault(key, 0)


# Use a context manager so the file is flushed and closed even if
# serialisation fails part-way through.
with open("resources/records.json", "w", encoding="utf-8") as records_file:
    json.dump(records, records_file, indent=4)

In [None]:
# Build the feature frame from the unified records. Note that this rebinds
# `df`, which until now held the raw dataset.
df = pd.DataFrame(records)

# Group the feature columns by name prefix; each group keeps the order in
# which its columns appear in the frame.
base_cols = ["text", "score", "sentiment"]
cols_by_prefix = {
    prefix: [name for name in df.columns if name.startswith(prefix)]
    for prefix in ("ner", "pos", "tag", "lemma")
}
ner_cols = cols_by_prefix["ner"]
pos_cols = cols_by_prefix["pos"]
tag_cols = cols_by_prefix["tag"]
lemma_cols = cols_by_prefix["lemma"]

# Reorder to base -> ner -> pos -> tag -> lemma, then persist the result.
ordered_cols = base_cols + ner_cols + pos_cols + tag_cols + lemma_cols
df = df[ordered_cols]
df.to_parquet("resources/dataframe_features.pq")

In [None]:
# Spot-check five randomly drawn rows of the feature frame (unseeded, so the rows differ per run).
df.sample(5)

In [None]:
# Concise summary of the feature frame (columns, dtypes, non-null counts).
df.info()