In [1]:
import pandas as pd
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import datetime

# Raise pandas' display limits so frames with up to 500 columns / 500 rows
# are rendered in full rather than being truncated.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [2]:
def get_first_batch(path):
    """Load only the first record batch of a Parquet file as a DataFrame.

    The file is opened with ``pq.ParquetFile`` and iterated with
    ``iter_batches()`` using its default arguments; iteration stops after
    the first batch, so rows stored in later batches are not loaded.

    Args:
        path: Location of the Parquet file, passed straight to
            ``pq.ParquetFile``.

    Returns:
        The first batch converted with ``to_pandas()``, or ``None`` when the
        file yields no batches at all.
    """
    batches = pq.ParquetFile(path).iter_batches()
    first_batch = next(batches, None)
    if first_batch is None:
        return None
    return first_batch.to_pandas()

In [64]:
# Input files, given relative to the notebook's working directory.
# Article file (the name suggests enriched article metadata and text).
ARTICLE_DATA_PATH = "data/Drive Daten/students_articles_enriched_1.parquet.gzip"
# Pageview file; the file name carries the date 2021-01-02.
PAGEVIEW_DATA_PATH = "data/Drive Daten/students_pageviews_2021-01-02.parquet.gzip"

In [103]:
import glob
import os

In [None]:
# Load the first record batch of the article file.
article_data = get_first_batch(ARTICLE_DATA_PATH)

# Load the first record batch of the single pageview file at PAGEVIEW_DATA_PATH.
# An earlier statement here concatenated the first batch of every
# "students_pageviews*.gzip" file, but its result was overwritten by this
# assignment before anything read it, so that redundant (and expensive) read
# has been removed. The resulting `pageview_data` is unchanged.
# NOTE(review): if the multi-file load was the intended input, restore it and
# drop this single-file assignment instead.
pageview_data = get_first_batch(PAGEVIEW_DATA_PATH)

In [None]:
# Flatten each article's full text onto a single line (newline -> space)
# before the ARI score is computed from it.
article_data["article_full_text"] = article_data["article_full_text"].map(
    lambda full_text: full_text.replace("\n", " ")
)

In [None]:
# Calculate text complexity using rounded ARI
def ARI(text):
    """Return a rounded, non-negative readability score for ``text``.

    The score is ``4.71 * chars / words + 0.5 * words / sentences - 21.43``
    where ``chars`` is ``len(text)`` (every character, including spaces and
    punctuation), ``words`` is the number of whitespace-separated tokens and
    ``sentences`` is the number of pieces obtained by splitting on ``'.'``.
    NOTE(review): the textbook ARI counts only letters and digits as
    characters -- confirm that counting every character is intended.

    Args:
        text: The string to score.

    Returns:
        The score rounded to the nearest integer and clamped at 0. Text that
        is empty or contains only whitespace scores 0 (previously this
        returned ``None`` or raised ``ZeroDivisionError`` respectively).
    """
    words = text.split()
    if not words:
        # No words means nothing to score; also avoids dividing by zero below.
        return 0
    sentence_parts = text.split('.')
    score = round(4.71 * (len(text) / len(words)) + 0.5 * (len(words) / len(sentence_parts)) - 21.43)
    return score if score > 0 else 0
# Score every article's full text and store the result in a new "ARI" column.
article_data["ARI"] = article_data["article_full_text"].apply(ARI)

In [None]:
# List the names of all columns currently present in the article frame
article_data.columns

In [None]:
# Preview the first rows of the article frame, which now includes the ARI column
article_data.head()

In [None]:
# Remove all plain text columns
PLAIN_TEXT_COLUMNS = ["article_header", "article_teaser", "article_full_text", "article_preview_emotion"]
# Rebind the result instead of dropping in place, and tolerate columns that
# are already gone, so re-running this cell is a no-op rather than a KeyError.
article_data = article_data.drop(columns=PLAIN_TEXT_COLUMNS, errors="ignore")

In [None]:
# Preview the article frame after the plain-text columns were dropped
article_data.head()

In [None]:
# Fill missing data: categorical labels become the empty string ...
categorical_cols = ["topic", "locality", "newstype", "genre"]
article_data[categorical_cols] = article_data[categorical_cols].fillna("")

# ... and the PAD / emotion score columns (article and preview) become 0.
EMO_COLUMNS = [
    "pad_pleasure",
    "pad_arousal",
    "pad_dominance",
    "preview_pad_pleasure",
    "preview_pad_arousal",
    "preview_pad_dominance",
    "emo_aerger",
    "emo_erwarten",
    "emo_ekel",
    "emo_furcht",
    "emo_freude",
    "emo_traurigkeit",
    "emo_ueberraschung",
    "emo_vertrauen",
    "preview_emo_aerger",
    "preview_emo_erwarten",
    "preview_emo_ekel",
    "preview_emo_furcht",
    "preview_emo_freude",
    "preview_emo_traurigkeit",
    "preview_emo_ueberraschung",
    "preview_emo_vertrauen",
]

article_data[EMO_COLUMNS] = article_data[EMO_COLUMNS].fillna(0)

In [None]:
# Preview the article frame after missing values were filled
article_data.head()

In [None]:
# Preview the first rows of the raw pageview frame
pageview_data.head()

In [None]:
# Total engaged time per article: one row per article_drive_id with the sum of
# time_engaged_in_s over all of its pageview rows.
# The original passed a list positionally to sum(), where it landed in the
# boolean `numeric_only` parameter and only worked because a non-empty list is
# truthy; the flag is now spelled out explicitly.
pageview_data_grouped = (
    pageview_data[["article_drive_id", "time_engaged_in_s"]]
    .groupby(["article_drive_id"])
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
# Inner-join the article features with the per-article engagement totals;
# articles without a matching article_drive_id on the other side are dropped.
merged_data = article_data.merge(pageview_data_grouped, on="article_drive_id", how="inner")
merged_data.columns

In [None]:
# Display the merged frame (pandas truncates the output beyond the display.max_rows / display.max_columns limits set at the top)
merged_data

In [None]:
# merged_data["days_published_ago"] = merged_data.apply(lambda x: max(0, (datetime.datetime(2021, 1, 1) - x.published_at_local).days), axis=1)
# merged_data

In [None]:
# Columns excluded from the feature set (timestamps and the publisher id).
COLUMNS_TO_DROP = ["published_at_local", "modified_at_local", "publisher_id"]

# Treat a missing plus-article flag as False. The filled column is assigned
# back explicitly: calling fillna(..., inplace=True) on a column selected from
# the frame is chained assignment, which leaves the frame untouched under
# pandas Copy-on-Write.
merged_data["is_plus_article"] = merged_data["is_plus_article"].fillna(False)

# Attach the portal each pageview came from.
# NOTE(review): pageview_data is joined without de-duplication, so an article
# with several pageview rows yields one output row per pageview, each carrying
# the same summed time_engaged_in_s. Re-running this cell also merges portal_id
# a second time. Confirm whether one row per article (or per article/portal
# pair) was intended.
merged_data = pd.merge(merged_data, pageview_data[["article_drive_id", "portal_id"]], on="article_drive_id")

# One-hot encode the categorical columns after removing the excluded ones.
merged_data_cleaned = pd.get_dummies(
    merged_data.drop(columns=COLUMNS_TO_DROP),
    columns=["topic", "locality", "newstype", "genre", "portal_id"],
)
merged_data_cleaned

In [None]:
# Magnitude of the pairwise correlation between all encoded columns
merged_data_cleaned.corr().abs()

In [None]:
import seaborn as sns
# Use a large 20x20 figure so the heatmap of the full correlation matrix stays legible.
sns.set(rc={"figure.figsize": (20, 20)})
correlation_matrix = merged_data_cleaned.corr()
sns.heatmap(correlation_matrix)

In [None]:
# Rank every column by the magnitude of its correlation with the target
merged_data_cleaned.corr()["time_engaged_in_s"].abs().sort_values(ascending=False)

In [None]:
# Same cleaned frame as before, but without one-hot encoding applied.
merged_data_cleaned_no_dummies = merged_data.drop(columns=COLUMNS_TO_DROP)

# Every remaining column except the target and the article identifier is a feature.
features = [
    column
    for column in merged_data_cleaned_no_dummies.columns
    if column not in ["time_engaged_in_s", "article_drive_id"]
]
features

In [None]:
# Inspect the regression target column
merged_data_cleaned["time_engaged_in_s"]

In [None]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

# Feature matrix: one-hot encode the selected feature columns.
# Target: total engaged seconds, taken from the encoded frame (both frames are
# derived from merged_data without reordering, so rows line up positionally).
X, y = pd.get_dummies(merged_data_cleaned_no_dummies[features]), merged_data_cleaned["time_engaged_in_s"].array
# Fix the shuffle seed so the train/test split -- and every score computed from
# it -- is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)




In [None]:
# Linear support-vector regressor with default hyper-parameters; the seed is
# pinned so repeated fits on the same data give the same model.
svm = LinearSVR(random_state=42)

In [None]:
# Fit the regressor on the training split
svm.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split (scikit-learn regressors report R^2 from score())
svm.score(X_test, y_test)

In [None]:

# Display the raw predictions for the test split
svm.predict(X_test)

In [None]:
# Inspect the test-split feature matrix
X_test

In [None]:
# Inspect the test-split target values
y_test

In [None]:
# Keep the test-split predictions for the distribution plot of predicted values
y_pred = svm.predict(X_test)

In [None]:
# Distribution of the actual engaged time in the test split
sns.histplot(y_test)

In [None]:
# Distribution of the predicted engaged time, for comparison with the actual values
sns.histplot(y_pred)