In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD


from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from scipy.sparse import hstack, csr_matrix


In [2]:
# dt = pd.to_datetime(df["created_utc"], unit="s", utc=True, errors="coerce")
# df["hour_utc"] = dt.dt.hour
# df["dow_utc"] = dt.dt.dayofweek

# # df["over_18"] = df["over_18"].astype(int)
# # df["is_self"] = df["is_self"].astype(int)


# df["title"] = df["title"].fillna("").astype(str)
# df["subreddit"] = df["subreddit"].fillna("").astype(str)
def load_reddit_data(csv_path="/content/reddit_text_karma_dataset.csv"):
    """Load the Reddit posts CSV and split it into train / validation / test sets.

    Rows whose ``score`` cannot be parsed as a number are dropped. A combined
    ``text_all`` column (title, newline, selftext) is added to the frame.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file. It must contain at least the ``score``,
        ``title`` and ``selftext`` columns.

    Returns
    -------
    tuple
        ``(df_train, y_train), (df_val, y_val), (df_test, y_test)`` where each
        ``df_*`` is a DataFrame and each ``y_*`` the matching array of scores.
        The split is 70% / 15% / 15% with a fixed ``random_state=0``.
    """
    df = pd.read_csv(csv_path)

    # Unparseable scores become NaN; those rows have no usable target.
    y = pd.to_numeric(df["score"], errors="coerce").values
    ok = ~np.isnan(y)
    df = df.loc[ok].reset_index(drop=True)
    y = y[ok]

    # dt = pd.to_datetime(df["created_utc"], unit="s", utc=True, errors="coerce")
    # df["hour_utc"] = dt.dt.hour
    # df["day_utc"] = dt.dt.dayofweek

    # Both text columns can hold missing values (read_csv turns cells such as
    # "NA" or "null" into NaN). A NaN in either one would make `text_all` NaN,
    # which the text vectorizer cannot process, so normalise both to strings.
    df["title"] = df["title"].fillna("").astype(str)
    df["selftext"] = df["selftext"].fillna("").astype(str)

    df["text_all"] = (df["title"].str.strip() + "\n" + df["selftext"].str.strip()).str.strip()

    # First hold out 30%, then halve the hold-out into validation and test.
    df_train, df_tmp, y_train, y_tmp = train_test_split(
        df, y, test_size=0.30, random_state=0
    )
    df_val, df_test, y_val, y_test = train_test_split(
        df_tmp, y_tmp, test_size=0.50, random_state=0
    )

    return (df_train, y_train), (df_val, y_val), (df_test, y_test)

# Load the dataset from the default CSV path and unpack the (frame, target)
# pair of each split into notebook-level names.
(train_df, y_train), (val_df, y_val), (test_df, y_test) = load_reddit_data()

# Sanity check: show the size of each split and one example training row.
print("train:", train_df.shape, "val:", val_df.shape, "test:", test_df.shape)
print(train_df.head(1))


train: (1535, 16) val: (329, 16) test: (329, 16)
          id          subreddit  \
558  1ml7qir  explainlikeimfive   

                                             title  \
558  ELI5: Why can't we "ship of Theseus" the ISS?   

                                              selftext  score  upvote_ratio  \
558  Forgive me if this is a dumb question.\n\nMy u...   2400          0.93   

     ups_raw  downs_raw  ups_estimated  downs_estimated  num_comments  \
558     2400          0         2595.0            195.0           311   

      created_utc                                          permalink  over_18  \
558  1.754689e+09  https://www.reddit.com/r/explainlikeimfive/com...    False   

     is_self                                           text_all  
558     True  ELI5: Why can't we "ship of Theseus" the ISS?\...  


In [3]:

# num_cols = [
#     "hour_utc","dow_utc",
#     "over_18","is_self",
# ]
# Xtr_num = train_df[num_cols].fillna(0).astype(float).values
# Xva_num = val_df[num_cols].fillna(0).astype(float).values
# Xte_num = test_df[num_cols].fillna(0).astype(float).values

# Xtr_num = csr_matrix(Xtr_num)
# Xva_num = csr_matrix(Xva_num)
# Xte_num = csr_matrix(Xte_num)

def build_feature_matrices(train_df, val_df, test_df):
    """Turn the three splits into sparse feature matrices.

    Features are the TF-IDF representation of ``text_all`` (unigrams and
    bigrams, at most 20000 terms, each seen in at least 2 training documents)
    followed by a one-hot encoding of ``subreddit``. Both transformers are
    fitted on the training split only; subreddits unseen during fitting are
    ignored when encoding the other splits.

    Returns
    -------
    tuple
        ``(Xtr, Xva, Xte, tfidf, ohe)``: the CSR matrices for train,
        validation and test, plus the fitted vectorizer and encoder.
    """
    held_out = (val_df, test_df)

    # Text block: fit on train, then reuse the learned vocabulary elsewhere.
    tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)
    text_blocks = [tfidf.fit_transform(train_df["text_all"])]
    text_blocks += [tfidf.transform(frame["text_all"]) for frame in held_out]

    # Categorical block: same fit-on-train / transform-the-rest pattern.
    ohe = OneHotEncoder(handle_unknown="ignore")
    cat_blocks = [ohe.fit_transform(train_df[["subreddit"]])]
    cat_blocks += [ohe.transform(frame[["subreddit"]]) for frame in held_out]

    # Xtr_time = csr_matrix(train_df[["hour_utc", "day_utc"]])
    # Xva_time = csr_matrix(val_df[["hour_utc", "day_utc"]])
    # Xte_time = csr_matrix(test_df[["hour_utc", "day_utc"]])

    # Place text and category columns side by side for each split.
    Xtr, Xva, Xte = (
        hstack([text_part, cat_part]).tocsr()
        for text_part, cat_part in zip(text_blocks, cat_blocks)
    )

    return Xtr, Xva, Xte, tfidf, ohe

# Build the sparse design matrices and keep the fitted transformers around.
X_train, X_val, X_test, tfidf, ohe = build_feature_matrices(train_df, val_df, test_df)
# All three matrices should report the same number of columns.
print("X shapes:", X_train.shape, X_val.shape, X_test.shape)



X shapes: (1535, 20010) (329, 20010) (329, 20010)


In [4]:
# Naive reference model: predict the mean training score for every post.
baseline_pred_val, baseline_pred_test = (
    np.full_like(targets, y_train.mean(), dtype=float)
    for targets in (y_val, y_test)
)

# RMSE of that constant prediction on the validation and test splits.
baseline_rmse_val, baseline_rmse_test = (
    np.sqrt(mean_squared_error(targets, preds))
    for targets, preds in (
        (y_val, baseline_pred_val),
        (y_test, baseline_pred_test),
    )
)

print("Baseline: val " + str(baseline_rmse_val) + " test " + str(baseline_rmse_test))


Baseline: val 7008.487781391399 test 8204.000354596312


In [40]:

# ridge = Ridge()
# ridge.fit(X_train, y_train)

# rmse_test = np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))
# rmse_val  = np.sqrt(mean_squared_error(y_val,  ridge.predict(X_val)))
# print("Ridge: val " + str(rmse_val) + " test " + str(rmse_test))


# Compress sparse X to a manageable dense matrix for tree models.
# NOTE: the SVD variant below performed worse, so it is left commented out.
# svd = TruncatedSVD(n_components=300, random_state=0)

# X_train_svd = svd.fit_transform(X_train)
# X_val_svd   = svd.transform(X_val)
# X_test_svd  = svd.transform(X_test)

# print("SVD shapes:", X_train_svd.shape, X_val_svd.shape, X_test_svd.shape)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true`` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def metrics_row(name, yv, pv, yt, pt):
    """Build one results-table row for a model.

    Parameters
    ----------
    name : label stored under the ``"model"`` key.
    yv, pv : validation targets and the predictions for them.
    yt, pt : test targets and the predictions for them.

    Returns
    -------
    dict
        The model name plus RMSE, MAE and R^2 for the validation and test
        splits, every metric converted to a built-in float.
    """
    row = {"model": name}

    # Insertion order fixes the column order of the resulting DataFrame.
    row["val_rmse"] = rmse(yv, pv)
    row["test_rmse"] = rmse(yt, pt)
    row["val_mae"] = float(mean_absolute_error(yv, pv))
    row["test_mae"] = float(mean_absolute_error(yt, pt))
    row["val_r2"] = float(r2_score(yv, pv))
    row["test_r2"] = float(r2_score(yt, pt))

    return row



# One metrics row per model, starting with the constant-mean baseline.
rows = [
    metrics_row(
        "Baseline(mean)",
        y_val,  np.full_like(y_val,  y_train.mean(), dtype=float),
        y_test, np.full_like(y_test, y_train.mean(), dtype=float),
    )
]

# Estimators keep their notebook-level names so they can be inspected later.
ridge = Ridge(random_state=0, alpha= 1)
dt = DecisionTreeRegressor(random_state=0, max_depth=10, min_samples_leaf=5)
rf = RandomForestRegressor(random_state=0, max_depth=30, min_samples_leaf=5, n_estimators=100)
gbr = GradientBoostingRegressor(random_state=0, n_estimators=250, learning_rate=0.1, max_depth=5)

# Fit each model on the training split and score it on validation and test,
# in the same order as the table labels below.
for _label, _estimator in (
    ("Ridge", ridge),
    ("DecisionTree", dt),
    ("RandomForest", rf),
    ("GradientBoosting", gbr),
):
    _estimator.fit(X_train, y_train)
    rows.append(metrics_row(
        _label,
        y_val,  _estimator.predict(X_val),
        y_test, _estimator.predict(X_test),
    ))

# Best validation RMSE first.
results = (
    pd.DataFrame(rows)
    .sort_values("val_rmse")
    .reset_index(drop=True)
)
results



Unnamed: 0,model,val_rmse,test_rmse,val_mae,test_mae,val_r2,test_r2
0,DecisionTree,3194.267169,4885.143028,1610.23716,2381.708827,0.791538,0.644049
1,RandomForest,3238.699996,4673.101867,1564.738007,2270.614396,0.785698,0.674279
2,GradientBoosting,3882.997302,4767.329394,1822.042861,2335.328266,0.691952,0.661011
3,Ridge,3902.441863,4318.441219,1786.852926,2296.431738,0.688859,0.721843
4,Baseline(mean),7008.487781,8204.000355,4785.29055,5332.048749,-0.003535,-0.003892
