In [2]:
import pandas as pd
import numpy as np
import re 
import scipy
from scipy import sparse

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import rankdata

from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict

import time
import scipy.optimize as optimize

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize

import tensorflow as tf
import keras.backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Concatenate
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.merge import concatenate

from sklearn.model_selection import train_test_split

In [18]:
# Ridge Regression
def ridge_regression(vec, X, y, X_test, folds, stratified, val_pairs=None):
    """Train one Ridge model per stratified fold and average their test predictions.

    Each fold's model is fitted on that fold's training split of ``X``/``y``.
    The held-out split is not scored; instead every model is evaluated on an
    external set of (less_toxic, more_toxic) text pairs, where a pair counts as
    correct when the prediction for the ``less_toxic`` text is strictly lower
    than the prediction for the ``more_toxic`` text.

    Parameters
    ----------
    vec : object exposing ``transform``
        Used to turn the pair texts into feature matrices that the fitted
        models can score. Assumed to be the same (already fitted) vectorizer
        that produced ``X`` and ``X_test`` -- TODO confirm at call sites.
    X : indexable feature matrix
        Training features; rows are selected with the fold index arrays.
    y : indexable array
        Regression targets aligned with the rows of ``X``.
    X_test : feature matrix
        Features to predict with every fold model.
    folds : int
        Number of splits passed to ``StratifiedKFold``.
    stratified : array-like
        Labels handed to ``StratifiedKFold.split`` to stratify the folds.
    val_pairs : mapping/DataFrame with 'less_toxic' and 'more_toxic', optional
        Pairwise validation texts. When omitted, the notebook-level ``val_df``
        is used, which keeps existing calls working unchanged.

    Returns
    -------
    tuple
        ``(mean_val_acc, preds)``: the mean pairwise accuracy over all folds
        and the element-wise mean of the per-fold predictions on ``X_test``.
    """
    if val_pairs is None:
        # Backward-compatible fallback to the global frame the notebook defines.
        val_pairs = val_df

    # Use the caller-supplied fold count rather than a notebook global.
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1)

    # Neither `vec` nor the pair texts change inside the loop, so the pair
    # features are computed once instead of once per fold.
    X_less_toxic = vec.transform(val_pairs['less_toxic'])
    X_more_toxic = vec.transform(val_pairs['more_toxic'])

    val_scores = []
    preds = []
    for fold, (train_index, _) in enumerate(skf.split(X, stratified)):
        model = Ridge()
        model.fit(X[train_index], y[train_index])

        # Pairwise validation accuracy for this fold's model.
        pred_less_toxic = model.predict(X_less_toxic)
        pred_more_toxic = model.predict(X_more_toxic)
        val_acc = (pred_less_toxic < pred_more_toxic).mean()
        val_scores.append(val_acc)

        preds.append(model.predict(X_test))

        print(f"FOLD:{fold}, val_acc:{val_acc:.5f}")

    mean_val_acc = np.mean(val_scores)
    # Stack to (n_folds, n_test_rows) and average across the fold axis.
    preds = np.mean(np.vstack(preds), axis=0)

    return mean_val_acc, preds

In [7]:
# Data processing
# Load the pairwise validation data, the comments to score, and the Ruddit data.
val_df = pd.read_csv("/content/jigsaw-toxic-severity-rating/validation_data.csv")
test_df = pd.read_csv("/content/jigsaw-toxic-severity-rating/comments_to_score.csv")
ruddit_df = pd.read_csv("/content/ruddit.csv")

# Row counts first, then a preview of each frame (one item per line).
print(len(val_df), len(test_df), len(ruddit_df), sep="\n")
print(val_df.head(), test_df.head(), ruddit_df.head(), sep="\n")


# Target column: offensiveness scores that are <= 0 are replaced by 0.0,
# every other value is kept as is.
ruddit_df['y'] = ruddit_df['offensiveness_score'].mask(
    ruddit_df['offensiveness_score'] <= 0, 0.0
)
print(ruddit_df.head())

30108
7537
5838
   worker  ...                                         more_toxic
0     313  ...  WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1     188  ...   Daphne Guinness \n\nTop of the mornin' my fav...
2      82  ...  "Atom you don't believe actual photos of mastu...
3     347  ...  You seem to have sand in your vagina.\n\nMight...
4     539  ...           hey \n\nway to support nazis, you racist

[5 rows x 3 columns]
   comment_id                                               text
0      114890  "\n \n\nGjalexei, you asked about whether ther...
1      732895  Looks like be have an abuser , can you please ...
2     1139051  I confess to having complete (and apparently b...
3     1434512  "\n\nFreud's ideas are certainly much discusse...
4     2084821  It is not just you. This is a laundry list of ...
  post_id  ... offensiveness_score
0  42g75o  ...              -0.083
1  42g75o  ...              -0.022
2  42g75o  ...               0.167
3  42g75o  ...              -0.146
4

In [20]:
FOLDS = 5

# Character n-gram (within word boundaries) TF-IDF features, fitted on the
# Ruddit texts and reused to transform the comments that have to be scored.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(4, 6),
    min_df=3,
    max_df=0.5,
)
X = tfidf_vectorizer.fit_transform(ruddit_df['txt'])
X_test = tfidf_vectorizer.transform(test_df['text'])

# Stratification labels: target rounded to one decimal, scaled by 10 and
# truncated to an integer bucket.
stratified = (ruddit_df["y"].values.round(1) * 10).astype(int)

mean_val_acc, ruddit_predictions = ridge_regression(
    vec=tfidf_vectorizer,
    X=X,
    y=ruddit_df["y"].values,
    X_test=X_test,
    folds=FOLDS,
    stratified=stratified,
)
print("Mean accuracy on validaton data", mean_val_acc)
print("Predictions on ruddit", ruddit_predictions)



FOLD:0, val_acc:0.64687
FOLD:1, val_acc:0.63973
FOLD:2, val_acc:0.64671
FOLD:3, val_acc:0.65019
FOLD:4, val_acc:0.64405
Mean accuracy on validaton data 0.6455094991364421
Predictions on ruddit [0.12439541 0.12452538 0.08968182 ... 0.18093773 0.48498902 0.1053199 ]


In [23]:
# Store the averaged fold predictions as the score of each comment, then
# write only the id and score columns to the submission file.
test_df['score'] = ruddit_predictions
test_df.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)