In [48]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [49]:

# Fetch the Civil Comments dataset and materialise its training split as a
# pandas DataFrame for the rest of the notebook.
ds = load_dataset("google/civil_comments")
train_split = ds["train"]
df = train_split.to_pandas()

In [50]:
# Preview the loaded training frame; the rendered output below is a truncated view.
df

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
2,This is such an urgent design problem; kudos t...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
3,Is this something I'll be able to install on m...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.000000,0.0,0.872340,0.021277,0.0
...,...,...,...,...,...,...,...,...
1804869,"Maybe the tax on ""things"" would be collected w...",0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1804870,What do you call people who STILL think the di...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1804871,"thank you ,,,right or wrong,,, i am following ...",0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1804872,Anyone who is quoted as having the following e...,0.621212,0.030303,0.030303,0.0,0.621212,0.045455,0.0


In [51]:
# Target columns the regressor predicts; their order here fixes the order of
# the scores in every prediction row.
label = [
    "toxicity",
    "severe_toxicity",
    "obscene",
    "threat",
    "insult",
    "identity_attack",
    "sexual_explicit",
]
x = df["text"]  # raw comment text (model input)
y = df[label]   # one column per target score


In [52]:
# Hold out 33% of the comments for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [53]:
# Cap on the TF-IDF vocabulary size.
max_vocab = 5000

# Fit the vocabulary on the training comments only, then reuse the fitted
# vectorizer to encode the held-out comments.
vectorizer = TfidfVectorizer(max_features=max_vocab)
X_train_tfidf = vectorizer.fit_transform(x_train)
X_test_tfidf = vectorizer.transform(x_test)

In [54]:
# Linear regression from TF-IDF features to all target columns in y_train.
model = LinearRegression()
model.fit(X=X_train_tfidf, y=y_train)

In [55]:
# Score the held-out comments and summarise the fit quality.
y_pred = model.predict(X_test_tfidf)

# NOTE(review): y_test has several target columns, so each metric is a single
# number aggregated across targets by sklearn's default multioutput handling.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean squared error: {mse}")
print(f"R^2 score: {r2}")

Mean squared error: 0.006659904287740457
R^2 score: 0.3220669565599246


In [56]:
def get_comment_rating(comment):
    """Score a single comment with the notebook's fitted vectorizer and model.

    Args:
        comment: Raw comment text to score.

    Returns:
        The first (and only) row of ``model.predict`` for this comment.
    """
    # The vectorizer is given a one-element list so that exactly one row of
    # features (and therefore one prediction row) comes back.
    features = vectorizer.transform([comment])
    return model.predict(features)[0]

In [89]:
# Hand-written sample comments used to spot-check the trained model.
coms = [
    'Kids those days have problem with sexual partners',
    "This is a terrible comment!",
    "This is a very nice comment! Thank you!",
    "I want to harm you.",
    "this is sexual explicit",
]


In [91]:
# For each sample comment, report the highest predicted score and the label it
# belongs to.
for c in coms:
    res = get_comment_rating(c)
    # argmax returns the position of the (first) largest score in one pass,
    # replacing the max() + np.where() lookup of the same index.
    num = int(np.argmax(res))
    scale = res[num]
    print(c)
    print(f"This comment is {scale} {label[num]}")
    print('')

Kids those days have problem with sexual partners
This comment is 0.24910488173207618 sexual_explicit

This is a terrible comment!
This comment is 0.1720392468476445 toxicity

This is a very nice comment! Thank you!
This comment is 0.024084886617257828 toxicity

I want to harm you.
This comment is 0.07673964230047718 toxicity

this is sexual explicit
This comment is 0.5020699496541304 sexual_explicit

