In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.decomposition import TruncatedSVD
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
from sklearn.model_selection import train_test_split
# Fetch every NLTK resource this notebook relies on, not only the `words` corpus:
# `stopwords` backs stopwords.words('english') and the Punkt models back
# word_tokenize. Without them a fresh environment fails with LookupError.
# NOTE(review): recent NLTK releases load 'punkt_tab' instead of 'punkt'; both are
# requested so either version finds its data - confirm against the installed NLTK.
for nltk_resource in ('words', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Load the validation pairs from the Jigsaw toxic-severity-rating data folder.
# The cells below read the `less_toxic` and `more_toxic` columns of this frame.
data = pd.read_csv("./jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Reshape the pairwise annotations into one labelled row per comment occurrence.
# Each row of `data` carries a `less_toxic` and a `more_toxic` comment; each side
# becomes its own frame with a shared `text` column and a binary `target`
# (0 = the "less toxic" side of a pair, 1 = the "more toxic" side).
less_toxic = (
    data.drop(columns='more_toxic')
        .rename(columns={'less_toxic': 'text'})
        .assign(target=0)
)
more_toxic = (
    data.drop(columns='less_toxic')
        .rename(columns={'more_toxic': 'text'})
        .assign(target=1)
)

# Stack both sides into a single long frame.
comments = pd.concat([less_toxic, more_toxic], axis=0)

# Average the 0/1 labels per unique comment text.
# The former unseeded `sample(frac=1)` shuffle was removed: groupby sorts its
# result by `text`, so the shuffle never affected `comments` and only made the
# cell depend on the global random state.
comments = comments.groupby('text')['target'].mean().reset_index()

# Preview the aggregated frame.
print(comments.head())

# Persist the per-comment average scores (without the row index).
comments.to_csv("comments.csv", index=False)

In [None]:
# #remap target values 

comments['target'] = (comments['target'] > 0.0) * 1

#histogram of toxic values per comment 
%matplotlib inline
sns.histplot(data=comments, x="target", binwidth=.5)
plt.show()


In [None]:
# Shared resources for the text-cleaning / tokenising function defined below.

# Punctuation characters; not referenced anywhere else in the visible notebook
# (to_token uses string.punctuation instead).
punc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~__'
# Vocabulary from the NLTK `words` corpus; not referenced by to_token.
words = set(nltk.corpus.words.words())
# English stop-word list used by to_token to filter tokens.
stop = stopwords.words('english')
# Snowball stemmer used by to_token to reduce tokens to their stems.
stemmer = SnowballStemmer(language='english')
# Vectorizer that expects token lists (joined back with spaces); the later cells
# build a separate `tfidf` instance and do not use this one.
cv_tfidf = TfidfVectorizer(preprocessor=' '.join)





# Built once instead of on every call to to_token.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = frozenset(stop)

# Per-token clean-up substitutions, applied in this order. The former
# '/_/g' pattern was JavaScript regex syntax; in Python it matched only the
# literal text "/_/g", which cannot occur once punctuation is stripped.
_TOKEN_SCRUBBERS = tuple(re.compile(pattern) for pattern in (
    r'http\S+',                              # URL remnants
    '[0-9+]',                                # digits and plus signs
    '_',                                     # underscores
    "\n",                                    # newlines
    '[^\u0000-\u05C0\u2100-\u214F]+',        # characters outside the kept ranges
    '[\u0401\u0451\u0410-\u044f]',           # Cyrillic letters
))


def to_token(text):
    """Clean a raw comment and split it into normalised tokens.

    Steps: strip ASCII punctuation, tokenise with word_tokenize, drop stop
    words, stem, scrub URLs/digits/unwanted characters, lowercase, drop stop
    words again, and finally remove repeated characters inside each token.

    Args:
        text: The raw comment as a single string.

    Returns:
        A list of cleaned, non-empty token strings (possibly empty list).
    """
    tokens = word_tokenize(text.translate(_PUNCT_TABLE))

    # Filter stop words BEFORE stemming: the stop list holds unstemmed words,
    # so checking only after stemming lets stop words through whenever their
    # stem differs from the list entry.
    tokens = [tok for tok in tokens if tok.lower() not in _STOP_WORDS]

    tokens = [stemmer.stem(tok) for tok in tokens]

    for scrubber in _TOKEN_SCRUBBERS:
        tokens = [scrubber.sub('', tok) for tok in tokens]

    # Normalise case, then repeat the stop-word check because scrubbing can
    # turn a token into a stop word; also drop tokens scrubbed down to ''.
    tokens = [tok.lower() for tok in tokens]
    tokens = [tok for tok in tokens if tok and tok not in _STOP_WORDS]

    # NOTE(review): this removes every repeated character in a token, not only
    # consecutive runs (e.g. 'hello' -> 'helo'); kept unchanged - confirm intent.
    return ["".join(dict.fromkeys(tok)) for tok in tokens]

In [None]:
# Separate the model input (raw comment text) from the binary label.
X = comments['text']
y = comments['target']
print(X.shape, y.shape)



In [None]:
# Hold out 20% of the comments for evaluation; the seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y - confirm that is acceptable
# given the class-weight tuning done later.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=33,
)


In [None]:
# Materialise the train/test comment texts as plain Python lists.
train_corpus = list(X_train)
test_corpus = list(X_test)

# Run every comment through the cleaning/tokenising function; each entry of
# the resulting lists is the token list for one comment.
train_features = list(map(to_token, train_corpus))
test_features = list(map(to_token, test_corpus))

In [None]:
# Build TF-IDF matrices for the train and test token lists.
# `preprocessor=' '.join` turns each token list back into one space-separated
# string before the vectorizer processes it.
tfidf = TfidfVectorizer(preprocessor=' '.join)

# Vocabulary and IDF weights are learned from the training comments only;
# the test comments are transformed with that fitted vectorizer.
train_tfidf = tfidf.fit_transform(train_features)
test_tfidf = tfidf.transform(test_features)




In [None]:
# Reduce the TF-IDF matrices to 25 components with truncated SVD (seeded).
svd = TruncatedSVD(n_components=25, random_state=33)

# The decomposition is fitted on the training matrix only and then applied to
# the test matrix.
train_svd = svd.fit_transform(train_tfidf)
test_svd = svd.transform(test_tfidf)



In [None]:
# Standardise the SVD components.
# The scaler is fitted on the training features and reused unchanged on the
# test features.

scale=StandardScaler()
X_train_scaled = scale.fit_transform(train_svd)
X_test_scaled = scale.transform(test_svd)



In [None]:
# Relative frequency of each label (0/1) in the training split.
y_train.value_counts(normalize=True)

In [None]:
# Baseline classifier: logistic regression with library-default settings.
lr_basemodel =LogisticRegression()

In [None]:
# Fit the baseline on the scaled training features, then predict hard 0/1
# labels for the scaled test features.
lr_basemodel.fit(X_train_scaled,y_train)
y_pred_basemodel = lr_basemodel.predict(X_test_scaled)

In [None]:
# Baseline metrics on the held-out split.
# F1 is a threshold metric, so it is computed from the hard 0/1 predictions.
# ROC AUC is a ranking metric and needs continuous scores: feeding it hard labels
# collapses the curve to a single operating point and misreports the AUC. The
# class-1 probabilities are used instead, matching how the tuned model is scored.
y_prob_basemodel = lr_basemodel.predict_proba(X_test_scaled)[:, 1]
print("f1 score for base model is : " , f1_score(y_test,y_pred_basemodel))
print("ROC AUC score for base model is : ", roc_auc_score(y_test, y_prob_basemodel))

In [None]:
# Hyperparameter tuning for the logistic regression.
lr = LogisticRegression()

# Candidate class-weight splits: class 0 receives weight w and class 1 receives
# 1 - w, for 500 evenly spaced values of w between 0.0 and 0.99.
# NOTE(review): the first grid point gives class 0 a weight of exactly 0.0 -
# confirm that a zero class weight is intended.
weights = np.linspace(0.0, 0.99, 500)

# Full search space: regularisation strength, penalty type and class weights.
param = {
    'C': [0.1, 0.5, 1, 10, 15, 20],
    'penalty': ['l2'],
    "class_weight": [{0: w, 1: 1.0 - w} for w in weights],
}

# Seeded, shuffled 5-fold stratified cross-validation.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive grid search scored by ROC AUC; train-fold scores are kept as well.
model = GridSearchCV(
    estimator=lr,
    param_grid=param,
    scoring="roc_auc",
    cv=folds,
    return_train_score=True,
)

# Run the search on the scaled training features.
model.fit(X_train_scaled, y_train)

In [None]:
# Report the outcome of the grid search: the best score (scoring was set to
# "roc_auc") and the parameter combination that achieved it.
print("Best ROC AUC score: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
# Build the final model from the grid-search winner.
# The parameters are read from `model.best_params_` instead of being pasted in
# from an earlier run's printed output, so this cell stays correct whenever the
# data, the features or the search grid change.
lr2 = LogisticRegression(**model.best_params_)
lr2.fit(X_train_scaled, y_train)

In [None]:
# Class-1 probability (second predict_proba column) for every test comment.
y_pred_prob_test = lr2.predict_proba(X_test_scaled)[:, 1]
# Hard 0/1 labels for the same comments.
y_pred_test = lr2.predict(X_test_scaled)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred_test)
print("confusion Matrix is : \n\n", cm)
print("\n")

# Remaining test metrics, printed in a fixed order. ROC AUC is scored on the
# probabilities; precision, recall and F1 on the hard labels. Each metric is
# evaluated right before it is printed.
for metric_label, scorer, scores in (
    ("ROC-AUC score  test dataset:  ", roc_auc_score, y_pred_prob_test),
    ("precision score  test dataset:  ", precision_score, y_pred_test),
    ("Recall score  test dataset:  ", recall_score, y_pred_test),
    ("f1 score  test dataset :  ", f1_score, y_pred_test),
):
    print(metric_label, scorer(y_test, scores))

In [None]:
# Wrap the held-out comment texts in a DataFrame so that label and prediction
# columns can be attached next to them in the following cell.
# Note: this rebinds the name X_test.
X_test = pd.DataFrame(X_test)
X_test.head()

In [None]:
# Per-comment results table: the test text plus true label, predicted class-1
# probability and predicted label (values are attached positionally as lists).
test_output = X_test.assign(
    target=y_test.tolist(),
    model_probability=y_pred_prob_test.tolist(),
    prediction=y_pred_test.tolist(),
)

# Comments whose true label is 0 but which the model scored above 0.50.
miss_0 = test_output.loc[
    (test_output['target'] == 0) & (test_output['model_probability'] > 0.50)
]
miss_0.head()

In [None]:
# Comments whose true label is 1 but which the model scored below 0.50.
# A probability of exactly 0.50 is matched by neither this filter nor the
# `> 0.50` filter used for miss_0.
miss_1 =  test_output.loc[(test_output['target'] == 1) & (test_output['model_probability']<0.50)]
miss_1.head()

In [None]:
# Save the per-comment results table (text, target, probability, prediction).
# No index=False is passed here, unlike the comments.csv export earlier.

test_output.to_csv("lr_output.csv")

In [None]:
 # Fitted coefficients of the tuned model, transposed for display; lr2 was
 # trained on the scaled SVD components, so there is one coefficient per component.
 lr2.coef_.T

In [None]:
# Inspect one example: row at position 3, first column (the comment text) of the
# label-1 comments scored below 0.50. Requires miss_1 to have at least four rows.
miss_1.iloc[3,0]

In [None]:
# Inspect one example: row at position 3, first column (the comment text) of the
# label-0 comments scored above 0.50. Requires miss_0 to have at least four rows.
miss_0.iloc[3,0]