In [None]:
import numpy as np
import pandas as pd
import gc
import re
import string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tqdm.auto import tqdm
from collections import OrderedDict
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge  # , Lasso, BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

## Load data

In [None]:
# Run a full garbage-collection pass before the datasets are loaded.
# The cell displays gc.collect()'s return value.
gc.collect()

In [None]:
# Training data from the Jigsaw Toxic Comment Classification Challenge.
# Later cells read its `comment_text` column and the label columns
# `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, `identity_hate`.
train_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')

In [None]:
# Comments that have to be scored for the severity-rating competition.
# NOTE(review): `target_df` is not referenced again in the visible cells; the
# same CSV is read a second time into `submission_df` further down.
target_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# English stop words that `clean_text` drops from every comment.
# Stored as a set so membership tests are O(1); all entries are lower-case.
# NOTE(review): this looks like a hard-coded copy of NLTK's English stop-word
# list (kept inline so no corpus download is needed) — confirm.
stop_words = {'over', 'is', 'than', 'can', 'these', "isn't", 'so', 'my', 'each', 'an', 'between',
              'through', 'up', 'where', 'hadn', 'very', "you'll", 'while', "weren't", 'too', 'doesn',
              'only', 'needn', 'has', 'just', 'd', 'some', 'into', 've', 'didn', 'further', 'why', 
              'mightn', 'and', 'haven', 'own', "mightn't", 'during', 'both', 'me', 'shan', "doesn't",
              'theirs', 'herself', 'the', 'few', 'our', 'its', 'yourself', 'under', 'at', "you've",
              're', 'themselves', 'y', 'ma', 'because', 'him', 'above', 'such', 'we', "wouldn't", 'of',
              'from', 'hers', 'nor', "shouldn't", 'a', 'hasn', 'them', 'myself', 'this', 'being', 
              'your', 'those', 'i', 'if', 'couldn', 'not', 'will', 'it', 'm', 'to', 'isn', 'aren', 
              'when', 'o', 'about', 'their', 'more', 'been', "needn't", 'had', 'll', 'most', 'against', 
              'once', 'how', "didn't", "shan't", 'there', 'all', "should've", 'he', "don't", 'she', 
              'which', 'below', 'on', 'no', 'yourselves', "wasn't", 'shouldn', 'by', 'be', 'have', 
              'does', "aren't", 'itself', 'same', 'should', 'in', 'before', 'am', "won't", 'having', 
              "you'd", 'mustn', 'for', "that'll", 'that', "couldn't", 'wasn', 'won', "hasn't", 'as', 
              'until', 'wouldn', "mustn't", 'his', 'ain', "you're", 'out', "she's", 'other', 'are',
              't', 'you', 'off', 'yours', 'ourselves', 'himself', 'down', "haven't", 'ours', 'now',
              "hadn't", 'do', 's', 'her', 'with', "it's", 'then', 'weren', 'any', 'after', 'whom',
              'what', 'who', 'but', 'again', 'here', 'did', 'doing', 'were', 'they', 'was', 'or', 'don'}

In [None]:
# (regex, replacement) pairs that expand English contractions. REReplacer
# applies them in list order, so the irregular forms ("won't", "can't", "i'm")
# must precede the generic `n't` rule, which would otherwise produce
# "wo not" / "ca not".
# Every literal is a raw string: the replacement templates contain `\g<1>`,
# which is an invalid escape sequence in a normal string literal (a
# DeprecationWarning today, a SyntaxWarning from Python 3.12). The runtime
# values are unchanged.
patterns = [
   (r'won\'t', r'will not'),
   (r'can\'t', r'cannot'),
   (r'i\'m', r'i am'),
   (r'(\w+)\'ll', r'\g<1> will'),
   (r'(\w+)n\'t', r'\g<1> not'),
   (r'(\w+)\'ve', r'\g<1> have'),
   (r'(\w+)\'s', r'\g<1> is'),
   (r'(\w+)\'re', r'\g<1> are'),
]

In [None]:
class REReplacer(object):
    """Apply an ordered list of regex substitutions to a string.

    Args:
        pattern: iterable of ``(regex, replacement)`` pairs. When omitted,
            the notebook-level ``patterns`` list is used.
    """

    def __init__(self, pattern=None):
        # The original always compiled the global `patterns`, silently
        # ignoring the argument. The default is resolved here, at call time,
        # so a caller-supplied list is honoured.
        if pattern is None:
            pattern = patterns
        self.pattern = [(re.compile(regex), repl) for (regex, repl) in pattern]

    def replace(self, text):
        """Return `text` with every substitution applied, in list order."""
        s = text
        for (regex, repl) in self.pattern:
            s = regex.sub(repl, s)
        return s

In [None]:
# Length threshold used by `clean_text`: tokens longer than this reference
# word are passed through without stemming/lemmatizing.
_reference_word = 'Supercalifragilisticexpialidocious'
longest_word_len = len(_reference_word)

In [None]:
def clean_text(text):
    """Normalise a raw comment into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Remove literal backslash-n sequences, @mentions, ``$``, URLs,
         dotted-quad IP addresses and ``#``.
      2. Lower-case the text and expand contractions with ``REReplacer``.
      3. Replace every character outside ``[a-zA-Z\\d]`` with a space.
      4. Drop tokens found in ``stop_words``.
      5. Porter-stem and then WordNet-lemmatize each remaining token; tokens
         longer than ``longest_word_len`` are kept unchanged.

    Args:
        text (str): the raw comment.

    Returns:
        str: cleaned tokens joined by single spaces (may be empty).
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r'\\n', ' ', text)  # literal "\n" escape sequences, not real newlines
    text = re.sub(r"@\S+", '', text)  # remove @nicknames
    text = re.sub(r"\$", '', text)  # remove $
    # Remove only the URL itself. The previous pattern (`https?:\/\/.*`)
    # swallowed everything from the URL to the end of the line.
    text = re.sub(r"https?://\S+", '', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', text)  # remove IPs
    text = re.sub(r'#', '', text)  # remove hashtag marks

    # Contractions must be expanded while apostrophes still exist and after
    # lower-casing (the patterns are lower-case). Previously this ran after
    # the special-character strip and therefore never matched anything.
    text = REReplacer().replace(text.lower())

    # Everything that is not a letter or digit becomes a space. This already
    # removes all punctuation and all emoji, so the former
    # `string.punctuation` loop and emoji regex were no-ops and are gone.
    text = re.sub(r"[^a-zA-Z\d]", " ", text)

    # Text is lower-case here, so capitalised stop words ("The") are dropped too.
    tokens = [word for word in text.split() if word not in stop_words]
    tokens = [
        lemmatizer.lemmatize(stemmer.stem(word)) if len(word) <= longest_word_len else word
        for word in tokens
    ]

    # Join with a space: the previous `''.join` fused the whole comment into
    # a single word, destroying the word boundaries the vectoriser relies on.
    return ' '.join(tokens)

In [None]:
# Register tqdm's `progress_apply` on pandas objects, then clean every
# training comment (with a progress bar) into a new `clean_text` column.
tqdm.pandas()
train_df['clean_text'] = train_df['comment_text'].progress_apply(clean_text)

In [None]:
# Collapse the six label columns into one severity target: a weighted sum,
# accumulated in the listed order, then divided by its maximum so the
# largest value is 1.
label_weights = [
    ('toxic', 1.0),
    ('severe_toxic', 1.7),
    ('obscene', 1.2),
    ('threat', 1.5),
    ('insult', 1.3),
    ('identity_hate', 1.5),
]

first_label, first_weight = label_weights[0]
weighted_total = train_df[first_label] * first_weight
for label, weight in label_weights[1:]:
    weighted_total = weighted_total + train_df[label] * weight

train_df['y'] = weighted_total / weighted_total.max()

# Preview some rows that carry a non-zero target.
train_df.loc[train_df['y'] != 0].head()

In [None]:
%%time
# TF-IDF over character n-grams of length 3 to 5; `char_wb` builds n-grams
# only from text inside word boundaries (scikit-learn documentation).
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5))
# Learn the vocabulary / IDF weights and vectorise the cleaned training text.
tfidf_fit = vec.fit_transform(train_df['clean_text'])
# Display the resulting matrix summary.
tfidf_fit

In [None]:
# %%time
# for alpha in [0.1, 0.5, 1, 2]:
#     model = Ridge(alpha=alpha)
#     a = cross_val_score(model, tfidf_fit, train_df['toxic'], cv=3)
#     print(a)

In [None]:
# Ridge regressor used to predict the severity target `y`.
# The commented-out lines are an alternative alpha and an optional 3-fold
# cross-validation check.
# model = Ridge(alpha=1.1315789473684211)
model = Ridge(alpha=1)
# cross_val_score(model, tfidf_fit, train_df['y'], cv=3)

In [None]:
%%time
# Fit the Ridge model on the TF-IDF training matrix against target `y`.
model.fit(tfidf_fit, train_df['y'])

In [None]:
# Print the 20 character n-grams with the largest absolute Ridge coefficients.
# The vocabulary is fetched once: the original called `get_feature_names()`
# inside the loop, rebuilding the full list on every iteration.
# `get_feature_names_out` is preferred because `get_feature_names` was
# removed in scikit-learn 1.2; the fallback keeps older versions working.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
sorted_index_array = np.argsort(-np.abs(model.coef_))
top = sorted_index_array[:20]
for idx in top:
    print(feature_names[idx])

In [None]:
# Pairwise validation data; later cells read its `less_toxic` and
# `more_toxic` text columns.
validation_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [None]:
# Preview the raw validation pairs.
validation_df.head()

In [None]:
# Clean both sides of every validation pair with the same `clean_text`
# routine used for the training text.
tqdm.pandas()
for source_col, cleaned_col in (('less_toxic', 'clean_less'), ('more_toxic', 'clean_more')):
    validation_df[cleaned_col] = validation_df[source_col].progress_apply(clean_text)

In [None]:
# Preview the validation pairs together with their cleaned text columns.
validation_df.head()

In [None]:
%%time
# Vectorise both cleaned validation columns with the already-fitted TF-IDF
# vocabulary (transform only, no refitting).
x_less_toxic, x_more_toxic = (
    vec.transform(validation_df[cleaned_col])
    for cleaned_col in ('clean_less', 'clean_more')
)

In [None]:
# Score each side of the validation pairs with the fitted Ridge model.
predlt, predmt = (
    model.predict(features) for features in (x_less_toxic, x_more_toxic)
)

In [None]:
# Share of validation pairs in which the `less_toxic` comment received a
# strictly lower score than its `more_toxic` counterpart.
np.mean(predlt < predmt)

In [None]:
# Comments to score for the submission; later cells read its `text` and
# `comment_id` columns.
submission_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# The vectoriser and model were fitted (and validated) on `clean_text`
# output, so the submission text must go through the same preprocessing.
# The original vectorised the raw `text` column, a train/test mismatch.
# `progress_apply` is available because `tqdm.pandas()` was called earlier.
test_clean = submission_df['text'].progress_apply(clean_text)
x_test = vec.transform(test_clean)
pred3 = model.predict(x_test)

In [None]:
# Attach the model predictions to the submission frame.
submission_df['score'] = pred3

In [None]:
# Preview the scored comments.
submission_df.head()

In [None]:
# Number of non-null scores.
submission_df['score'].count()

In [None]:
# Number of distinct scores; a value below the count above means some
# comments are tied.
submission_df['score'].nunique()

In [None]:
# Rescale every score by a factor of one million (vectorised multiply).
# NOTE: this overwrites `score` in place, so re-running the cell scales again.
submission_df['score'] = submission_df['score'].mul(1000000)

In [None]:
# Scores that repeat the score of an earlier row (`duplicated` leaves the
# first occurrence of each value unmarked).
dup_scores = submission_df[submission_df.duplicated('score')]['score']
# The previous probe, `dup_scores.iloc[0] in np.array(dup_scores)`, was
# trivially True and raised IndexError when there were no ties at all.
# Show how many rows are affected instead.
len(dup_scores)

In [None]:
# Show every row whose score is shared with at least one other row, grouped
# by score. This replaces a loop that re-filtered the whole frame once per
# duplicate and printed the same group repeatedly as plain text.
submission_df[submission_df['score'].duplicated(keep=False)].sort_values('score')

In [None]:
# Break ties: add a small random offset (same range as before,
# randint(-100000, 100000) / 1e8) to every row whose score occurs more than
# once; unique scores are copied unchanged.
# Changes from the original:
#   * the tie mask is computed once with `duplicated(keep=False)` instead of
#     rebuilding `np.array(dup_scores)` and scanning it for every row;
#   * a seeded generator makes the submission reproducible across runs.
tie_rng = np.random.default_rng(0)
tied = submission_df['score'].duplicated(keep=False)
jitter = tie_rng.integers(-100000, 100000, size=int(tied.sum())) / 100000000
submission_df['randd_score'] = submission_df['score']
submission_df.loc[tied, 'randd_score'] = submission_df.loc[tied, 'score'] + jitter

In [None]:
# Any rows listed here still share a jittered score with an earlier row;
# an empty frame means all ties were broken.
submission_df.loc[submission_df['randd_score'].duplicated()]

In [None]:
# Final look at the submission frame before it is written out.
submission_df.head()

In [None]:
# Write the submission file: `comment_id` plus the tie-broken score, exposed
# under the column name `score`, without the index.
submission_out = submission_df[['comment_id', 'randd_score']].rename(columns={'randd_score': 'score'})
submission_out.to_csv("submission.csv", index=False)