# Imports

In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
import lightgbm as lgb

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.neural_network import MLPRegressor
import re 
import scipy
from scipy import sparse
import gc 

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 
import pickle


import time
import scipy.optimize as optimize
import warnings


from datetime import datetime 
import psutil

warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100


In [4]:
def timer(func):
    def wrapper(*args, **kws):
        st = time.time()
        res = func(*args, **kws)
        et = time.time()
        tt = (et-st)/60
        print(f'Time taken is {tt:.2f} mins')
        return res
    return wrapper


In [5]:
# first competition dataset

df_class_train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
# df_class_test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
# df_class_test_label=pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv").replace(-1,0)
# df_class_test = pd.merge(df_class_test, df_class_test_label, how="left", on = "id")
df_class_train['severe_toxic'] = df_class_train.severe_toxic * 2
df_class_train['y'] = (df_class_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int) 
df_class_train['y'] = df_class_train['y']/df_class_train['y'].max()
df_class_train = df_class_train[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

# df_class_train[df_class_train.y>0].sort_values(by=['y'],ascending=False)

In [6]:
# large_train is add the worker infomation
# df_bias_large_train=pd.read_csv(('../input/jigsaw-unintended-bias-in-toxicity-classification/toxicity_individual_annotations.csv'))

df_bias_train=pd.read_csv(('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'))

toxic_relevance=['severe_toxicity', 'obscene','identity_attack', 'insult', 'threat']
# df_bias_train=df_bias_train[toxic_relevance]
df_bias_train['severe_toxicity'] = df_bias_train.severe_toxicity * 2
df_bias_train['y'] = (df_bias_train[toxic_relevance].sum(axis=1) ).astype(int) 
df_bias_train['y'] = df_bias_train['y']/df_bias_train['y'].max()
df_bias_train = df_bias_train[['comment_text', 'y']].rename(columns={'comment_text': 'text'})


toxic_count=df_bias_train[df_bias_train.y>0].shape[0]
sample_number=int(2e5)
df_bias_train=df_bias_train.sample(n=sample_number,random_state=1, axis=0).append(df_bias_train[df_bias_train.y>0])

# df_bias_train

In [7]:
# third competition dataset
df_multi_train = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
df_multi_train['severe_toxic'] = df_multi_train.severe_toxic * 2
df_multi_train['y'] = (df_multi_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)
df_multi_train['y'] = df_multi_train['y']/df_multi_train['y'].max()
df_multi_train = df_multi_train[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
# df_multi_train

In [8]:
# print(len(set(df_class_train.text.unique()) & set(df_multi_train.text.unique())))
# print(len(set(df_class_train.text.unique())))
# print(len(set(df_multi_train.text.unique())))
# print(len(set(df_bias_train.text.unique()) & set(df_multi_train.text.unique())))

In [9]:
# ruddit dataset 
df_ruddit_train=pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
df_ruddit_train = df_ruddit_train[['txt', 'offensiveness_score']].rename(columns={'txt': 'text',
                                                                'offensiveness_score':'y'})
df_ruddit_train['y'] = (df_ruddit_train['y'] - df_ruddit_train.y.min()) / (df_ruddit_train.y.max() - df_ruddit_train.y.min()) 
# df_ruddit_train.shape

In [10]:

df_rate_score=pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_rate_sub=pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
df_rate_val=pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [24]:
class DataCleaner_v0(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args and **kwargs
        super().__init__()
    
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X, y=None):   

        # Clean some punctutations
        data_temp=X.copy()
        data_temp = data_temp.str.replace('\n', ' \n ')
        # Remove ip address
        data_temp =    data_temp.str.replace(r'(([0-9]+\.){2,}[0-9]+)',' ')
        data_temp = data_temp.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)',r'\1 \2 \3')
        # Replace repeating characters more than 3 times to length of 3
        data_temp= data_temp.str.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1')
        # patterns with repeating characters 
        data_temp = data_temp.str.replace(r'([a-zA-Z])\1{2,}\b',r'\1\1')
        data_temp =data_temp.str.replace(r'([a-zA-Z])\1\1{2,}\B',r'\1\1\1')
        data_temp = data_temp.str.replace(r'[ ]{2,}',' ').str.strip()   
        # Add space around repeating characters
        data_temp =data_temp.str.replace(r'([*!?\']+)',r' \1 ')     
        return data_temp


class DataCleaner_v1(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args and **kwargs
        super().__init__()
    
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X, y=None):   
        '''
                Cleans text into a basic form for NLP. Operations include the following:-
                1. Remove special charecters like &, #, etc
                2. Removes extra spaces
                3. Removes embedded URL links
                4. Removes HTML tags
                5. Removes emojis
                
                text - Text piece to be cleaned.
                '''

        data_temp=X.copy()
        data_temp=data_temp.str.replace(r'https?://\S+|www\.\S+',r'')

        # soup = BeautifulSoup(data[col].str., 'lxml') #Removes HTML tags
        # only_text = soup.get_text()
        # data[col] = only_text

        data_temp=data_temp.str.replace("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport &  symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", r'')
        data_temp=data_temp.str.replace(r"[^a-zA-Z\d]", " ")#Remove Extra Spaces
        data_temp=data_temp.str.replace(' +', ' ')#Remove Extra Spaces
        data_temp = data_temp.str.strip()  
        return data_temp


In [21]:
a=DataCleaner_v0()
print(a.transform(test_clean_df.text))
display(test_clean_df)


0                                 heyy\n\nkkdsfj
1                         hi   how/are/you  ??? 
2                                     hey ????? 
3                 hey ?????  18.98.333.20 18.98.
4               noooo !!!!!!!!!    comeone  !!  
5    cooooooooool     brooooooooooo  coool brooo
6                                   naaaahhhhhhh
Name: text, dtype: object


Unnamed: 0,text
0,heyy\n\nkkdsfj
1,hi how/are/you ???
2,hey?????
3,hey????? 18.98.333.20 18.98.
4,noooo!!!!!!!!! comeone !!
5,cooooooooool brooooooooooo coool brooo
6,naaaahhhhhhh


In [26]:

# tfidf_dict=[
#             {"tfidf0":TfidfVectorizer(min_df= 3, max_df=0.5,analyzer = 'char_wb', ngram_range = (3,5))},
#             {"tfidf1":TfidfVectorizer(min_df= 3, max_df=0.5,max_features=2000,analyzer = 'char_wb', ngram_range = (3,5))},
#             {"tfidf2":TfidfVectorizer(min_df= 3, max_df=0.5,analyzer = 'word', ngram_range = (1,2))},
#             {"tfidf3":TfidfVectorizer(min_df= 3, max_df=0.5,analyzer = 'word', ngram_range = (1,5))},
# ]

clean_dict={
            "clean0":DataCleaner_v0(),
            "clean1":DataCleaner_v1(),
}


tfidf_dict={
            "tfidf0":TfidfVectorizer(min_df= 3, max_df=0.5,analyzer = 'char_wb', ngram_range = (3,5)),
            "tfidf1":TfidfVectorizer(min_df= 3, max_df=0.5,max_features=2000,analyzer = 'char_wb', ngram_range = (3,5)),
            "tfidf2":TfidfVectorizer(min_df= 3, max_df=0.5,analyzer = 'word', ngram_range = (1,2)),
            "tfidf3":TfidfVectorizer(min_df= 3, max_df=0.5,analyzer = 'word', ngram_range = (1,5)),
}

model_dict={
            # "lasso":Lasso(),
            "ridge":Ridge(),
            "svm":SVR(kernel="linear"),
            "mlp":MLPRegressor(),            
            "lgb": lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
}

train_data_dict={'class_train':df_class_train ,
'bias_train':df_bias_train,
'multi_train':df_multi_train,
'ruddit_train':df_ruddit_train}


pipline=Pipeline(
    
    [
        (list(tfidf_dict)[0],list(tfidf_dict.values())[0]),
        (list(model_dict)[1],list(model_dict.values())[1]),
    ]
)




# vectorizer.fit_transform(a.text)
# len(vectorizer.get_feature_names())
# vectorizer.get_feature_names()

In [28]:
record_path='./record.csv'
                

i=0
for a in range(0,len(list(train_data_dict))):
    for b in range(0,len(list(clean_dict))):
        for c in range(0,len(list(tfidf_dict))):
            for d in range(0,len(list(model_dict))):
                
                print(u'当前进程的内存使用：%.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024) )               
                record=pd.read_csv(record_path)
                
                if i<record.shape[0]:
                    i+=1
                    continue
                
                print("Version ",i)
                data_name=list(train_data_dict)[a]
                data_model=list(train_data_dict.values())[a]

                clean_name=list(clean_dict)[b]
                clean_model=list(clean_dict.values())[b]

                idf_name=list(tfidf_dict)[c]
                idf_model=list(tfidf_dict.values())[c]

                regression_name=list(model_dict)[d]
                regression_model=list(model_dict.values())[d]


                
                data_size=data_model.shape[0]
          


                size =record.index.size



                pipeline=Pipeline(
                [
                    (clean_name,clean_model),
                    (idf_name,idf_model),
                    (regression_name,regression_model),
                ]
                )
                    

                n_folds=1
                val_preds_arr1_tmp = np.zeros((df_rate_val.shape[0], n_folds))
                val_preds_arr2_tmp = np.zeros((df_rate_val.shape[0], n_folds))
                test_preds_arr_tmp = np.zeros((df_rate_sub.shape[0], n_folds))

                for fld in range(n_folds):

                
                    print(f' ****************************** FOLD: {fld} ******************************')
                    # for test
                    df = data_model.iloc[0:5000,:]

                    #real train
                    # df = data_model
                    # Train the pipeline
                    print('Start Train')
                    start=datetime.now() 
                    pipeline.fit(df['text'], df['y'])
                   
                    print(u'当前进程的内存使用：%.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024) )    

                    # What are the important features for toxicity
                    feature_number=len(pipeline[idf_name].get_feature_names())
                    print('Total number of features:', feature_number )

                    # feature_wts = sorted(list(zip(pipeline[idf_name].get_feature_names(), 
                    #                                 np.round(pipeline[regression_name].coef_,2) )), 
                    #                         key = lambda x:x[1], 
                    #                         reverse=True)

                    # display(pd.DataFrame(feature_wts[:50], columns = ['feat','val']).T)
                    #.plot('feat','val',kind='barh',figsize = (8,8) )
                    #plt.show()
                    print("predict validation data ")
                    val_preds_arr1_tmp[:,fld] = pipeline.predict(df_rate_val['less_toxic'])
                    val_preds_arr2_tmp[:,fld] = pipeline.predict(df_rate_val['more_toxic'])

                    # print("\npredict test data ")
                    # test_preds_arr_tmp[:,fld] = pipeline.predict(clean_v0(df_rate_sub,'text')['text'])

                    p1 = val_preds_arr1_tmp.mean(axis=1)
                    p2 = val_preds_arr2_tmp.mean(axis=1)
                    

                    val_score=np.round((p1 < p2).mean(),4)
                    print(f'Validation Accuracy is {val_score}')
                    
                    end=datetime.now() 
                    trainning_time=(end-start).seconds
                    
                    model_path='./model/{}.pkl'.format(i)

                    record.loc[size] = [data_name,data_size,clean_name,idf_name,feature_number,regression_name,trainning_time,val_score,model_path]

                    with open('./model/{}.pkl'.format(i),'wb') as fw:
                        pickle.dump(pipeline,fw)

                record.to_csv(record_path,index=False)
                i+=1
    #加载svm.pickle
# with open('new_app_model_v1.pickle','rb') as fr:
#     newpipeline = pickle.load(fr)  
# newpipeline.predict(df_rate_val['more_toxic'])

当前进程的内存使用：1.0674 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
当前进程的内存使用：1.0675 GB
Version  9
 ****************************** FOLD: 0 ******************************
Start Train
当前进程的内存使用：1.1158 GB
Total number of features: 21678
predict validation data 
Validation Accuracy is 0.654
当前进程的内存使用：1.1405 GB
Version  10
 ****************************** FOLD: 0 ******************************
Start Train
当前进程的内存使用：1.1849 GB
Total number of features: 21678
predict validation data 
Validation Accuracy is 0.639
当前进程的内存使用：1.1852 GB
Version  11
 ****************************** FOLD: 0 ******************************
Start Train
当前进程的内存使用：1.1880 GB
Total number of features: 21678
predict validation data 
Validation Accuracy is 0.6168
当前进程的内存使用：1.1900 GB
Version  12
 ****************************** FOLD: 0 ******************************
Start Train
当前进程的内存使用：1.3020 GB
Total number of featur