In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import sklearn.metrics as sm
from scipy.sparse import hstack

In [2]:
# class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

total = pd.read_csv('./data/newAPIs2.csv').fillna(' ')
# test = pd.read_csv('../data/test.csv').fillna(' ')

domain = pd.read_csv('./data/tagnet.csv')

class_names = domain['source'].drop_duplicates().values

test = total.sample(frac=0.2,axis=0,random_state=0)
train = total[~total['id'].isin(test['id'].values)]


train_text = train['descr']
test_text = test['descr']
all_text = pd.concat([train_text, test_text])

In [None]:
labels = dict()
for cla in tqdm(class_names):
    clist = list()
    for idx, row in train.iterrows():
        if row['tags2'].find(cla) == -1:
            clist.append(0)
        else:
            clist.append(1)
    labels[cla] = clist


 73%|██████████████████████████████████████████████████████████▏                     | 343/472 [03:24<01:16,  1.68it/s]

In [None]:
label2 = labels.copy()
class_names2 = list(class_names) 

In [None]:
key = list(labels.keys())
for k in key:
    if sum(labels[k]) < 100:
        labels.pop(k)
#         print(k)
#         class_names2.remove(k)

In [None]:
class_names2 = list(labels.keys())
len(class_names2)

In [3]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)



char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])



In [4]:
train_features

<10335x60000 sparse matrix of type '<class 'numpy.float64'>'
	with 14441400 stored elements in COOrdinate format>

In [61]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in tqdm(class_names2):
    train_target = labels[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag')

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

  0%|                                                                                           | 0/97 [00:00<?, ?it/s]

CV score for class Mapping is 0.9631280919365904


  1%|▊                                                                                  | 1/97 [00:09<15:11,  9.50s/it]

CV score for class Other is 0.7532929400414771


  2%|█▋                                                                                 | 2/97 [00:19<15:20,  9.68s/it]

CV score for class Data is 0.8586120995563059


  3%|██▌                                                                                | 3/97 [00:28<14:57,  9.55s/it]

CV score for class Tools is 0.7916747115747592


  4%|███▍                                                                               | 4/97 [00:38<15:03,  9.72s/it]

CV score for class Design is 0.8715832140958685


  5%|████▎                                                                              | 5/97 [00:48<14:45,  9.63s/it]

CV score for class eCommerce is 0.9295901420850127


  6%|█████▏                                                                             | 6/97 [00:57<14:36,  9.63s/it]

CV score for class Authentication is 0.9379073109136743


  7%|█████▉                                                                             | 7/97 [01:07<14:24,  9.60s/it]

CV score for class Education is 0.9425055191424064


  8%|██████▊                                                                            | 8/97 [01:16<14:14,  9.60s/it]

CV score for class Games is 0.9637400491606757


  9%|███████▋                                                                           | 9/97 [01:25<13:54,  9.49s/it]

CV score for class Audio is 0.9644096992355312


 10%|████████▍                                                                         | 10/97 [01:34<13:45,  9.49s/it]

CV score for class Real Time is 0.8756807122540758


 11%|█████████▎                                                                        | 11/97 [01:45<13:41,  9.56s/it]

CV score for class Internet of Things is 0.9463129063822903


 12%|██████████▏                                                                       | 12/97 [01:55<13:37,  9.61s/it]

CV score for class Monitoring is 0.9263413354322445


 13%|██████████▉                                                                       | 13/97 [02:06<13:34,  9.70s/it]

CV score for class Video is 0.9672030378468296


 14%|███████████▊                                                                      | 14/97 [02:16<13:29,  9.76s/it]

CV score for class Recognition is 0.9721784145483058


 15%|████████████▋                                                                     | 15/97 [02:26<13:19,  9.76s/it]

CV score for class Images is 0.9626896599359934


 16%|█████████████▌                                                                    | 16/97 [02:35<13:09,  9.75s/it]

CV score for class Database is 0.8957000888167478


 18%|██████████████▎                                                                   | 17/97 [02:47<13:08,  9.86s/it]

CV score for class Shipping is 0.9817292052274386


 19%|███████████████▏                                                                  | 18/97 [02:58<13:05,  9.94s/it]

CV score for class Machine Learning is 0.9519972025087354


 20%|████████████████                                                                  | 19/97 [03:10<13:01, 10.02s/it]

CV score for class Natural Language Processing is 0.9773886100907675


 21%|████████████████▉                                                                 | 20/97 [03:21<12:56, 10.09s/it]

CV score for class Notifications is 0.9218685812005006


 22%|█████████████████▊                                                                | 21/97 [03:31<12:45, 10.08s/it]

CV score for class Cloud is 0.938501295111528


 23%|██████████████████▌                                                               | 22/97 [03:41<12:34, 10.07s/it]

CV score for class Location is 0.9327844819880218


 24%|███████████████████▍                                                              | 23/97 [03:53<12:30, 10.15s/it]

CV score for class Localization is 0.9134732934474364


 25%|████████████████████▎                                                             | 24/97 [04:05<12:25, 10.22s/it]

CV score for class Travel is 0.9569113015155991


 26%|█████████████████████▏                                                            | 25/97 [04:16<12:17, 10.24s/it]

CV score for class Entertainment is 0.8854491452898245


 27%|█████████████████████▉                                                            | 26/97 [04:27<12:10, 10.29s/it]

CV score for class Mobile is 0.898911283967978


 28%|██████████████████████▊                                                           | 27/97 [04:37<11:59, 10.27s/it]

CV score for class News Services is 0.9592228018121353


 29%|███████████████████████▋                                                          | 28/97 [04:48<11:49, 10.29s/it]

CV score for class File Sharing is 0.9292427613085691


 30%|████████████████████████▌                                                         | 29/97 [04:59<11:42, 10.33s/it]

CV score for class Health is 0.9618178793256432


 31%|█████████████████████████▎                                                        | 30/97 [05:10<11:34, 10.36s/it]

CV score for class Science is 0.9658190642856309


 32%|██████████████████████████▏                                                       | 31/97 [05:23<11:27, 10.42s/it]

CV score for class Photos is 0.9675365474066054


 33%|███████████████████████████                                                       | 32/97 [05:34<11:18, 10.44s/it]

CV score for class Application Development is 0.8323616167314674


 34%|███████████████████████████▉                                                      | 33/97 [05:45<11:10, 10.47s/it]

CV score for class Enterprise is 0.861347539971879


 35%|████████████████████████████▋                                                     | 34/97 [05:57<11:02, 10.52s/it]

CV score for class Events is 0.9496169352045337


 36%|█████████████████████████████▌                                                    | 35/97 [06:07<10:50, 10.50s/it]

CV score for class Marketing is 0.9398496507719964


 37%|██████████████████████████████▍                                                   | 36/97 [06:17<10:40, 10.50s/it]

CV score for class Project Management is 0.9500236794344906


 38%|███████████████████████████████▎                                                  | 37/97 [06:28<10:29, 10.50s/it]

CV score for class Customer Relationship Management is 0.9260255855123436


 39%|████████████████████████████████                                                  | 38/97 [06:39<10:20, 10.52s/it]

CV score for class Management is 0.8464025104793804


 40%|████████████████████████████████▉                                                 | 39/97 [06:51<10:11, 10.55s/it]

CV score for class Search is 0.9128935241329129


 41%|█████████████████████████████████▊                                                | 40/97 [07:01<10:01, 10.55s/it]

CV score for class Messaging is 0.9764708477817766


 42%|██████████████████████████████████▋                                               | 41/97 [07:12<09:50, 10.55s/it]

CV score for class Telephony is 0.9578367092002527


 43%|███████████████████████████████████▌                                              | 42/97 [07:23<09:41, 10.57s/it]

CV score for class Text is 0.9584935872593608


 44%|████████████████████████████████████▎                                             | 43/97 [07:32<09:28, 10.53s/it]

CV score for class Verification is 0.9642992948947775


 45%|█████████████████████████████████████▏                                            | 44/97 [07:43<09:18, 10.54s/it]

CV score for class Voice is 0.9748252092565238


 46%|██████████████████████████████████████                                            | 45/97 [07:54<09:08, 10.54s/it]

CV score for class Conversions is 0.9470929903703483


 47%|██████████████████████████████████████▉                                           | 46/97 [08:04<08:57, 10.54s/it]

CV score for class Social is 0.9071280179718969


 48%|███████████████████████████████████████▋                                          | 47/97 [08:14<08:46, 10.53s/it]

CV score for class Backend is 0.8771751191502822


 49%|████████████████████████████████████████▌                                         | 48/97 [08:25<08:35, 10.52s/it]

CV score for class Storage is 0.9764997579068141


 51%|█████████████████████████████████████████▍                                        | 49/97 [08:35<08:24, 10.51s/it]

CV score for class Collaboration is 0.9277687335658987


 52%|██████████████████████████████████████████▎                                       | 50/97 [08:45<08:14, 10.51s/it]

CV score for class Media is 0.8917800094249175


 53%|███████████████████████████████████████████                                       | 51/97 [08:55<08:03, 10.51s/it]

CV score for class Government is 0.9536466773632194


 54%|███████████████████████████████████████████▉                                      | 52/97 [09:05<07:52, 10.50s/it]

CV score for class Financial is 0.9451619698556337


 55%|████████████████████████████████████████████▊                                     | 53/97 [09:16<07:42, 10.51s/it]

CV score for class Currency is 0.9746125947906363


 56%|█████████████████████████████████████████████▋                                    | 54/97 [09:26<07:31, 10.50s/it]

CV score for class Security is 0.9039780109945027


 57%|██████████████████████████████████████████████▍                                   | 55/97 [09:37<07:20, 10.49s/it]

CV score for class Payments is 0.9716113615792845


 58%|███████████████████████████████████████████████▎                                  | 56/97 [09:47<07:10, 10.49s/it]

CV score for class Business is 0.8256584629589249


 59%|████████████████████████████████████████████████▏                                 | 57/97 [09:57<06:59, 10.48s/it]

CV score for class Email is 0.9715591254439055


 60%|█████████████████████████████████████████████████                                 | 58/97 [10:06<06:48, 10.46s/it]

CV score for class Web Site Management is 0.9098919038214645


 61%|█████████████████████████████████████████████████▉                                | 59/97 [10:17<06:37, 10.47s/it]

CV score for class Analytics is 0.9050048843481354


 62%|██████████████████████████████████████████████████▋                               | 60/97 [10:27<06:27, 10.46s/it]

CV score for class API is 0.7592553033028802


 63%|███████████████████████████████████████████████████▌                              | 61/97 [10:38<06:16, 10.46s/it]

CV score for class Applications is 0.8716798233807826


 64%|████████████████████████████████████████████████████▍                             | 62/97 [10:48<06:06, 10.46s/it]

CV score for class Big Data is 0.8604860164259821


 65%|█████████████████████████████████████████████████████▎                            | 63/97 [10:59<05:55, 10.47s/it]

CV score for class Content is 0.9051047122159656


 66%|██████████████████████████████████████████████████████                            | 64/97 [11:09<05:45, 10.46s/it]

CV score for class Hosting is 0.9556912043649444


 67%|██████████████████████████████████████████████████████▉                           | 65/97 [11:19<05:34, 10.45s/it]

CV score for class Music is 0.9721713276890709


 68%|███████████████████████████████████████████████████████▊                          | 66/97 [11:29<05:23, 10.44s/it]

CV score for class Recommendations is 0.9419796188445874


 69%|████████████████████████████████████████████████████████▋                         | 67/97 [11:39<05:13, 10.45s/it]

CV score for class Statistics is 0.9029767660553029


 70%|█████████████████████████████████████████████████████████▍                        | 68/97 [11:50<05:03, 10.45s/it]

CV score for class Streaming is 0.9562745062751068


 71%|██████████████████████████████████████████████████████████▎                       | 69/97 [12:00<04:52, 10.45s/it]

CV score for class Office is 0.8772218193941886


 72%|███████████████████████████████████████████████████████████▏                      | 70/97 [12:11<04:42, 10.46s/it]

CV score for class Domains is 0.9859577887746901


 73%|████████████████████████████████████████████████████████████                      | 71/97 [12:22<04:31, 10.45s/it]

CV score for class Booking is 0.9851620642181681


 74%|████████████████████████████████████████████████████████████▊                     | 72/97 [12:32<04:21, 10.45s/it]

CV score for class Transportation is 0.9499193042882986


 75%|█████████████████████████████████████████████████████████████▋                    | 73/97 [12:42<04:10, 10.45s/it]

CV score for class Banking is 0.9227168877401817


 76%|██████████████████████████████████████████████████████████████▌                   | 74/97 [12:53<04:00, 10.46s/it]

CV score for class Stocks is 0.9772241287138175


 77%|███████████████████████████████████████████████████████████████▍                  | 75/97 [13:03<03:49, 10.45s/it]

CV score for class Documents is 0.9490561565752946


 78%|████████████████████████████████████████████████████████████████▏                 | 76/97 [13:13<03:39, 10.44s/it]

CV score for class Bitcoin is 0.9973184851497362


 79%|█████████████████████████████████████████████████████████████████                 | 77/97 [13:23<03:28, 10.44s/it]

CV score for class Merchants is 0.929476928665624


 80%|█████████████████████████████████████████████████████████████████▉                | 78/97 [13:34<03:18, 10.44s/it]

CV score for class Sales is 0.8840014157113149


 81%|██████████████████████████████████████████████████████████████████▊               | 79/97 [13:44<03:07, 10.44s/it]

CV score for class Feeds is 0.9511125234944536


 82%|███████████████████████████████████████████████████████████████████▋              | 80/97 [13:56<02:57, 10.45s/it]

CV score for class Semantics is 0.957785464298564


 84%|████████████████████████████████████████████████████████████████████▍             | 81/97 [14:07<02:47, 10.46s/it]

CV score for class Credit Cards is 0.9822797526989158


 85%|█████████████████████████████████████████████████████████████████████▎            | 82/97 [14:17<02:36, 10.46s/it]

CV score for class Advertising is 0.92864545390964


 86%|██████████████████████████████████████████████████████████████████████▏           | 83/97 [14:29<02:26, 10.47s/it]

CV score for class Medical is 0.9642279155731042


 87%|███████████████████████████████████████████████████████████████████████           | 84/97 [14:39<02:16, 10.47s/it]

CV score for class Sports is 0.9614175413452486


 88%|███████████████████████████████████████████████████████████████████████▊          | 85/97 [14:49<02:05, 10.47s/it]

CV score for class Addresses is 0.9600126951975047


 89%|████████████████████████████████████████████████████████████████████████▋         | 86/97 [14:59<01:55, 10.46s/it]

CV score for class Weather is 0.981405659085186


 90%|█████████████████████████████████████████████████████████████████████████▌        | 87/97 [15:09<01:44, 10.46s/it]

CV score for class Food is 0.9190950984499371


 91%|██████████████████████████████████████████████████████████████████████████▍       | 88/97 [15:20<01:34, 10.46s/it]

CV score for class Reference is 0.8855639529715166


 92%|███████████████████████████████████████████████████████████████████████████▏      | 89/97 [15:33<01:23, 10.49s/it]

CV score for class England is 0.8815202668745462


 93%|████████████████████████████████████████████████████████████████████████████      | 90/97 [15:43<01:13, 10.49s/it]

CV score for class Time is 0.8902459910353039


 94%|████████████████████████████████████████████████████████████████████████████▉     | 91/97 [15:53<01:02, 10.48s/it]

CV score for class Jobs is 0.936823616451357


 95%|█████████████████████████████████████████████████████████████████████████████▊    | 92/97 [16:03<00:52, 10.48s/it]

CV score for class Chat is 0.976170260971847


 96%|██████████████████████████████████████████████████████████████████████████████▌   | 93/97 [16:13<00:41, 10.47s/it]

CV score for class Blogging is 0.9602991894070607


 97%|███████████████████████████████████████████████████████████████████████████████▍  | 94/97 [16:23<00:31, 10.46s/it]

CV score for class Auto is 0.8514970767351727


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 95/97 [16:33<00:20, 10.45s/it]

CV score for class Language is 0.9601633049332001


 99%|█████████████████████████████████████████████████████████████████████████████████▏| 96/97 [16:42<00:10, 10.44s/it]

CV score for class Translation is 0.9841853375458739


100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [16:51<00:00, 10.43s/it]


In [62]:
submission

Unnamed: 0,id,Mapping,Other,Data,Tools,Design,eCommerce,Authentication,Education,Games,...,Food,Reference,England,Time,Jobs,Chat,Blogging,Auto,Language,Translation
12648,12648,0.059502,0.016300,0.069075,0.184158,0.010389,0.040255,0.011137,0.020989,0.019257,...,0.009184,0.024092,0.013664,0.068783,0.010350,0.012917,0.011517,0.014190,0.020858,0.010894
7431,7431,0.043236,0.012561,0.099321,0.092043,0.009730,0.054804,0.011015,0.023659,0.023198,...,0.010060,0.044418,0.014443,0.025039,0.009756,0.009785,0.011265,0.017856,0.021026,0.008876
7707,7707,0.045695,0.015220,0.072321,0.212006,0.012010,0.041631,0.011339,0.024081,0.019779,...,0.009700,0.034563,0.014028,0.021239,0.009491,0.020062,0.013947,0.015876,0.027506,0.009500
3028,3028,0.039052,0.012363,0.181918,0.041233,0.008606,0.056111,0.009688,0.021292,0.068832,...,0.009558,0.041040,0.016065,0.039505,0.009902,0.010315,0.010090,0.021476,0.016918,0.008120
3130,3130,0.044174,0.014767,0.065899,0.079825,0.010228,0.078384,0.012327,0.021850,0.019878,...,0.010541,0.034765,0.014030,0.023662,0.010546,0.010133,0.010483,0.018398,0.020043,0.009998
8544,8544,0.057848,0.015260,0.207436,0.081906,0.010276,0.107057,0.011117,0.027027,0.021142,...,0.012621,0.105966,0.018663,0.020650,0.009479,0.009626,0.011008,0.018625,0.017722,0.008505
10118,10118,0.042728,0.014360,0.082552,0.134707,0.010507,0.046576,0.012218,0.028602,0.021358,...,0.010588,0.056406,0.015187,0.019918,0.010334,0.009952,0.012094,0.015939,0.023266,0.009544
9102,9102,0.040676,0.013522,0.109495,0.061912,0.010939,0.110587,0.010455,0.023828,0.019467,...,0.010069,0.048789,0.017624,0.023877,0.009535,0.010882,0.011468,0.020430,0.019241,0.008671
5040,5040,0.045315,0.016867,0.089092,0.152474,0.011858,0.042891,0.012122,0.024121,0.019690,...,0.009232,0.031391,0.012978,0.020915,0.009430,0.012078,0.013444,0.017971,0.032514,0.010745
296,296,0.037780,0.013869,0.108852,0.093276,0.010625,0.052787,0.019808,0.023031,0.020817,...,0.008940,0.026124,0.012333,0.021877,0.010346,0.011770,0.011097,0.017063,0.019753,0.008916


In [63]:
gt_labels = dict()
for cla in tqdm(class_names2):
    clist = list()
    for idx, row in test.iterrows():
        if row['tags2'].find(cla) == -1:
            clist.append(0)
        else:
            clist.append(1)
    gt_labels[cla] = clist

100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [00:15<00:00,  6.24it/s]


In [64]:
mAP = list()
for cla in tqdm(class_names2):
    ap = sm.average_precision_score(gt_labels[cla],submission[cla])
    mAP.append(ap)
np.mean(mAP)

100%|█████████████████████████████████████████████████████████████████████████████████| 97/97 [00:00<00:00, 372.64it/s]


0.5550490386566358

In [None]:

# MAP
# 0.501658837537489  -- 大于50 的 178个tag 
# 0.5550490386566358 -- 大于100 的 97个tag